# websucker-pip/websucker/parser.py

import dateutil.parser
import justext
import re
import sys
import datetime
import lxml.etree
import urllib.parse
import os.path
import importlib
import os
import inspect

datere = re.compile(r"\d{1,2}\.\s*\d{1,2}\.\s*[12]\d{3}")
yearre = re.compile(r"\s\d{4}\s")


def urlunparse(parsed_url):
    schema, netloc, path, query = parsed_url
    return urllib.parse.urlunparse((schema, netloc, path, "", query, ""))

def normalize_link(link, base=None, strip_query=False):
    link = link.strip().replace("\n", "").replace("\t", "").replace("\r", "")
    parsed_link = urllib.parse.urlparse(link)
    schema = parsed_link[0]
    netloc = parsed_link[1].strip().lower()
    path = parsed_link[2].strip()
    query = parsed_link[4]
    if strip_query:
        query = ""
    if path is None or len(path) == 0:
        path = "/"
    dirname, filename = os.path.split(path)
    if base is not None:
        parsed_base = urllib.parse.urlparse(base)
        if schema == "":
            schema = parsed_base[0]
        # If the link is relative
        if netloc == "":
            netloc = parsed_base[1]
            schema = parsed_base[0]
            bdir, bfile = os.path.split(parsed_base[2])
            if len(bdir) > 0 and bdir[0] != "." and len(dirname) > 0 and dirname[0] != "/":
                dirname = bdir + "/" + dirname
    # if len(dirname) == 0 or dirname[0] != '/':
    #     path = '/' + path
    dirname = os.path.normpath(dirname)
    dirname = dirname.lstrip("/").lstrip(".")
    path = dirname + "/" + filename
    return schema, netloc, path, query
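
# A rough usage sketch (illustrative values, not from the original module):
#   normalize_link("../about.html", base="http://example.com/news/item.html")
#   should return something like ("http", "example.com", "/about.html", ""),
#   which urlunparse() rebuilds as "http://example.com/about.html".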

def get_date(te):
    dates = []
    words = []
    if te is None:
        te = ""
    for t in te.split():
        t = t.strip().lower().lstrip("0").replace("\r", "\n").replace("\n", "")
        if len(t) == 0:
            continue
        for i, m in enumerate(["jan", "feb", "mar", "apr", "máj", "jún", "júl", "aug", "sept", "okt", "nov", "dec"]):
            if t.startswith(m):
                t = str(i + 1) + "."
                break
        if t[0].isdigit():
            words.append(t)
    txt = " ".join(words)
    for st in re.findall(datere, txt):
        tokens = st.replace(" ", "").split(".")
        try:
            y = int(tokens[-1])
            if y < 2000 or y > 2020:
                continue
            m = 2
            d = 2
            if len(tokens) > 2:
                m = int(tokens[-2])
                d = int(tokens[-3])
            dates.append(datetime.date(y, m, d))
        except ValueError:
            pass
    return dates
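
# A rough usage sketch (illustrative input, not from the original module):
#   get_date("Publikované 7. mája 2020") should yield [datetime.date(2020, 5, 7)]:
#   the Slovak month name is rewritten to "5." and the numeric tokens are then
#   matched by the datere pattern.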

class BaseParser:
    def __init__(self, verbose=False):
        self.strip_query = True
        self.skiptypes = [".gz", ".js", ".avi", ".flv", ".zip", ".xls", ".doc", ".rtf", ".odt", ".mp3", ".mp4", ".wmv", ".jpg", ".png", ".txt", ".pdf", ".css", ".gif", ".tgz",
                          ".7", ".ogg", "rss", "galeria", "gallery", ".jpeg", ".mpg", ".mpeg", ".xml", ".rar", ".xlsx", ".docx", ".pptx", ".odp", ".iso", ".ppt", ".bz", ".dwg", ".eps", ".bin"]
        self.skipchars = re.compile(r"[();:@& ]")
        self.store = True
        self.verbose = verbose
        self.domain_re = re.compile("^((?!-)[A-Za-z0-9-]{1,63}(?<!-)\\.)+[A-Za-z]{2,6}$")
        self.listen_robot = True
        self.recent_links = 5
        self.old_links = 3
        self.random_links = 10
        self.crawl_rounds = 3
        self.skipdomains = set()
        self.allowdomains = set()
        self.skippaths = set(["sort", "search", "sortby", "gallery", "images", "pictures", "order", "add", "insert", "cart", "order", "filter", "ssid", "session", "print", "rss", "feed", "login", "register", "delivery", "form", "file", "img", "shopby", "foto", "orderby", "desc", "eshop", "e-shop", "email", "gallery", "flog"])
        self.justext_language = "Slovak"

    def is_domain_good(self, domain):
        r = None
        # Netloc
        if ":" in domain:
            r = "Port in domain"
        elif len(domain) < 4:
            r = "Too short domain"
        elif len(domain) > 50:
            r = "Too long location"
        elif domain.startswith(".") or domain.endswith("."):
            r = "Malformed domain"
        elif not self.domain_re.match(domain):
            r = "Bad domain"
        else:
            da = False
            for d in self.allowdomains:
                if domain.endswith(d):
                    da = True
                    break
            if not da and len(self.allowdomains) > 0:
                r = "Domain not in allowlist"
            for d in self.skipdomains:
                if domain.endswith(d):
                    r = "In domain skiplist"
            for d in domain.split("."):
                if d in self.skippaths:
                    r = "Domain in skippath"
        if r is not None and self.verbose:
            print(domain + " " + r)
        return r is None
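
    # A rough usage sketch (illustrative domains):
    #   is_domain_good("spravy.example.sk") should pass, while "localhost" fails the
    #   domain_re check and "a.b" is rejected as too short.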

    # Argument - parsed URL
    def is_link_good(self, link):
        assert link is not None
        r = None
        if sys.getsizeof(link) > 1023:
            r = "Too long"
        try:
            schema, domain, path, query = normalize_link(link)
            if not schema.startswith("http"):
                r = "Bad schema"
            dg = self.is_domain_good(domain)
            if not dg:
                return False
            for c in link:
                if ord(c) >= 128:
                    r = "Bad domain character"
                    break
            for p in self.skipdomains:
                if domain.endswith(p):
                    r = "Bad domain"
                    break
            if ".b-" in domain:
                r = "Bad domain"
            if len(domain) > 127:
                r = "Too long path"
            # Path
            for t in self.skiptypes:
                if path.lower().endswith(t):
                    r = "Bad type"
                    break
            if re.search(self.skipchars, path):
                r = "Bad path"
            for p in path.split("/"):
                if p in self.skippaths or "jpg" in p or "galeria" in p:
                    r = "Bad path"
                    break
        except ValueError:
            r = "Bad urlparse"
        return r is None

    def filter_links(self, links):
        # Filter links
        linkset = set()
        for link in links:
            if not self.is_link_good(link):
                continue
            link = urlunparse(normalize_link(link, strip_query=self.strip_query))
            linkset.add(link)
        return list(linkset)
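
    # A rough usage sketch (illustrative URLs, assuming default settings):
    #   p = BaseParser()
    #   p.filter_links(["http://example.com/page?id=1", "http://example.com/foto.jpg"])
    #   should keep only something like ["http://example.com/page"]: the query string
    #   is stripped and the ".jpg" link is rejected by is_link_good().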

    def extract_raw_text(self, content, current_time):
        result = []
        rd = None
        paragraphs = []
        content.seek(0)
        try:
            c = content.read()
            paragraphs = justext.justext(c, justext.get_stoplist(self.justext_language), length_low=50, length_high=150)
            content.seek(0)
        except lxml.etree.XMLSyntaxError:
            print("XML syntax error")
        except lxml.etree.ParserError:
            print("XML parser error")
        except justext.core.JustextError:
            print("Justext error")
        except IndexError:
            print("XML error")
        except UnicodeDecodeError:
            print("Unicode error")
        except TypeError:
            # Null in string
            print("String error")
        except RuntimeError:
            # Maximum recursion depth
            print("Recursion error")
        dates = []
        for p in paragraphs:
            # TODO - match URL for date
            if p is not None and p.text is not None and len(p.text) > 0:
                dat = get_date(p.text)
                for d in dat:
                    dates.append(d)
                if self.verbose:
                    print(p.class_type, p.links_density(), p.stopwords_density(
                        justext.get_stoplist(self.justext_language)), p.text)
                if not p.is_boilerplate:
                    result.append(p.text.strip())
        if len(dates) == 0:
            dates.append(current_time)
        if len(dates) > 0:
            rd = max(dates)
            rd = rd.isoformat()
        return "\n\n".join(result), rd

    # Extracts meta information from the HTML.
    # First it looks for name/content pairs in <meta> tags,
    # then it looks for OpenGraph properties.
    def extract_meta(self, bs):
        tags = set()
        authors = set()
        title = ""
        description = ""
        section = ""
        article_published_time = ""
        for m in bs.find_all("meta", attrs={"name": True, "content": True}):
            content = m["content"].strip()
            if len(content) == 0:
                continue
            name = m["name"].strip()
            if name == "keywords":
                for t in content.split(","):
                    if len(t.strip()) > 0:
                        tags.add(t.strip())
            if name == "news_keywords":
                for t in content.split(","):
                    if len(t.strip()) > 0:
                        tags.add(t.strip())
            if name == "author":
                authors.add(content)
            if name == "description":
                description = content
        for m in bs.find_all("meta", property=True, content=True):
            content = m["content"].strip()
            if len(content) == 0:
                continue
            property = m["property"].strip()
            if property == "og:title":
                title = content
            if property == "article:published_time":
                try:
                    # Is it in ISO format?
                    d = dateutil.parser.parse(content)
                    article_published_time = d.isoformat()
                except ValueError:
                    pass
                except OverflowError:
                    pass
            if property == "article:author" and "admin" not in content.lower():
                authors.add(content)
            if property == "section":
                section = content
            if property == "tag":
                tags.add(content)
            if property == "og:description":
                description = content
        if len(title) < 2 and bs.h1 is not None:
            title = bs.h1.get_text(strip=True)
        if len(title) < 2 and bs.title is not None:
            title = bs.title.get_text(strip=True)
        if len(authors) == 0:
            for m in bs.find_all(property="author"):
                authors.add(m.get_text(strip=True))
        if len(authors) == 0:
            for m in bs.find_all(itemprop="author"):
                authors.add(m.get_text(strip=True))
        authors = set(filter(lambda x: len(x) > 2, authors))
        return tags, authors, title.strip(), article_published_time.strip(), description, section.strip()
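
    # A rough usage sketch (illustrative, assuming bs is a BeautifulSoup document):
    #   tags, authors, title, published_time, description, section = parser.extract_meta(bs)
    # OpenGraph values, when present, win over the <h1>/<title> fallbacks for the title.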

    def calculate_checksums(self, text):
        """
        @return fingerprints of the paragraphs in the text. Paragraphs are separated by a blank line.
        """
        checksums = []
        sizes = []
        hval = 0
        hsz = 0
        sz = 0
        for c in text:
            cv = ord(c)
            sz += 1
            if cv > 64:
                hval += (hval << 3) + cv
                zv = hval >> 31
                hval &= 0x7fffffff
                hval += zv
                hsz += 1
            if c == "\n" and hsz > 0:
                if hsz > 100:
                    checksums.append(hval)
                    sizes.append(sz)
                sz = 0
                hsz = 0
        if hsz > 100:
            checksums.append(hval)
            sizes.append(sz)
        return checksums, sizes
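
    # A rough usage sketch (illustrative): each paragraph with more than 100 hashed
    # characters contributes one rolling-hash fingerprint, so duplicated paragraphs
    # across pages can be spotted by comparing checksum values:
    #   checksums, sizes = parser.calculate_checksums(text)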

class EnglishParser(BaseParser):
    def __init__(self):
        super(EnglishParser, self).__init__()
        self.justext_language = "English"
        self.allowdomains = set(["com", "org", "io"])

# https://github.com/scrapy/scrapy/blob/master/scrapy/commands/runspider.py
def _import_file(filepath):
    abspath = os.path.abspath(filepath)
    dirname, file = os.path.split(abspath)
    fname, fext = os.path.splitext(file)
    if fext != '.py':
        raise ValueError("Not a Python source file: %s" % abspath)
    if dirname:
        sys.path = [dirname] + sys.path
    try:
        module = importlib.import_module(fname)
    finally:
        if dirname:
            sys.path.pop(0)
    return module

def iter_parser(module):
    """Return an iterator over all parser classes defined in the given module
    that can be instantiated (i.e. the BaseParser subclasses defined in that module).
    """
    for obj in vars(module).values():
        if inspect.isclass(obj) and \
                obj.__module__ == module.__name__ and \
                issubclass(obj, BaseParser):
            yield obj

def load_parser(file_name):
    pmodule = _import_file(file_name)
    parsers = [m for m in iter_parser(pmodule)]
    p = None
    if len(parsers) > 0:
        pc = parsers[-1]
        p = pc()
    return p
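
# A rough usage sketch (illustrative file name, not part of the original module):
#   parser = load_parser("my_parser.py")   # my_parser.py defines a BaseParser subclass
#   if parser is not None:
#       links = parser.filter_links(["http://example.org/index.html"])
# The last matching class found in the file is instantiated and returned.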