import dateutil.parser
import justext
import re
import sys
import datetime

import lxml.etree
import urllib.parse
import os.path

import importlib
import os
import inspect


# Matches dates written as "d. m. yyyy", e.g. "3. 4. 2015"
datere = re.compile(r"\d{1,2}\.\s*\d{1,2}\.\s*[12]\d{3}")
yearre = re.compile(r"\s\d{4}\s")


def urlunparse(parsed_url):
    schema, netloc, path, query = parsed_url
    return urllib.parse.urlunparse((schema, netloc, path, "", query, ""))


def normalize_link(link, base=None, strip_query=False):
    link = link.strip().replace("\n", "").replace("\t", "").replace("\r", "")
    parsed_link = urllib.parse.urlparse(link)
    schema = parsed_link[0]
    netloc = parsed_link[1].strip().lower()
    path = parsed_link[2].strip()
    query = parsed_link[4]
    if strip_query:
        query = ""
    if path is None or len(path) == 0:
        path = "/"
    dirname, filename = os.path.split(path)
    if base is not None:
        parsed_base = urllib.parse.urlparse(base)
        if schema == "":
            schema = parsed_base[0]
        # If the link is relative
        if netloc == "":
            netloc = parsed_base[1]
            schema = parsed_base[0]
            bdir, bfile = os.path.split(parsed_base[2])
            if len(bdir) > 0 and bdir[0] != "." and len(dirname) > 0 and dirname[0] != "/":
                dirname = bdir + "/" + dirname
    # if len(dirname) == 0 or dirname[0] != '/':
    #    path = '/' + path
    dirname = os.path.normpath(dirname)
    dirname = dirname.lstrip("/").lstrip(".")
    path = dirname + "/" + filename
    return schema, netloc, path, query


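# Illustrative sketch, not part of the original module: how normalize_link()
# and urlunparse() compose when resolving a relative link against a base URL.
# The URLs below are made-up examples.
def _example_normalize_link():
    parts = normalize_link("sport/clanok.html", base="http://example.com/spravy/index.html")
    # parts is a (schema, netloc, path, query) tuple; urlunparse() turns it back
    # into an absolute URL, here roughly "http://example.com/spravy/sport/clanok.html".
    return urlunparse(parts)

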
def get_bs_links(work_link, bs):
    # Extract links from the page
    base = work_link
    if bs.base is not None and "href" in bs.base.attrs:
        base = bs.base["href"]
    link_set = set()
    # Normalize the links
    for l in bs.find_all("a", href=True):
        # BeautifulSoup treats rel as a multi-valued attribute (a list)
        if "nofollow" in l.attrs.get("rel", []) or "nofollow" in l.attrs:
            continue
        href = l["href"]
        try:
            nl = normalize_link(href, base)
            link = urlunparse(nl)
            if link == base:
                continue
            link_set.add(link)
        except ValueError:
            pass
    return link_set


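# Illustrative sketch, not part of the original module: extracting links from a
# small hand-written HTML snippet. Assumes BeautifulSoup is available; the
# crawler normally passes the soup object in.
def _example_get_bs_links():
    from bs4 import BeautifulSoup
    html = """<html><body>
    <a href="spravy/clanok.html">Article</a>
    <a href="http://other.example.org/">External</a>
    <a href="ignored.html" rel="nofollow">Ignored</a>
    </body></html>"""
    bs = BeautifulSoup(html, "html.parser")
    # Relative links are resolved against the work link; the nofollow link is skipped.
    return get_bs_links("http://example.com/index.html", bs)

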
def get_date(te):
    """
    Extract dates written as "d. m. yyyy" from the given text.
    Slovak month names are converted to month numbers first;
    only years between 2000 and 2020 are accepted.
    """
    dates = []
    words = []
    if te is None:
        te = ""
    for t in te.split():
        t = t.strip().lower().lstrip("0").replace("\r", "\n").replace("\n", "")
        if len(t) == 0:
            continue
        for i, m in enumerate(["jan", "feb", "mar", "apr", "máj", "jún", "júl", "aug", "sept", "okt", "nov", "dec"]):
            if t.startswith(m):
                t = str(i + 1) + "."
                break
        if t[0].isdigit():
            words.append(t)
    txt = " ".join(words)
    for st in re.findall(datere, txt):
        tokens = st.replace(" ", "").split(".")
        try:
            y = int(tokens[-1])
            if y < 2000 or y > 2020:
                continue
            m = 2
            d = 2
            if len(tokens) > 2:
                m = int(tokens[-2])
                d = int(tokens[-3])
            dates.append(datetime.date(y, m, d))
        except ValueError:
            pass
    return dates


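# Illustrative sketch, not part of the original module: the kind of value
# get_date() is expected to return for a typical Slovak date string.
def _example_get_date():
    # Should yield [datetime.date(2015, 4, 3)] for the "3. 4. 2015" date.
    return get_date("Publikované 3. 4. 2015 o 12:00")

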
class ParsedDocument:
    """
    One document in the database
    """
    def __init__(self):
        self.work_link = None

        self.content = None
        self.bs = None

        self.link_set = set()
        self.body = None
        self.text_date = None
        self.tags = None
        self.authors = None
        self.title = None
        self.description = None
        self.section = None
        self.article_published_time = None
        self.current_time = datetime.date.today()

    def __str__(self):
        r = []
        if self.authors is not None:
            r.append(",".join(self.authors))
        if self.title is not None:
            r.append(self.title)
        if self.body is not None:
            if len(self.body) < 20:
                r.append(self.body)
            else:
                r.append(self.body[0:20] + " ....")
        return ">>> ".join(r)


class BaseParser:
    def __init__(self, verbose=False):
        self.strip_query = True
        self.skiptypes = [".gz", ".js", ".avi", ".flv", ".zip", ".xls", ".doc", ".rtf", ".odt", ".mp3", ".mp4", ".wmv", ".jpg", ".png", ".txt", ".pdf", ".css", ".gif", ".tgz",
                          ".7", ".ogg", "rss", "galeria", "gallery", ".jpeg", ".mpg", ".mpeg", ".xml", ".rar", ".xlsx", ".docx", ".pptx", ".odp", ".iso", ".ppt", ".bz", ".dwg", ".eps", ".bin"]
        self.skipchars = re.compile(r"[();:@& ]")
        self.verbose = verbose
        self.domain_re = re.compile(r"^((?!-)[A-Za-z0-9-]{1,63}(?<!-)\.)+[A-Za-z]{2,6}$")
        self.recent_links = 5
        self.old_links = 3
        self.random_links = 10
        self.crawl_rounds = 3
        self.skipdomains = set()
        self.allowdomains = set()
        self.skippaths = set(["sort", "search", "sortby", "gallery", "images", "pictures", "order", "add", "insert", "cart", "filter", "ssid", "session", "print", "rss", "feed", "login", "register", "delivery", "form", "file", "img", "shopby", "foto", "orderby", "desc", "eshop", "e-shop", "email", "flog"])

    def is_domain_good(self, domain):
        r = None
        # Netloc
        if ":" in domain:
            r = "Port in domain"
        elif len(domain) < 4:
            r = "Too short domain"
        elif len(domain) > 127:
            r = "Too long location"
        elif domain.startswith(".") or domain.endswith("."):
            r = "Malformed domain"
        elif not self.domain_re.match(domain):
            r = "Bad domain"
        else:
            da = False
            for d in self.allowdomains:
                if domain.endswith(d):
                    da = True
                    break
            if not da and len(self.allowdomains) > 0:
                r = "Domain not in allowlist"
            for d in self.skipdomains:
                if domain.endswith(d):
                    r = "In domain skiplist"
            for d in domain.split("."):
                if d in self.skippaths:
                    r = "Domain in skippath"
        if r is not None and self.verbose:
            print(domain + " " + r)
        return r is None

    # Argument: the URL to check (it is parsed inside)
    def is_link_good(self, link):
        assert link is not None
        r = None
        if len(link) > 1023:
            r = "Too long"
        try:
            schema, domain, path, query = normalize_link(link)
            if not schema.startswith("http"):
                r = "Bad schema"
            dg = self.is_domain_good(domain)
            if not dg:
                return False
            for c in link:
                if ord(c) >= 128:
                    r = "Bad link character"
                    break
            # Path
            for t in self.skiptypes:
                if path.lower().endswith(t):
                    r = "Bad type"
                    break
            if re.search(self.skipchars, path):
                r = "Bad path"
            for p in path.split("/"):
                if p in self.skippaths or "jpg" in p or "galeria" in p:
                    r = "Bad path"
                    break
        except ValueError:
            r = "Bad urlparse"
        return r is None

    def filter_links(self, links):
        # Filter links
        linkset = set()
        for link in links:
            if not self.is_link_good(link):
                continue
            link = urlunparse(normalize_link(link, strip_query=self.strip_query))
            linkset.add(link)

        return list(linkset)

    def full_extract(self, content, bs, work_link):
        pass


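# Illustrative sketch, not part of the original module: how the BaseParser
# filters behave. The URLs are made-up; the gallery image is rejected because
# of its ".jpg" suffix and "gallery" path component, so only the article survives.
def _example_filter_links():
    parser = BaseParser()
    links = [
        "http://example.com/clanok",
        "http://example.com/gallery/1.jpg",
    ]
    return parser.filter_links(links)

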
import trafilatura
import courlan


class TrafilaturaParser(BaseParser):
    def full_extract(self, content, bs, work_link):
        content.seek(0)
        content = content.read()
        res = trafilatura.bare_extraction(content, url=work_link, with_metadata=True, target_language="sk", include_formatting=True)
        if self.verbose:
            print(res)
        pd = ParsedDocument()
        pd.work_link = work_link
        pd.current_time = datetime.date.today()
        # Extract text and metatext
        pd.body = res["text"]
        #pd.text_date
        #pd.tags = res["tags"]
        #pd.authors = res["author"]
        pd.article_published_time = res["date"]
        #pd.section = res["categories"]
        pd.link_set = get_bs_links(work_link, bs)
        return pd


class SoupParser(BaseParser):
    def __init__(self, verbose=False):
        BaseParser.__init__(self, verbose)
        self.justext_language = "Slovak"

    def full_extract(self, content, bs, work_link):
        """
        Parse content and fill the object
        """
        pd = ParsedDocument()
        pd.work_link = work_link

        pd.current_time = datetime.date.today()
        # Extract text and metatext
        pd.body, pd.text_date = self.extract_raw_text(content, pd.current_time)
        pd.tags, pd.authors, pd.title, pd.article_published_time, pd.description, pd.section = self.extract_meta(bs)
        pd.link_set = get_bs_links(work_link, bs)
        return pd

    def extract_raw_text(self, content, current_time):
        result = []
        rd = None
        paragraphs = []
        content.seek(0)
        try:
            c = content.read()
            paragraphs = justext.justext(c, justext.get_stoplist(self.justext_language), length_low=50, length_high=150)
            content.seek(0)
        except lxml.etree.XMLSyntaxError:
            print("XML Syntax parse error")
        except lxml.etree.ParserError:
            print("XML Parse parse error")
        except justext.core.JustextError:
            print("Justext error")
        except IndexError:
            print("XML error")
        except UnicodeDecodeError:
            print("Unicode Error")
        except TypeError:
            # Null in string
            print("String Error")
        except RuntimeError:
            # Maximum recursion depth
            print("Recursion Error")
        dates = []
        for p in paragraphs:
            # TODO - match URL for date
            if p is not None and p.text is not None and len(p.text) > 0:
                dat = get_date(p.text)
                for d in dat:
                    dates.append(d)
                if self.verbose:
                    print(p.class_type, p.links_density(), p.stopwords_density(
                        justext.get_stoplist(self.justext_language)), p.text)
                if not p.is_boilerplate:
                    result.append(p.text.strip())
        # Fall back to the crawl time if no date was found in the text
        if len(dates) == 0:
            dates.append(current_time)
        rd = max(dates).isoformat()

        return "\n\n".join(result), rd

    # Extracts metainformation from the HTML.
    # First it looks for name/content pairs in meta tags,
    # then it looks at the OpenGraph properties.
    def extract_meta(self, bs):
        tags = set()
        authors = set()
        title = ""
        description = ""
        section = ""
        article_published_time = ""

        for m in bs.find_all("meta", attrs={"name": True, "content": True}):
            content = m["content"].strip()
            if len(content) == 0:
                continue
            name = m["name"].strip()
            if name == "keywords":
                for t in content.split(","):
                    if len(t.strip()) > 0:
                        tags.add(t.strip())
            if name == "news_keywords":
                for t in content.split(","):
                    if len(t.strip()) > 0:
                        tags.add(t.strip())
            if name == "author":
                authors.add(content)
            if name == "description":
                description = content

        for m in bs.find_all("meta", property=True, content=True):
            content = m["content"].strip()
            if len(content) == 0:
                continue
            prop = m["property"].strip()
            if prop == "og:title":
                title = content
            if prop == "article:published_time":
                try:
                    # Is it in ISO format?
                    d = dateutil.parser.parse(content)
                    article_published_time = d.isoformat()
                except ValueError:
                    pass
                except OverflowError:
                    pass
            if prop == "article:author" and "admin" not in content.lower():
                authors.add(content)
            if prop == "section":
                section = content
            if prop == "tag":
                tags.add(content)
            if prop == "og:description":
                description = content

        if len(title) < 2 and bs.h1 is not None:
            title = bs.h1.get_text(strip=True)
        if len(title) < 2 and bs.title is not None:
            title = bs.title.get_text(strip=True)
        if len(authors) == 0:
            for m in bs.find_all(property="author"):
                authors.add(m.get_text(strip=True))
        if len(authors) == 0:
            for m in bs.find_all(itemprop="author"):
                authors.add(m.get_text(strip=True))
        authors = set(filter(lambda x: len(x) > 2, authors))

        return tags, authors, title.strip(), article_published_time.strip(), description, section.strip()


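# Illustrative sketch, not part of the original module: driving SoupParser by
# hand on an in-memory page. Assumes BeautifulSoup is available; the crawler
# normally supplies both the raw content and the soup object.
def _example_soup_parser():
    import io
    from bs4 import BeautifulSoup
    html = b"<html><head><title>Test</title></head><body><p>Ahoj svet.</p></body></html>"
    parser = SoupParser()
    bs = BeautifulSoup(html, "html.parser")
    # Returns a ParsedDocument with the body text, metadata and outgoing links.
    return parser.full_extract(io.BytesIO(html), bs, "http://example.com/test")

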
class EnglishParser(SoupParser):
    def __init__(self):
        super(EnglishParser, self).__init__()
        self.justext_language = "English"
        self.allowdomains = set(["com", "org", "io"])


# https://github.com/scrapy/scrapy/blob/master/scrapy/commands/runspider.py
def _import_file(filepath):
    abspath = os.path.abspath(filepath)
    dirname, file = os.path.split(abspath)
    fname, fext = os.path.splitext(file)
    if fext != '.py':
        raise ValueError("Not a Python source file: %s" % abspath)
    if dirname:
        sys.path = [dirname] + sys.path
    try:
        module = importlib.import_module(fname)
    finally:
        if dirname:
            sys.path.pop(0)
    return module


def iter_parser(module):
    """Return an iterator over all parser classes defined in the given module
    that can be instantiated (i.e. BaseParser subclasses declared in that module).
    """
    for obj in vars(module).values():
        if inspect.isclass(obj) and \
                obj.__module__ == module.__name__ and \
                issubclass(obj, BaseParser):
            yield obj


def load_parser(file_name):
    pmodule = _import_file(file_name)
    parsers = [m for m in iter_parser(pmodule)]
    p = None
    if len(parsers) > 0:
        pc = parsers[-1]
        p = pc()
    return p

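
# Illustrative sketch, not part of the original module: loading a project-specific
# parser from a standalone .py file, in the spirit of scrapy's runspider command.
# The path "my_parser.py" is a made-up example; the file is expected to define a
# BaseParser subclass, and the last one found is instantiated.
def _example_load_parser():
    parser = load_parser("my_parser.py")
    if parser is None:
        raise ValueError("No BaseParser subclass found in my_parser.py")
    return parser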