import datetime
import importlib
import inspect
import os
import re
import sys
import urllib.parse

import dateutil.parser
import justext
import lxml.etree

datere = re.compile(r"\d{1,2}\.\s*\d{1,2}\.\s*[12]\d{3}")
yearre = re.compile(r"\s\d{4}\s")


def urlunparse(parsed_url):
    # Reassemble a (schema, netloc, path, query) tuple, dropping params and fragment
    schema, netloc, path, query = parsed_url
    return urllib.parse.urlunparse((schema, netloc, path, "", query, ""))


def normalize_link(link, base=None, strip_query=False):
    link = link.strip().replace("\n", "").replace("\t", "").replace("\r", "")
    parsed_link = urllib.parse.urlparse(link)
    schema = parsed_link[0]
    netloc = parsed_link[1].strip().lower()
    path = parsed_link[2].strip()
    query = parsed_link[4]
    if strip_query:
        query = ""
    if path is None or len(path) == 0:
        path = "/"
    dirname, filename = os.path.split(path)
    if base is not None:
        parsed_base = urllib.parse.urlparse(base)
        if schema == "":
            schema = parsed_base[0]
        # The link is relative
        if netloc == "":
            netloc = parsed_base[1]
            schema = parsed_base[0]
            bdir, bfile = os.path.split(parsed_base[2])
            if len(bdir) > 0 and bdir[0] != "." and len(dirname) > 0 and dirname[0] != "/":
                dirname = bdir + "/" + dirname
    # if len(dirname) == 0 or dirname[0] != '/':
    #     path = '/' + path
    dirname = os.path.normpath(dirname)
    dirname = dirname.lstrip("/").lstrip(".")
    path = dirname + "/" + filename
    return schema, netloc, path, query


def get_bs_links(work_link, bs):
    # Extract links from the page
    base = work_link
    if bs.base is not None and "href" in bs.base.attrs:
        base = bs.base["href"]
    link_set = set()
    # Normalize the links
    for l in bs.find_all("a", href=True):
        # "rel" is a multi-valued attribute in BeautifulSoup, so it arrives as a list
        if "nofollow" in l.attrs.get("rel", []):
            continue
        href = l["href"]
        try:
            nl = normalize_link(href, base)
            link = urlunparse(nl)
            if link == base:
                continue
            link_set.add(link)
        except ValueError:
            pass
    return link_set

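
# Illustrative sketch: how normalize_link and urlunparse resolve a relative href
# against a base URL. The URLs below are made-up placeholders and the function
# is never called by the module itself.
def _demo_normalize_link():
    parts = normalize_link("../foo/bar.html?x=1", base="http://example.com/a/b/index.html")
    print(parts)               # ('http', 'example.com', 'a/foo/bar.html', 'x=1')
    print(urlunparse(parts))   # http://example.com/a/foo/bar.html?x=1
    # The netloc is lowercased and the query can be dropped on request
    print(urlunparse(normalize_link("HTTPS://Example.COM/x?id=7", strip_query=True)))
    # https://example.com/x
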
def get_date(te):
    dates = []
    words = []
    if te is None:
        te = ""
    for t in te.split():
        t = t.strip().lower().lstrip("0").replace("\r", "\n").replace("\n", "")
        if len(t) == 0:
            continue
        # Replace a (Slovak) month name with its number
        for i, m in enumerate(["jan", "feb", "mar", "apr", "máj", "jún", "júl",
                               "aug", "sept", "okt", "nov", "dec"]):
            if t.startswith(m):
                t = str(i + 1) + "."
                break
        if t[0].isdigit():
            words.append(t)
    txt = " ".join(words)
    for st in re.findall(datere, txt):
        tokens = st.replace(" ", "").split(".")
        try:
            y = int(tokens[-1])
            if y < 2000 or y > 2020:
                continue
            m = 2
            d = 2
            if len(tokens) > 2:
                m = int(tokens[-2])
                d = int(tokens[-3])
            dates.append(datetime.date(y, m, d))
        except ValueError:
            pass
    return dates


class ParsedDocument:
    """One document in the database"""

    def __init__(self):
        self.work_link = None
        self.content = None
        self.bs = None
        self.link_set = set()
        self.body = None
        self.text_date = None
        self.tags = None
        self.authors = None
        self.title = None
        self.description = None
        self.section = None
        self.article_published_time = None
        self.current_time = datetime.date.today()

    def __str__(self):
        r = []
        if self.authors is not None:
            r.append(",".join(self.authors))
        if self.title is not None:
            r.append(self.title)
        if self.body is not None:
            if len(self.body) < 20:
                r.append(self.body)
            else:
                r.append(self.body[0:20] + " ....")
        return ">>> ".join(r)


class BaseParser:
    def __init__(self, verbose=False):
        self.strip_query = True
        self.skiptypes = [".gz", ".js", ".avi", ".flv", ".zip", ".xls", ".doc", ".rtf",
                          ".odt", ".mp3", ".mp4", ".wmv", ".jpg", ".png", ".txt", ".pdf",
                          ".css", ".gif", ".tgz", ".7", ".ogg", "rss", "galeria", "gallery",
                          ".jpeg", ".mpg", ".mpeg", ".xml", ".rar", ".xlsx", ".docx", ".pptx",
                          ".odp", ".iso", ".ppt", ".bz", ".dwg", ".eps", ".bin"]
        self.skipchars = re.compile(r"[();:@& ]")
        self.verbose = verbose
        # A valid fully qualified domain name
        self.domain_re = re.compile(r"^((?!-)[A-Za-z0-9-]{1,63}(?<!-)\.)+[A-Za-z]{2,6}$")
        # Domain and path filters; empty by default, subclasses or callers populate them
        self.allowdomains = set()
        self.skipdomains = set()
        self.skippaths = set()

    def is_domain_good(self, domain):
        r = None
        if len(domain) > 127:
            r = "Too long location"
        elif domain.startswith(".") or domain.endswith("."):
            r = "Malformed domain"
        elif not self.domain_re.match(domain):
            r = "Bad domain"
        else:
            da = False
            for d in self.allowdomains:
                if domain.endswith(d):
                    da = True
                    break
            if not da and len(self.allowdomains) > 0:
                r = "Domain not in allowlist"
            for d in self.skipdomains:
                if domain.endswith(d):
                    r = "In domain skiplist"
            for d in domain.split("."):
                if d in self.skippaths:
                    r = "Domain in skippath"
        if r is not None and self.verbose:
            print(domain + " " + r)
        return r is None

    # Argument: the URL to be checked
    def is_link_good(self, link):
        assert link is not None
        r = None
        if sys.getsizeof(link) > 1023:
            r = "Too long"
        try:
            schema, domain, path, query = normalize_link(link)
            if not schema.startswith("http"):
                r = "Bad schema"
            dg = self.is_domain_good(domain)
            if not dg:
                return False
            for c in link:
                if ord(c) >= 128:
                    r = "Bad link character"
                    break
            # Path
            for t in self.skiptypes:
                if path.lower().endswith(t):
                    r = "Bad type"
                    break
            if re.search(self.skipchars, path):
                r = "Bad path"
            for p in path.split("/"):
                if p in self.skippaths or "jpg" in p or "galeria" in p:
                    r = "Bad path"
                    break
        except ValueError:
            r = "Bad urlparse"
        return r is None

    def filter_links(self, links):
        # Filter links
        linkset = set()
        for link in links:
            if not self.is_link_good(link):
                continue
            link = urlunparse(normalize_link(link, strip_query=self.strip_query))
            linkset.add(link)
        return list(linkset)

    def full_extract(self, content, bs, work_link):
        pass

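
# Illustrative sketch: BaseParser.filter_links normalizes candidate links,
# strips query strings and drops unwanted paths and file types. The URLs below
# are made-up placeholders and the function is never called by the module itself.
def _demo_filter_links():
    parser = BaseParser()
    candidates = [
        "http://example.com/spravy/clanok-1?utm_source=rss",  # kept, query stripped
        "http://example.com/spravy/clanok-1",                 # duplicate after normalization
        "http://example.com/foto/galeria/123",                # dropped: "galeria" path segment
        "http://example.com/report.pdf",                      # dropped: skipped file type
    ]
    print(parser.filter_links(candidates))
    # Expected: ['http://example.com/spravy/clanok-1']
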
import trafilatura
import courlan


class TrafilaturaParser(BaseParser):
    def full_extract(self, content, bs, work_link):
        content.seek(0)
        content = content.read()
        res = trafilatura.bare_extraction(content, url=work_link, with_metadata=True,
                                          target_language="sk", include_formatting=True)
        print(res)
        pd = ParsedDocument()
        pd.work_link = work_link
        pd.current_time = datetime.date.today()
        # Extract text and metadata
        pd.body = res["text"]
        # pd.text_date
        # pd.tags = res["tags"]
        # pd.authors = res["author"]
        pd.article_published_time = res["date"]
        # pd.section = res["categories"]
        pd.link_set = get_bs_links(work_link, bs)
        return pd

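
# Illustrative sketch: get_date recognizes Slovak-style dates such as
# "7. marca 2015" in article text. The sample sentence is made up and the
# function is never called by the module itself.
def _demo_get_date():
    print(get_date("Publikované 7. marca 2015 o 10:30"))
    # Expected: [datetime.date(2015, 3, 7)]
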
class SoupParser(BaseParser):
    def __init__(self, verbose=False):
        BaseParser.__init__(self, verbose)
        self.justext_language = "Slovak"

    def full_extract(self, content, bs, work_link):
        """Parse the content and fill a ParsedDocument"""
        pd = ParsedDocument()
        pd.work_link = work_link
        pd.current_time = datetime.date.today()
        # Extract text and metadata
        pd.body, pd.text_date = self.extract_raw_text(content, pd.current_time)
        pd.tags, pd.authors, pd.title, pd.article_published_time, pd.description, pd.section = self.extract_meta(bs)
        pd.link_set = get_bs_links(work_link, bs)
        return pd

    def extract_raw_text(self, content, current_time):
        result = []
        rd = None
        paragraphs = []
        content.seek(0)
        try:
            c = content.read()
            paragraphs = justext.justext(c, justext.get_stoplist(self.justext_language),
                                         length_low=50, length_high=150)
            content.seek(0)
        except lxml.etree.XMLSyntaxError:
            print("XML syntax error")
        except lxml.etree.ParserError:
            print("XML parser error")
        except justext.core.JustextError:
            print("Justext error")
        except IndexError:
            print("XML error")
        except UnicodeDecodeError:
            print("Unicode error")
        except TypeError:
            # Null in string
            print("String error")
        except RuntimeError:
            # Maximum recursion depth
            print("Recursion error")
        dates = []
        for p in paragraphs:
            # TODO - match URL for date
            if p is not None and p.text is not None and len(p.text) > 0:
                dat = get_date(p.text)
                for d in dat:
                    dates.append(d)
                if self.verbose:
                    print(p.class_type, p.links_density(),
                          p.stopwords_density(justext.get_stoplist(self.justext_language)),
                          p.text)
                if not p.is_boilerplate:
                    result.append(p.text.strip())
        if len(dates) == 0:
            dates.append(current_time)
        if len(dates) > 0:
            rd = max(dates)
            rd = rd.isoformat()
        return "\n\n".join(result), rd

    # Extracts metainformation from the HTML document.
    # First it looks for name/content pairs in meta tags,
    # then it looks at OpenGraph properties.
    def extract_meta(self, bs):
        tags = set()
        authors = set()
        title = ""
        description = ""
        section = ""
        article_published_time = ""
        for m in bs.find_all("meta", attrs={"name": True, "content": True}):
            content = m["content"].strip()
            if len(content) == 0:
                continue
            name = m["name"].strip()
            if name == "keywords":
                for t in content.split(","):
                    if len(t.strip()) > 0:
                        tags.add(t.strip())
            if name == "news_keywords":
                for t in content.split(","):
                    if len(t.strip()) > 0:
                        tags.add(t.strip())
            if name == "author":
                authors.add(content)
            if name == "description":
                description = content
        for m in bs.find_all("meta", property=True, content=True):
            content = m["content"].strip()
            if len(content) == 0:
                continue
            prop = m["property"].strip()
            if prop == "og:title":
                title = content
            if prop == "article:published_time":
                try:
                    # Is it in ISO format?
                    d = dateutil.parser.parse(content)
                    article_published_time = d.isoformat()
                except ValueError:
                    pass
                except OverflowError:
                    pass
            if prop == "article:author" and "admin" not in content.lower():
                authors.add(content)
            if prop == "section":
                section = content
            if prop == "tag":
                tags.add(content)
            if prop == "og:description":
                description = content
        if len(title) < 2 and bs.h1 is not None:
            title = bs.h1.get_text(strip=True)
        if len(title) < 2 and bs.title is not None:
            title = bs.title.get_text(strip=True)
        if len(authors) == 0:
            for m in bs.find_all(property="author"):
                authors.add(m.get_text(strip=True))
        if len(authors) == 0:
            for m in bs.find_all(itemprop="author"):
                authors.add(m.get_text(strip=True))
        authors = set(filter(lambda x: len(x) > 2, authors))
        return tags, authors, title.strip(), article_published_time.strip(), description, section.strip()


class EnglishParser(SoupParser):
    def __init__(self):
        super(EnglishParser, self).__init__()
        self.justext_language = "English"
        self.allowdomains = set(["com", "org", "io"])


# Adapted from
# https://github.com/scrapy/scrapy/blob/master/scrapy/commands/runspider.py
def _import_file(filepath):
    abspath = os.path.abspath(filepath)
    dirname, file = os.path.split(abspath)
    fname, fext = os.path.splitext(file)
    if fext != '.py':
        raise ValueError("Not a Python source file: %s" % abspath)
    if dirname:
        sys.path = [dirname] + sys.path
    try:
        module = importlib.import_module(fname)
    finally:
        if dirname:
            sys.path.pop(0)
    return module


def iter_parser(module):
    """Return an iterator over all parser classes (subclasses of BaseParser)
    defined in the given module that can be instantiated.
    """
    for obj in vars(module).values():
        if inspect.isclass(obj) and \
                obj.__module__ == module.__name__ and \
                issubclass(obj, BaseParser):
            yield obj


def load_parser(file_name):
    pmodule = _import_file(file_name)
    parsers = [m for m in iter_parser(pmodule)]
    p = None
    if len(parsers) > 0:
        pc = parsers[-1]
        p = pc()
    return p
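

# Illustrative usage sketch, assuming a saved HTML page is available locally and
# that bs4 is installed (the module itself only receives BeautifulSoup objects).
# The file name and URL are placeholders.
if __name__ == "__main__":
    import io
    from bs4 import BeautifulSoup

    html_path = sys.argv[1] if len(sys.argv) > 1 else "page.html"
    with open(html_path, "rb") as f:
        raw = f.read()
    soup = BeautifulSoup(raw, "lxml")
    parser = SoupParser()
    doc = parser.full_extract(io.BytesIO(raw), soup, "https://example.sk/clanok")
    print(doc)  # short summary from ParsedDocument.__str__
    print(doc.article_published_time, doc.tags, len(doc.link_set), "links")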