import dateutil.parser
import justext
import re
import sys
import datetime

import lxml.etree
import urllib.parse
import os.path

import importlib
import os
import inspect


# Matches dates written as "d. m. yyyy", e.g. "3. 4. 2015"
datere = re.compile(r"\d{1,2}\.\s*\d{1,2}\.\s*[12]\d{3}")
yearre = re.compile(r"\s\d{4}\s")


def urlunparse(parsed_url):
    schema, netloc, path, query = parsed_url
    return urllib.parse.urlunparse((schema, netloc, path, "", query, ""))


def normalize_link(link, base=None, strip_query=False):
    link = link.strip().replace("\n", "").replace("\t", "").replace("\r", "")
    parsed_link = urllib.parse.urlparse(link)
    schema = parsed_link[0]
    netloc = parsed_link[1].strip().lower()
    path = parsed_link[2].strip()
    query = parsed_link[4]
    if strip_query:
        query = ""
    if path is None or len(path) == 0:
        path = "/"
    dirname, filename = os.path.split(path)
    if base is not None:
        parsed_base = urllib.parse.urlparse(base)
        if schema == "":
            schema = parsed_base[0]
        # If the link is relative
        if netloc == "":
            netloc = parsed_base[1]
            schema = parsed_base[0]
            bdir, bfile = os.path.split(parsed_base[2])
            if len(bdir) > 0 and bdir[0] != "." and len(dirname) > 0 and dirname[0] != "/":
                dirname = bdir + "/" + dirname
    # if len(dirname) == 0 or dirname[0] != '/':
    #    path = '/' + path
    dirname = os.path.normpath(dirname)
    dirname = dirname.lstrip("/").lstrip(".")
    path = dirname + "/" + filename
    return schema, netloc, path, query


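# Illustrative sketch, not part of the original module: how normalize_link()
# and urlunparse() compose when resolving a relative link against a base URL.
# The URLs below are made-up examples.
def _example_normalize_link():
    parts = normalize_link("sport/clanok.html", base="http://example.com/spravy/index.html")
    # parts is a (schema, netloc, path, query) tuple; urlunparse() turns it back
    # into an absolute URL, here roughly "http://example.com/spravy/sport/clanok.html".
    return urlunparse(parts)

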
def get_bs_links(work_link, bs):
    # Extract links from the page
    base = work_link
    if bs.base is not None and "href" in bs.base.attrs:
        base = bs.base["href"]
    link_set = set()
    # Normalize the links
    for l in bs.find_all("a", href=True):
        # BeautifulSoup treats rel as a multi-valued attribute (a list)
        if "nofollow" in l.attrs.get("rel", []) or "nofollow" in l.attrs:
            continue
        href = l["href"]
        try:
            nl = normalize_link(href, base)
            link = urlunparse(nl)
            if link == base:
                continue
            link_set.add(link)
        except ValueError:
            pass
    return link_set


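# Illustrative sketch, not part of the original module: extracting links from a
# small hand-written HTML snippet. Assumes BeautifulSoup is available; the
# crawler normally passes the soup object in.
def _example_get_bs_links():
    from bs4 import BeautifulSoup
    html = """<html><body>
    <a href="spravy/clanok.html">Article</a>
    <a href="http://other.example.org/">External</a>
    <a href="ignored.html" rel="nofollow">Ignored</a>
    </body></html>"""
    bs = BeautifulSoup(html, "html.parser")
    # Relative links are resolved against the work link; the nofollow link is skipped.
    return get_bs_links("http://example.com/index.html", bs)

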
def get_date(te):
    """
    Extract dates written as "d. m. yyyy" from the given text.
    Slovak month names are converted to month numbers first;
    only years between 2000 and 2020 are accepted.
    """
    dates = []
    words = []
    if te is None:
        te = ""
    for t in te.split():
        t = t.strip().lower().lstrip("0").replace("\r", "\n").replace("\n", "")
        if len(t) == 0:
            continue
        for i, m in enumerate(["jan", "feb", "mar", "apr", "máj", "jún", "júl", "aug", "sept", "okt", "nov", "dec"]):
            if t.startswith(m):
                t = str(i + 1) + "."
                break
        if t[0].isdigit():
            words.append(t)
    txt = " ".join(words)
    for st in re.findall(datere, txt):
        tokens = st.replace(" ", "").split(".")
        try:
            y = int(tokens[-1])
            if y < 2000 or y > 2020:
                continue
            m = 2
            d = 2
            if len(tokens) > 2:
                m = int(tokens[-2])
                d = int(tokens[-3])
            dates.append(datetime.date(y, m, d))
        except ValueError:
            pass
    return dates


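# Illustrative sketch, not part of the original module: the kind of value
# get_date() is expected to return for a typical Slovak date string.
def _example_get_date():
    # Should yield [datetime.date(2015, 4, 3)] for the "3. 4. 2015" date.
    return get_date("Publikované 3. 4. 2015 o 12:00")

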
class ParsedDocument:
    """
    One document in the database
    """
    def __init__(self):
        self.work_link = None

        self.content = None
        self.bs = None

        self.link_set = set()
        self.body = None
        self.text_date = None
        self.tags = None
        self.authors = None
        self.title = None
        self.description = None
        self.section = None
        self.article_published_time = None
        self.current_time = datetime.date.today()

    def __str__(self):
        r = []
        if self.authors is not None:
            r.append(",".join(self.authors))
        if self.title is not None:
            r.append(self.title)
        if self.body is not None:
            if len(self.body) < 20:
                r.append(self.body)
            else:
                r.append(self.body[0:20] + " ....")
        return ">>> ".join(r)


class BaseParser:
    def __init__(self, verbose=False):
        self.strip_query = True
        self.skiptypes = [".gz", ".js", ".avi", ".flv", ".zip", ".xls", ".doc", ".rtf", ".odt", ".mp3", ".mp4", ".wmv", ".jpg", ".png", ".txt", ".pdf", ".css", ".gif", ".tgz",
                          ".7", ".ogg", "rss", "galeria", "gallery", ".jpeg", ".mpg", ".mpeg", ".xml", ".rar", ".xlsx", ".docx", ".pptx", ".odp", ".iso", ".ppt", ".bz", ".dwg", ".eps", ".bin"]
        self.skipchars = re.compile(r"[();:@& ]")
        self.verbose = verbose
        self.domain_re = re.compile(r"^((?!-)[A-Za-z0-9-]{1,63}(?<!-)\.)+[A-Za-z]{2,6}$")
        self.recent_links = 5
        self.old_links = 3
        self.random_links = 10
        self.crawl_rounds = 3
        self.skipdomains = set()
        self.allowdomains = set()
        self.skippaths = set(["sort", "search", "sortby", "gallery", "images", "pictures", "order", "add", "insert", "cart", "filter", "ssid", "session", "print", "rss", "feed", "login", "register", "delivery", "form", "file", "img", "shopby", "foto", "orderby", "desc", "eshop", "e-shop", "email", "flog"])

    def is_domain_good(self, domain):
        r = None
        # Netloc
        if ":" in domain:
            r = "Port in domain"
        elif len(domain) < 4:
            r = "Too short domain"
        elif len(domain) > 127:
            r = "Too long location"
        elif domain.startswith(".") or domain.endswith("."):
            r = "Malformed domain"
        elif not self.domain_re.match(domain):
            r = "Bad domain"
        else:
            da = False
            for d in self.allowdomains:
                if domain.endswith(d):
                    da = True
                    break
            if not da and len(self.allowdomains) > 0:
                r = "Domain not in allowlist"
            for d in self.skipdomains:
                if domain.endswith(d):
                    r = "In domain skiplist"
            for d in domain.split("."):
                if d in self.skippaths:
                    r = "Domain in skippath"
        if r is not None and self.verbose:
            print(domain + " " + r)
        return r is None

    # Argument: the URL to check (it is parsed inside)
    def is_link_good(self, link):
        assert link is not None
        r = None
        if len(link) > 1023:
            r = "Too long"
        try:
            schema, domain, path, query = normalize_link(link)
            if not schema.startswith("http"):
                r = "Bad schema"
            dg = self.is_domain_good(domain)
            if not dg:
                return False
            for c in link:
                if ord(c) >= 128:
                    r = "Bad link character"
                    break
            # Path
            for t in self.skiptypes:
                if path.lower().endswith(t):
                    r = "Bad type"
                    break
            if re.search(self.skipchars, path):
                r = "Bad path"
            for p in path.split("/"):
                if p in self.skippaths or "jpg" in p or "galeria" in p:
                    r = "Bad path"
                    break
        except ValueError:
            r = "Bad urlparse"
        return r is None

    def filter_links(self, links):
        # Filter links
        linkset = set()
        for link in links:
            if not self.is_link_good(link):
                continue
            link = urlunparse(normalize_link(link, strip_query=self.strip_query))
            linkset.add(link)

        return list(linkset)

    def full_extract(self, content, bs, work_link):
        pass


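# Illustrative sketch, not part of the original module: how the BaseParser
# filters behave. The URLs are made-up; the gallery image is rejected because
# of its ".jpg" suffix and "gallery" path component, so only the article survives.
def _example_filter_links():
    parser = BaseParser()
    links = [
        "http://example.com/clanok",
        "http://example.com/gallery/1.jpg",
    ]
    return parser.filter_links(links)

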
import trafilatura
import courlan


class TrafilaturaParser(BaseParser):
    def full_extract(self, content, bs, work_link):
        content.seek(0)
        content = content.read()
        res = trafilatura.bare_extraction(content, url=work_link, with_metadata=True, target_language="sk", include_formatting=True)
        if self.verbose:
            print(res)
        pd = ParsedDocument()
        pd.work_link = work_link
        pd.current_time = datetime.date.today()
        # Extract text and metatext
        pd.body = res["text"]
        #pd.text_date
        #pd.tags = res["tags"]
        #pd.authors = res["author"]
        pd.article_published_time = res["date"]
        #pd.section = res["categories"]
        pd.link_set = get_bs_links(work_link, bs)
        return pd


class SoupParser(BaseParser):
    def __init__(self, verbose=False):
        BaseParser.__init__(self, verbose)
        self.justext_language = "Slovak"

    def full_extract(self, content, bs, work_link):
        """
        Parse content and fill the object
        """
        pd = ParsedDocument()
        pd.work_link = work_link

        pd.current_time = datetime.date.today()
        # Extract text and metatext
        pd.body, pd.text_date = self.extract_raw_text(content, pd.current_time)
        pd.tags, pd.authors, pd.title, pd.article_published_time, pd.description, pd.section = self.extract_meta(bs)
        pd.link_set = get_bs_links(work_link, bs)
        return pd

    def extract_raw_text(self, content, current_time):
        result = []
        rd = None
        paragraphs = []
        content.seek(0)
        try:
            c = content.read()
            paragraphs = justext.justext(c, justext.get_stoplist(self.justext_language), length_low=50, length_high=150)
            content.seek(0)
        except lxml.etree.XMLSyntaxError:
            print("XML Syntax parse error")
        except lxml.etree.ParserError:
            print("XML Parse parse error")
        except justext.core.JustextError:
            print("Justext error")
        except IndexError:
            print("XML error")
        except UnicodeDecodeError:
            print("Unicode Error")
        except TypeError:
            # Null in string
            print("String Error")
        except RuntimeError:
            # Maximum recursion depth
            print("Recursion Error")
        dates = []
        for p in paragraphs:
            # TODO - match URL for date
            if p is not None and p.text is not None and len(p.text) > 0:
                dat = get_date(p.text)
                for d in dat:
                    dates.append(d)
                if self.verbose:
                    print(p.class_type, p.links_density(), p.stopwords_density(
                        justext.get_stoplist(self.justext_language)), p.text)
                if not p.is_boilerplate:
                    result.append(p.text.strip())
        # Fall back to the crawl time if no date was found in the text
        if len(dates) == 0:
            dates.append(current_time)
        rd = max(dates).isoformat()

        return "\n\n".join(result), rd

    # Extracts metainformation from the HTML.
    # First it looks for name/content pairs in meta tags,
    # then it looks at the OpenGraph properties.
    def extract_meta(self, bs):
        tags = set()
        authors = set()
        title = ""
        description = ""
        section = ""
        article_published_time = ""

        for m in bs.find_all("meta", attrs={"name": True, "content": True}):
            content = m["content"].strip()
            if len(content) == 0:
                continue
            name = m["name"].strip()
            if name == "keywords":
                for t in content.split(","):
                    if len(t.strip()) > 0:
                        tags.add(t.strip())
            if name == "news_keywords":
                for t in content.split(","):
                    if len(t.strip()) > 0:
                        tags.add(t.strip())
            if name == "author":
                authors.add(content)
            if name == "description":
                description = content

        for m in bs.find_all("meta", property=True, content=True):
            content = m["content"].strip()
            if len(content) == 0:
                continue
            prop = m["property"].strip()
            if prop == "og:title":
                title = content
            if prop == "article:published_time":
                try:
                    # Is it in ISO format?
                    d = dateutil.parser.parse(content)
                    article_published_time = d.isoformat()
                except ValueError:
                    pass
                except OverflowError:
                    pass
            if prop == "article:author" and "admin" not in content.lower():
                authors.add(content)
            if prop == "section":
                section = content
            if prop == "tag":
                tags.add(content)
            if prop == "og:description":
                description = content

        if len(title) < 2 and bs.h1 is not None:
            title = bs.h1.get_text(strip=True)
        if len(title) < 2 and bs.title is not None:
            title = bs.title.get_text(strip=True)
        if len(authors) == 0:
            for m in bs.find_all(property="author"):
                authors.add(m.get_text(strip=True))
        if len(authors) == 0:
            for m in bs.find_all(itemprop="author"):
                authors.add(m.get_text(strip=True))
        authors = set(filter(lambda x: len(x) > 2, authors))

        return tags, authors, title.strip(), article_published_time.strip(), description, section.strip()


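# Illustrative sketch, not part of the original module: driving SoupParser by
# hand on an in-memory page. Assumes BeautifulSoup is available; the crawler
# normally supplies both the raw content and the soup object.
def _example_soup_parser():
    import io
    from bs4 import BeautifulSoup
    html = b"<html><head><title>Test</title></head><body><p>Ahoj svet.</p></body></html>"
    parser = SoupParser()
    bs = BeautifulSoup(html, "html.parser")
    # Returns a ParsedDocument with the body text, metadata and outgoing links.
    return parser.full_extract(io.BytesIO(html), bs, "http://example.com/test")

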
class EnglishParser(SoupParser):
    def __init__(self):
        super(EnglishParser, self).__init__()
        self.justext_language = "English"
        self.allowdomains = set(["com", "org", "io"])


# https://github.com/scrapy/scrapy/blob/master/scrapy/commands/runspider.py
def _import_file(filepath):
    abspath = os.path.abspath(filepath)
    dirname, file = os.path.split(abspath)
    fname, fext = os.path.splitext(file)
    if fext != '.py':
        raise ValueError("Not a Python source file: %s" % abspath)
    if dirname:
        sys.path = [dirname] + sys.path
    try:
        module = importlib.import_module(fname)
    finally:
        if dirname:
            sys.path.pop(0)
    return module


def iter_parser(module):
    """Return an iterator over all parser classes defined in the given module
    that can be instantiated (i.e. BaseParser subclasses declared in that module).
    """
    for obj in vars(module).values():
        if inspect.isclass(obj) and \
                obj.__module__ == module.__name__ and \
                issubclass(obj, BaseParser):
            yield obj


def load_parser(file_name):
    pmodule = _import_file(file_name)
    parsers = [m for m in iter_parser(pmodule)]
    p = None
    if len(parsers) > 0:
        pc = parsers[-1]
        p = pc()
    return p

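
# Illustrative sketch, not part of the original module: loading a project-specific
# parser from a standalone .py file, in the spirit of scrapy's runspider command.
# The path "my_parser.py" is a made-up example; the file is expected to define a
# BaseParser subclass, and the last one found is instantiated.
def _example_load_parser():
    parser = load_parser("my_parser.py")
    if parser is None:
        raise ValueError("No BaseParser subclass found in my_parser.py")
    return parser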