websucker-pip/websucker/agent.py

#!/usr/bin/env python
#! -*- coding: utf-8 -*-
import urllib.parse
import urllib.error
import os
import os.path
import re
import datetime
import time
import sys
import tempfile
import pprint
import bs4

import pycurl
import urllib.robotparser
import collections
import random


from websucker.parser import normalize_link,urlunparse


# Parses http refresh in header or on html meta
def get_refresh(ref,target_link):
    """
    Extract the redirect target from a refresh value.

    The value has the form "<seconds>; url=<target>", as found in an HTTP
    Refresh header or in the content attribute of a meta refresh tag.

    @param ref raw refresh value
    @param target_link link passed to normalize_link as the base for the target
    @return normalized target link as a string, or None when the value
            carries no usable url= part
    """
    # Split on the first ';' only, so a target that itself contains ';'
    # is not truncated.
    tokens = ref.strip().split(";", 1)
    if len(tokens) < 2:
        return None
    # The usual spelling has a space after ';' ("0; url=..."), so the token
    # has to be stripped before looking for the prefix.
    target = tokens[1].strip()
    if not target.lower().startswith("url="):
        return None
    # The target may be wrapped in single or double quotes.
    target = target[4:].strip().strip("'\"").strip()
    if len(target) == 0:
        return None
    return urlunparse(normalize_link(target, target_link))

class Response:
    """
    Result of one download: requested url, status, headers, body and the
    parsed document (when the download was good and the body could be parsed).
    """

    # The http-equiv value of a meta refresh tag may use any letter case.
    _META_REFRESH = re.compile(r"^\s*refresh\s*$", re.IGNORECASE)

    def __init__(self,url,headers,status,content,redirects,link_status):
        """
        @param url downloaded url, must be non-empty and must not start with "/"
        @param headers response headers
        @param status HTTP status code
        @param content file-like object with the body, or None
        @param redirects links recorded while downloading
        @param link_status status string; the body is parsed only when it is "good"
        """
        assert len(url) > 0
        assert url[0] != "/"
        self.url = url
        self.status = status
        self.content = content
        self.headers = headers
        self.redirects = redirects
        self.visited_time = datetime.date.today()
        self.bs = None
        if content is not None and link_status == "good":
            try:
                self.bs = bs4.BeautifulSoup(content, "lxml")
            except ValueError:
                # The body could not be parsed, the link is marked accordingly.
                link_status = "bad_parse"
        self.link_status = link_status

    def __str__(self):
        return "{} {} {}".format(self.url,self.get_canonical(),self.link_status)

    def get_content(self):
        """
        @return body decoded as utf8 (undecodable bytes are replaced),
                or None when there is no body
        """
        if self.content is None:
            print("NO CONTENT")
            print(self.url,self.redirects)
            return None
        self.content.seek(0)
        raw = self.content.read()
        return str(raw,encoding="utf8",errors="replace")

    def get_metarefresh(self):
        """
        Find a HTML meta refresh redirect in the parsed document.

        @return normalized target link of the first meta refresh tag that
                carries a usable target, or None
        """
        if self.bs is None:
            return None
        canonical = self.get_canonical()
        found = self.bs.find_all("meta", attrs={"http-equiv": self._META_REFRESH})
        for tag in found:
            # `"content" in tag` would test the children of the tag, not its
            # attributes, so the attribute is read with get().
            value = tag.get("content")
            if not value:
                continue
            target = get_refresh(value, canonical)
            if target is None:
                continue
            nl = normalize_link(target, canonical)
            print("Metarefresh")
            print(nl)
            return urlunparse(nl)
        return None

    def get_canonical(self):
        """
        @return link from <link rel="canonical"> of the parsed document when
                present, otherwise the last redirect, otherwise the url;
                always normalized against the last redirect (or the url)
        """
        last_link = self.url
        if len(self.redirects) > 0:
            last_link = self.redirects[-1]
        r = None
        if self.bs is not None:
            tag = self.bs.find("link", rel="canonical", href=True)
            if tag is not None:
                r = urlunparse(normalize_link(tag["href"], last_link))
        if r is None:
            r = last_link
        r = urlunparse(normalize_link(r, last_link))
        assert len(r) > 0
        assert r[0] != "/"
        return r

    def get_redirects(self):
        """
        @return all recorded redirects except the last one,
                empty list when fewer than two were recorded
        """
        if len(self.redirects) < 2:
            return []
        return self.redirects[0:-1]


class Connection:
    """
    Downloader built on one reusable pycurl handle.

    Keeps the headers, redirects and status of the most recent download and
    caches one robots.txt parser (or None) per domain.
    """

    # pycurl error numbers that do not mean "bad_connection":
    #   23 - the transfer was aborted by the header callback (content is not text)
    #   22 - HTTP error status (FAILONERROR is set)
    # Every other error number (6 DNS, 7 refused, 16 and 92 HTTP2, 28 timeout,
    # 56 reset by peer, 60 bad SSL certificate, ...) is a "bad_connection".
    _ERRNO_STATUS = {23: "bad_type", 22: "bad_httpcode"}

    def __init__(self):
        self.useragent = "Googlebot-News"
        self.c = pycurl.Curl()
        self.c.setopt(self.c.FOLLOWLOCATION, True)
        self.c.setopt(self.c.CONNECTTIMEOUT, 20)
        self.c.setopt(self.c.TIMEOUT, 20)
        self.c.setopt(self.c.FAILONERROR, True)
        self.c.setopt(self.c.HTTPHEADER, [
                      'Accept: text/html', 'Accept-Charset: UTF-8'])
        self.c.setopt(self.c.HEADERFUNCTION, self.header)
        self.c.setopt(self.c.USERAGENT,self.useragent )
        self.robots = {}
        self.headers = {}
        self.redirects = []
        self.header_lines = []
        self.status = 0
        # html_download2 follows at most this many meta refresh hops.
        self.max_redirect = 4

    def header(self, data):
        """
        Header callback of the curl handle, called with one header line.

        Records the line, remembers Location and Refresh targets as
        redirects and stops the transfer when the content is not text.

        @param data raw header line as bytes
        @return 0 to abort the transfer, otherwise None
        """
        if len(data) == 0:
            return None
        # An undecodable byte in a header must not abort the download.
        line = str(data, encoding="utf8", errors="replace")
        self.header_lines.append(line)
        key, separator, value = line.partition(":")
        key = key.strip()
        if not separator or len(key) == 0:
            # Status line or the blank line that ends the headers.
            return None
        value = value.strip()
        self.headers[key] = value
        kl = key.lower()
        if kl == "refresh":
            # The value looks like "0; url=...", only the target is a link.
            target = get_refresh(value, self._last_link())
            if target is not None:
                self.add_redirect(target)
        elif kl == "location":
            self.add_redirect(value)
        elif kl == "content-type" and "text" not in value:
            # pycurl then raises error 23, failed writing header
            return 0
        return None

    def crawl_delay(self,domain):
        """
        Sleep for the crawl delay that robots.txt of the domain gives for
        the user agent, or for 4 seconds when there is none.
        """
        self.cache_robot(domain)
        delay = 4
        robot = self.robots.get(domain)
        if robot is not None:
            d = robot.crawl_delay(self.useragent)
            if d is not None:
                delay = d
        print("Waiting for {} s".format(delay))
        time.sleep(delay)

    def __del__(self):
        # The handle is missing when __init__ did not finish.
        handle = getattr(self, "c", None)
        if handle is not None:
            handle.close()

    def close(self):
        self.c.close()

    def _last_link(self):
        """
        @return last recorded redirect, or the downloaded url when there is none
        """
        if len(self.redirects) > 0:
            return self.redirects[-1]
        return self.url

    def _header_value(self, name):
        """
        @return value of the header with the given name compared without
                regard to letter case, or None
        """
        wanted = name.lower()
        found = None
        for key, value in self.headers.items():
            if key.lower() == wanted:
                found = value
        return found

    def add_redirect(self,link):
        """
        Record a redirect target, normalized against the last link.
        A target equal to the last link or already recorded is ignored.
        """
        last_link = self._last_link()
        v = urlunparse(normalize_link(link, last_link))
        if v != last_link and v not in self.redirects:
            self.redirects.append(v)

    def _download(self, url):
        """
        Download one url and reset the per-download state of the connection.

        pycurl and unicode errors are not raised, they are reported in
        link_status.

        @param url link to download
        @return content, link_status - content is a temporary file, or None
                when the transfer failed; link_status is one of "good",
                "bad_httpcode", "bad_type", "bad_connection", "bad_unicode"
        """
        print("Downloading " + url)
        self.url = url
        self.headers = {}
        self.redirects = []
        self.header_lines = []
        self.status = 0
        link_status = "bad_connection"
        content = tempfile.SpooledTemporaryFile()
        try:
            self.c.setopt(self.c.WRITEDATA, content)
            self.c.setopt(self.c.URL, url)
            self.c.perform()
            self.status = self.c.getinfo(self.c.RESPONSE_CODE)
            content_type = self._header_value("content-type")
            if self.status != 200:
                link_status = "bad_httpcode"
            elif content_type is not None and not content_type.startswith("text"):
                link_status = "bad_type"
            else:
                link_status = "good"
                content.seek(0)
        except pycurl.error as e:
            errno = e.args[0]
            content.close()
            content = None
            self.status = self.c.getinfo(self.c.RESPONSE_CODE)
            link_status = self._ERRNO_STATUS.get(errno, "bad_connection")
        except (UnicodeDecodeError, UnicodeEncodeError):
            content.close()
            content = None
            link_status = "bad_unicode"
        sz = self.c.getinfo(self.c.SIZE_DOWNLOAD)
        tt = self.c.getinfo(self.c.TOTAL_TIME)
        print("{} Received {} bytes in {} s".format(self.status,sz,tt))
        return content, link_status

    def html_download2(self, url):
        """
        Download a link and follow HTML meta refresh redirects.

        @param url link to download
        @return list of Response, one per download; at most max_redirect + 1
        """
        dlink = url
        responses = []
        seen = set()
        while len(responses) <= self.max_redirect:
            nl = normalize_link(dlink)
            url = urlunparse(nl)
            assert url.startswith("http")
            if url in seen:
                # A page that refreshes to an already downloaded link
                # would only be downloaded again.
                break
            seen.add(url)
            content, link_status = self._download(url)
            response = Response(url,"\r\n".join(self.header_lines),self.status,content,self.redirects,link_status)
            dlink = response.get_metarefresh()
            responses.append(response)
            if dlink is None:
                break
        return responses

    def cache_robot(self,domain):
        """
        Download and parse robots.txt of the domain, unless already cached.
        None is cached when the download was not good.
        """
        if domain in self.robots:
            return
        roboturl = urlunparse(("https", domain, "robots.txt", ""))
        try:
            content, link_status = self._download(roboturl)
        except pycurl.error as err:
            print(err)
            return
        if link_status == "good":
            text = str(content.read(), errors="ignore", encoding="utf8")
            robot = urllib.robotparser.RobotFileParser()
            robot.parse(text.split("\n"))
            self.robots[domain] = robot
        else:
            self.robots[domain] = None
        if content is not None:
            content.close()

    def is_robot_good(self, url):
        """
        @return True when robots.txt of the domain allows the user agent of
                this connection to fetch the url, or when no robots.txt
                could be loaded
        """
        schema, domain, path, query = normalize_link(url)
        self.cache_robot(domain)
        robot = self.robots.get(domain)
        if robot is None:
            return True
        # Ask for the agent name that is really sent with the requests.
        return robot.can_fetch(self.useragent, url)

class ParsedDocument:
    """
    One document in the database
    """
    def __init__(self, parser,work_link):
        """
        @param parser parser used for text, metadata and link checks
        @param work_link link of the document, base for its relative links
        """
        self.parser = parser
        self.work_link = work_link

        self.content = None
        self.bs = None
        self.paragraph_checksums = None
        self.paragraph_sizes = None

        self.link_set = set()
        self.body = None
        self.text_date = None
        self.tags = None
        self.authors = None
        self.title = None
        self.description = None
        self.section = None
        self.article_published_time = None
        self.current_time = datetime.date.today()

    @staticmethod
    def _is_nofollow(anchor):
        """
        @return True when the anchor has rel with the value nofollow
                or an attribute named nofollow
        """
        rel = anchor.attrs.get("rel") or []
        # rel is usually a list of values, a plain string is split to be safe.
        if isinstance(rel, str):
            rel = rel.split()
        if any(value.lower() == "nofollow" for value in rel):
            return True
        return "nofollow" in anchor.attrs

    def extract(self,content,bs):
        """
        Parse content and fill the object

        @param content downloaded content, passed to the parser
        @param bs parsed document or None; without it only the text and
                  the paragraph checksums are filled
        """
        self.content = content
        self.bs = bs

        # Extract text and metatext
        self.body, self.text_date = self.parser.extract_raw_text(content, self.current_time)
        # Paragraph Checksums
        pch,pszs = self.parser.calculate_checksums(self.body)
        self.paragraph_checksums = pch
        self.paragraph_sizes = pszs
        if bs is None:
            return
        (self.tags, self.authors, self.title, published,
         self.description, self.section) = self.parser.extract_meta(bs)
        self.article_published_time = published
        # The value used to be stored only under this misspelled name,
        # it is kept for code that may still read it.
        self.article_publilshed_time = published

        # Extract links from the page
        base = self.work_link
        if bs.base is not None and "href" in bs.base.attrs:
            base = bs.base["href"]
        # Normalize the links
        for anchor in bs.find_all("a", href=True):
            if self._is_nofollow(anchor):
                continue
            try:
                link = urlunparse(normalize_link(anchor["href"], base))
            except ValueError:
                # A link that can not be normalized is skipped.
                continue
            if link != base:
                self.link_set.add(link)

    def get_links(self):
        """
        @return all links
        """
        return self.link_set

    def get_follow_links(self):
        """
        @return good normalized links
        """
        follow_links = set()
        for l in self.link_set:
            if self.parser.is_link_good(l):
                link = normalize_link(l,strip_query=self.parser.strip_query)
                follow_links.add(urlunparse(link))
        return follow_links

    def __str__(self):
        """
        @return title and the first 20 characters of the body joined by ">>> "
        """
        r = []
        if self.title is not None:
            r.append(self.title)
        if self.body is not None:
            if len(self.body) < 20:
                r.append(self.body)
            else:
                r.append(self.body[0:20] + " ....")
        return ">>> ".join(r)


def get_domains(arg):
    """
    Get domains from argument or stdin
    if arg is -, get one domain per line from stdin, else split arg.
    Surrounding whitespace is removed and empty entries are skipped
    in both cases.
    @param arg dash or domains separated by comma
    @return list of domains
    """
    if arg == "-":
        candidates = sys.stdin
    else:
        candidates = arg.split(",")
    domains = []
    for candidate in candidates:
        domain = candidate.strip()
        if len(domain) == 0:
            continue
        domains.append(domain)
    return domains

def parse_and_index(work_link,parser,responses,db):
    """
    Store the responses of one work link and, when the last response has a
    parsed document, parse it and store its content too.
    @param work_link - link the responses were downloaded for
    @param parser to use
    @param responses from the downloader
    @param db
    @return target link and links found in the document; the work link and
            an empty list when nothing was parsed
    """
    if len(responses) == 0:
        return work_link, []
    db.index_responses(work_link,responses)
    last = responses[-1]
    if last.bs is None:
        return work_link, []
    canonical = last.get_canonical()
    document = ParsedDocument(parser,canonical)
    document.extract(last.content, last.bs)
    db.index_content(canonical,document)
    return canonical, document.get_links()

def visit_sitemap(domain,connection,parser,db):
    """
    Download the front page of the domain, index it and
    store the links found on it.
    @return False when nothing was downloaded or the last response
            has no parsed document, True otherwise
    """
    front = "http://" + domain
    print("Sitemap visit: " + front)
    responses = connection.html_download2(front)
    if len(responses) == 0 or responses[-1].bs is None:
        return False

    _, found = parse_and_index(front,parser,responses,db)
    if len(found) > 0:
        db.index_follow_links(parser,found,connection)
    return True


def visit_links(links,connection,parser,db,is_online):
    """
    Check and visit the given links.

    A link rejected by the parser is marked "bad_link". When the site is
    online, a link forbidden by robots is marked "bad_robot" and any other
    link is downloaded, indexed and followed by the crawl delay of its
    domain. When the site is not online, the links are only checked.
    """
    collected = []
    for work_link in links:
        if not parser.is_link_good(work_link):
            db.update_link_status(work_link,"bad_link")
            continue
        if not is_online:
            continue
        if not connection.is_robot_good(work_link):
            db.update_link_status(work_link,"bad_robot")
            continue
        responses = connection.html_download2(work_link)
        target_link,found = parse_and_index(work_link,parser,responses,db)
        target_domain = normalize_link(target_link)[1]
        connection.crawl_delay(target_domain)
        collected += found
    if len(collected) > 0:
        db.index_follow_links(parser,collected,connection)
def visit_domain(domain,parser,db):
    """
    One visit of the domain

    1. Get links from the frontpage,
    2. visit links and extract new links
    3. get new links to visit
    4. repeat visit for parser.crawl_rounds

    The domain is checked in the database after every round, or once
    when the domain is not good or not online.

    @return True
    """
    c = Connection()
    try:
        # Get links from frontpage
        # TODO Sitemap
        is_online = False
        if parser.is_domain_good(domain):
            # Is domain online?
            is_online = visit_sitemap(domain,c,parser,db)
        if is_online:
            for _ in range(parser.crawl_rounds):
                # Visit links from frontpage
                links = db.get_visit_links(domain,parser.recent_links,parser.old_links,parser.random_links)
                visit_links(links,c,parser,db,is_online)
                db.check_domain(domain)
        else:
            db.check_domain(domain)
    finally:
        # Release the curl handle even when the visit fails.
        c.close()
    return True

def process_domains(domains,visit,parser,db,queue):
    """
    Shuffle the domains in place, then queue and/or visit them.

    The first item of every entry is the domain name.
    @param domains list of entries, shuffled in place
    @param visit when true, every domain is visited immediately
    @param parser passed to visit_domain
    @param db passed to visit_domain
    @param queue when not None, every domain name is put into it
           and the queue is closed
    """
    print("Websucker Agenda>>")
    random.shuffle(domains)
    for entry in domains:
        name = entry[0]
        assert len(name) > 1
        print(entry)
    if queue is not None:
        print("Queuing:")
        for entry in domains:
            print(entry)
            name = entry[0]
            queue.put(name)
        queue.close()
    if visit:
        print("Visiting:")
        for entry in domains:
            print(entry)
            name = entry[0]
            visit_domain(name,parser,db)

def work_domains(parser,db,queue):
    """
    Poll the queue forever and visit every domain received.

    Each job is reserved, buried, visited and then deleted, in this order.
    The function has no exit from its loop.

    @param parser passed to visit_domain
    @param db passed to visit_domain
    @param queue job queue with reserve, bury and delete
    """
    while True:
        print("Waiting for a new job:")
        job = queue.reserve()
        # The body of the job is used as the domain name.
        domain = job.body
        # NOTE(review): burying before the visit presumably keeps the job from
        # being handed out again while it is processed - confirm with the queue.
        queue.bury(job)
        print("Visiting:")
        visit_domain(domain,parser,db)
        # Reached only when visit_domain returned without raising.
        queue.delete(job)
initial 2020-05-07 14:09:45 +00:00			`#!/usr/bin/env python`
			`#! -- coding: utf-8 --`
			`import urllib.parse`
			`import urllib.error`
			`import os`
			`import os.path`
			`import re`
			`import datetime`
			`import time`
			`import sys`
			`import tempfile`
			`import pprint`
			`import bs4`

			`import pycurl`
			`import urllib.robotparser`
zz 2020-05-09 09:50:50 +00:00			`import collections`
zz 2020-06-08 14:09:47 +00:00			`import random`
initial 2020-05-07 14:09:45 +00:00

			`from websucker.parser import normalize_link,urlunparse`


			`# Parses http refresh in header or on html meta`
			`def get_refresh(ref,target_link):`
			`refresh = None`
			`tokens = ref.strip().split(";")`
			`if len(tokens) > 1 and tokens[1].lower().startswith("url="):`
			`refresh = urlunparse(normalize_link(`
			`tokens[1][4:].strip("\'"), target_link))`
			`return refresh`

			`class Response:`
			`def __init__(self,url,headers,status,content,redirects,link_status):`
			`assert len(url) > 0`
			`assert url[0] != "/"`
			`self.url = url`
			`self.status = status`
			`self.content = content`
			`self.headers = headers`
			`self.redirects = redirects`
			`self.visited_time = datetime.date.today()`
			`self.bs = None`
			`if content is not None and link_status == "good":`
unvisited strategy 2020-06-06 09:29:36 +00:00			`try:`
			`self.bs = bs4.BeautifulSoup(content, "lxml")`
			`except ValueError:`
			`link_status = "bad_parse"`
			`self.link_status = link_status`
initial 2020-05-07 14:09:45 +00:00
			`def __str__(self):`
			`return "{} {} {}".format(self.url,self.get_canonical(),self.link_status)`

			`def get_content(self):`
			`if self.content is None:`
			`print("NO CONTENT")`
			`print(self.url,self.redirects)`
			`return None`
			`self.content.seek(0)`
			`text = self.content.read()`
			`out = str(text,encoding="utf8",errors="replace")`
			`return out`


			`# HMTL metarefresh redirect`
			`def get_metarefresh(self):`
zz 2020-05-10 09:48:17 +00:00			`if self.bs is None:`
initial 2020-05-07 14:09:45 +00:00			`return None`
			`metarefresh = None`
			`t = self.bs.find_all("meta", attrs={"http-equiv": "refresh"})`
			`canonical = self.get_canonical()`
			`for tags in t:`
			`if "content" in tags:`
			`metarefresh = get_refresh(tags["content"],canonical)`
			`if metarefresh is not None:`
			`nl = normalize_link(metarefresh, canonical)`
			`print("Metarefresh")`
			`print(nl)`
			`metarefresh = urlunparse(nl)`

			`return metarefresh`

			`def get_canonical(self):`
			`r = None`
			`last_link = self.url`
			`if len(self.redirects) > 0:`
			`last_link = self.redirects[-1]`
			`if self.bs is not None:`
			`l = self.bs.find("link", rel="canonical", href=True)`
			`if l is not None:`
			`r = urlunparse(normalize_link(l["href"], last_link))`
			`if r is None:`
			`r = last_link`
			`r = urlunparse(normalize_link(r, last_link))`
			`assert len(r) > 0`
			`assert r[0] != "/"`
			`return r`

			`def get_redirects(self):`
			`if len(self.redirects) <2 :`
			`return []`
			`return self.redirects[0:-1]`


			`class Connection:`
			`def __init__(self):`
zz 2020-05-09 09:50:50 +00:00			`self.useragent = "Googlebot-News"`
initial 2020-05-07 14:09:45 +00:00			`self.c = pycurl.Curl()`
			`self.c.setopt(self.c.FOLLOWLOCATION, True)`
			`# self.c.setopt(self.c.VERBOSE, True)`
			`self.c.setopt(self.c.CONNECTTIMEOUT, 20)`
			`self.c.setopt(self.c.TIMEOUT, 20)`
			`self.c.setopt(self.c.FAILONERROR, True)`
			`self.c.setopt(self.c.HTTPHEADER, [`
			`'Accept: text/html', 'Accept-Charset: UTF-8'])`
			`self.c.setopt(self.c.HEADERFUNCTION, self.header)`
zz 2020-05-09 09:50:50 +00:00			`self.c.setopt(self.c.USERAGENT,self.useragent )`
initial 2020-05-07 14:09:45 +00:00			`# #self.c.setopt(pycurl.COOKIEJAR, 'cookie.txt')`
			`# #self.c.setopt(pycurl.COOKIEFILE, 'cookie.txt')`
			`self.robots = {}`
			`self.headers = {}`
			`self.redirects = []`
			`self.header_lines = []`
			`self.status = 0`
			`self.max_redirect = 4`

			`# Zastavi spracovanie ak content nie je text`
			`# zaznamena location a refresh`
			`def header(self, data):`
			`if len(data) == 0:`
			`return None`
			`l = str(data, encoding="utf8")`
			`self.header_lines.append(l)`
			`s = l.find(" ")`
			`if s >= 1 and s < len(l):`
			`key = l[0:s - 1]`
			`value = l[s + 1:].rstrip()`
			`self.headers[key] = value`
zz 2020-05-10 09:48:17 +00:00			`kl = key.lower()`
			`if kl == "refresh":`
initial 2020-05-07 14:09:45 +00:00			`self.add_redirect(value)`
zz 2020-05-10 09:48:17 +00:00			`elif kl == "location":`
initial 2020-05-07 14:09:45 +00:00			`self.add_redirect(value)`
zz 2020-05-10 09:48:17 +00:00			`elif kl == "content-type" and "text" not in value:`
initial 2020-05-07 14:09:45 +00:00			`# Pycurl potom vyhodi 23, failed writing header`
			`return 0`

zz 2020-05-09 09:50:50 +00:00			`def crawl_delay(self,domain):`
			`self.cache_robot(domain)`
			`delay = 4`
			`if domain in self.robots:`
			`r = self.robots[domain]`
			`if r is not None:`
			`d = r.crawl_delay(self.useragent)`
			`if d is not None:`
			`delay = d`
			`print("Waiting for {} s".format(delay))`
			`time.sleep(delay)`

initial 2020-05-07 14:09:45 +00:00			`def __del__(self):`
			`self.c.close()`

			`def close(self):`
			`self.c.close()`

			`def add_redirect(self,link):`
			`last_link = self.url`
			`if len(self.redirects) > 0:`
			`last_link = self.redirects[-1]`
			`v = urlunparse(normalize_link(link, last_link))`
			`if v!=last_link and v not in set(self.redirects):`
			`self.redirects.append(v)`

			`"""`
			`@returns content, link_status`
			`@throws pycurl.error`
			`"""`
			`def _download(self, url):`
			`print("Downloading " + url)`
			`self.url = url`
			`self.headers = {}`
			`self.redirects = []`
			`self.header_lines = []`
			`self.status = 0`
			`content = None`
			`link_status = "bad_connection"`
			`try:`
			`self.headers = {}`
			`del self.header_lines[:]`
			`content = tempfile.SpooledTemporaryFile()`
			`self.c.setopt(self.c.WRITEDATA, content)`
			`self.c.setopt(self.c.URL, url)`
			`self.c.perform()`
			`self.status = self.c.getinfo(self.c.RESPONSE_CODE)`
			`if self.status != 200:`
			`link_status = "bad_httpcode"`
			`elif "Content-Type" in self.headers and not self.headers["Content-Type"].startswith("text"):`
			`link_status = "bad_type"`
			`else:`
			`link_status = "good"`
			`content.seek(0)`
			`except pycurl.error as e:`
			`errno, message = e.args`
			`content = None`
			`self.status = self.c.getinfo(self.c.RESPONSE_CODE)`
			`if errno == 23:`
			`# 23 je zly content v header`
			`link_status = "bad_type"`
			`elif errno == 22:`
			`link_status = "bad_httpcode"`
zz 2020-05-09 09:50:50 +00:00			`elif errno == 28:`
			`# 28 je connection timeout`
			`link_status = "bad_connection"`
			`elif errno == 60:`
			`# 60 bad ssl certificate`
			`link_status = "bad_connection"`
zz 2020-05-10 09:48:17 +00:00			`elif errno == 56:`
			`# 56 Connection reset by peer`
			`link_status = "bad_connection"`
zz 2020-05-09 09:50:50 +00:00			`elif errno == 16:`
			`# 16 HTTP2`
			`link_status = "bad_connection"`
zz 2020-05-11 14:43:39 +00:00			`elif errno == 92:`
			`# 92 HTTP2 not closed`
			`link_status = "bad_connection"`
zz 2020-05-09 09:50:50 +00:00			`elif errno == 6:`
			`# 60 Unable to resolve dns`
			`link_status = "bad_connection"`
zz 2020-05-10 09:48:17 +00:00			`elif errno == 7:`
			`# 7 Connection refused`
			`link_status = "bad_connection"`
initial 2020-05-07 14:09:45 +00:00			`else:`
zz 2020-05-13 13:20:20 +00:00			`link_status = "bad_connection"`
			`#raise e`
initial 2020-05-07 14:09:45 +00:00			`except UnicodeDecodeError as e:`
			`content = None`
			`link_status = "bad_unicode"`
			`except UnicodeEncodeError as e:`
			`content = None`
			`link_status = "bad_unicode"`
			`sz = self.c.getinfo(self.c.SIZE_DOWNLOAD)`
			`tt = self.c.getinfo(self.c.TOTAL_TIME)`
			`print("{} Received {} bytes in {} s".format(self.status,sz,tt))`
			`return content, link_status`

			`# Throws pycurl.error`
			`def html_download2(self, url):`
			`dlink = url`
			`responses = []`
			`while len(responses) < 5:`
			`nl = normalize_link(dlink)`
			`url = urlunparse(nl)`
			`assert url.startswith("http")`
			`content, link_status = self._download(url)`
			`response = Response(url,"\r\n".join(self.header_lines),self.status,content,self.redirects,link_status)`
			`dlink = response.get_metarefresh()`
			`responses.append(response)`
			`if dlink is None:`
			`break`
			`return responses`

zz 2020-05-09 09:50:50 +00:00			`def cache_robot(self,domain):`
initial 2020-05-07 14:09:45 +00:00			`if domain not in self.robots:`
zz 2020-05-09 09:50:50 +00:00			`roboturl = urlunparse(("https", domain, "robots.txt", ""))`
initial 2020-05-07 14:09:45 +00:00			`try:`
			`r = self._download(roboturl)`
			`if r[1] == "good":`
			`c = r[0].read()`
			`lines = str(c, errors="ignore", encoding="utf8").split("\n")`
			`self.robots[domain] = urllib.robotparser.RobotFileParser()`
			`self.robots[domain].parse(lines)`
			`else:`
			`self.robots[domain] = None`
			`except pycurl.error as err:`
			`print(err)`
zz 2020-05-09 09:50:50 +00:00
			`def is_robot_good(self, url):`
			`schema, domain, path, query = normalize_link(url)`
			`self.cache_robot(domain)`
			`res = True`
initial 2020-05-07 14:09:45 +00:00			`if domain in self.robots and self.robots[domain] is not None:`
			`res = self.robots[domain].can_fetch("Agent", url)`
			`return res`

			`class ParsedDocument:`
fix 2021-01-20 11:06:03 +00:00			`"""`
			`One document in the database`
			`"""`
initial 2020-05-07 14:09:45 +00:00			`def __init__(self, parser,work_link):`
			`self.parser = parser`
			`self.work_link = work_link`

			`self.content = None`
			`self.bs = None`
			`self.paragraph_checksums = None`
			`self.paragraph_sizes = None`

			`self.link_set = set()`
			`self.body = None`
			`self.text_date = None`
			`self.tags = None`
			`self.authors = None`
			`self.title = None`
			`self.description = None`
			`self.section = None`
			`self.article_published_time = None`
			`self.current_time = datetime.date.today()`

			`def extract(self,content,bs):`
fix 2021-01-20 11:06:03 +00:00			`"""`
			`Parse content and fill the object`
			`"""`
initial 2020-05-07 14:09:45 +00:00			`self.content = content`
			`self.bs = bs`

			`# Extract text and metatext`
			`self.body, self.text_date = self.parser.extract_raw_text(content, self.current_time)`
			`# Paragraph Checksums`
			`pch,pszs = self.parser.calculate_checksums(self.body)`
			`self.paragraph_checksums = pch`
			`self.paragraph_sizes = pszs`
			`if bs is None:`
			`return`
zz 2020-05-10 09:48:17 +00:00			`self.tags,self.authors,self.title,self.article_publilshed_time, self.description,self.section = self.parser.extract_meta(bs)`
initial 2020-05-07 14:09:45 +00:00
			`# Extrakcia linkov zo stranky`
			`base = self.work_link`
			`if bs.base is not None and "href" in bs.base.attrs:`
			`base = bs.base["href"]`
			`# Normalizacia linkov`
			`for l in bs.find_all("a", href=True):`
			`if "rel" in l.attrs and l.attrs["rel"] == "nofollow" or "nofollow" in l.attrs:`
			`continue`
			`href = l["href"]`
			`try:`
			`nl = normalize_link(href, base)`
			`link = urlunparse(nl)`
			`if link == base:`
			`continue`
			`self.link_set.add(link)`
			`except ValueError:`
			`pass`

			`def get_links(self):`
fix 2021-01-20 11:06:03 +00:00			`"""`
			`@return all links`
			`"""`
initial 2020-05-07 14:09:45 +00:00			`return self.link_set`

			`def get_follow_links(self):`
fix 2021-01-20 11:06:03 +00:00			`"""`
			`@return good normalized links`
			`"""`
initial 2020-05-07 14:09:45 +00:00			`follow_links = set()`
			`for l in self.link_set:`
			`if self.parser.is_link_good(l):`
			`link = normalize_link(l,strip_query=self.parser.strip_query)`
			`follow_links.add(urlunparse(link))`
			`return follow_links`


			`def __str__(self):`
			`r = []`
			`if self.title is not None:`
			`r.append(self.title)`
			`if self.body is not None:`
			`if (len(self.body) < 20):`
			`r.append(self.body)`
			`else:`
zz 2021-01-20 08:56:53 +00:00			`r.append(self.body[0:20] + " ....")`
initial 2020-05-07 14:09:45 +00:00			`return ">>> ".join(r)`


			`def get_domains(arg):`
fix 2021-01-20 11:06:03 +00:00			`"""`
			`Get domains from argument or stdin`
			`if arg is -, get from stdin, else split arg`
			`@param arg dash or domains separated by comma`
			`@return domains`
			`"""`
initial 2020-05-07 14:09:45 +00:00			`domains = []`
			`if arg == "-":`
			`for l in sys.stdin:`
			`domain = l.rstrip()`
			`assert(domain is not None)`
			`if len(domain) == 0:`
			`continue`
			`domains.append(domain)`
			`else:`
			`domains = arg.split(",")`
			`return domains`

zz 2020-05-09 09:50:50 +00:00			`def parse_and_index(work_link,parser,responses,db):`
fix 2021-01-20 11:06:03 +00:00			`"""`
			`Take all responses from work link, parse and store in db`
			`@param work_link - final link from downloader`
			`@param parser to use`
			`@param responses from the downloader`
			`@param db`
			`"""`
zz 2020-05-09 09:50:50 +00:00			`target_link = work_link`
			`links = []`
			`if len(responses) > 0:`
			`db.index_responses(work_link,responses)`
			`lr = responses[-1]`
unvisited strategy 2020-06-06 09:29:36 +00:00			`if lr.bs is not None:`
zz 2020-05-09 09:50:50 +00:00			`target_link = lr.get_canonical()`
			`parsed = ParsedDocument(parser,target_link)`
			`parsed.extract(lr.content, lr.bs)`
			`db.index_content(target_link,parsed)`
			`links = parsed.get_links()`
			`return target_link,links`

			`def visit_sitemap(domain,connection,parser,db):`
fix 2021-01-20 11:06:03 +00:00			`"""`
			`get links from sitemap of the domain`
			`"""`
zz 2020-05-09 09:50:50 +00:00			`link = "http://" + domain`
unvisited strategy 2020-06-06 09:29:36 +00:00			`print("Sitemap visit: " + link)`
zz 2020-05-09 09:50:50 +00:00			`responses = connection.html_download2(link)`
			`if len(responses) == 0:`
			`return False`
			`lr = responses[-1]`
unvisited strategy 2020-06-06 09:29:36 +00:00			`if lr.bs is None:`
zz 2020-05-09 09:50:50 +00:00			`return False`

			`target_link,outlinks = parse_and_index(link,parser,responses,db)`
			`if len(outlinks) > 0:`
			`db.index_follow_links(parser,outlinks,connection)`
			`return True`


unvisited strategy 2020-06-06 09:29:36 +00:00			`def visit_links(links,connection,parser,db,is_online):`
fix 2021-01-20 11:06:03 +00:00			`"""`
			`if the site is not online, then just check links`
			`"""`
initial 2020-05-07 14:09:45 +00:00			`outlinks = []`
zz 2020-06-04 11:44:22 +00:00			`junklinks = []`
			`badrobotlinks = []`
initial 2020-05-07 14:09:45 +00:00			`for work_link in links:`
			`responses = []`
zz 2020-06-04 11:44:22 +00:00			`if not parser.is_link_good(work_link):`
unvisited strategy 2020-06-06 09:29:36 +00:00			`db.update_link_status(work_link,"bad_link")`
			`elif is_online and not connection.is_robot_good(work_link):`
			`db.update_link_status(work_link,"bad_robot")`
			`elif is_online:`
initial 2020-05-07 14:09:45 +00:00			`responses = connection.html_download2(work_link)`
zz 2020-05-09 09:50:50 +00:00			`target_link,links = parse_and_index(work_link,parser,responses,db)`
			`nl = normalize_link(target_link)`
			`connection.crawl_delay(nl[1])`
			`outlinks += links`
initial 2020-05-07 14:09:45 +00:00			`if len(outlinks) > 0:`
			`db.index_follow_links(parser,outlinks,connection)`

			`def visit_domain(domain,parser,db):`
fix 2021-01-20 11:06:03 +00:00			`"""`
			`One visit of the domain`

			`1.Get links from the frontpage,`
			`2. visit links and extract new links`
			`3. get new links to visit`
			`4. repeat visit for parser.crawl_rounds`
			`"""`
initial 2020-05-07 14:09:45 +00:00			`c = Connection()`
			`p = parser`
			`# Get links from frontpage`
			`# TODO Sitemap`
unvisited strategy 2020-06-06 09:29:36 +00:00			`is_online = False`
			`if parser.is_domain_good(domain):`
			`# Is domain online?`
			`is_online = visit_sitemap(domain,c,parser,db)`
zz 2020-06-08 14:09:47 +00:00			`if is_online:`
			`for i in range(p.crawl_rounds):`
			`# Visit links from frontpage`
			`links = db.get_visit_links(domain,p.recent_links,p.old_links,p.random_links)`
			`visit_links(links,c,p,db,is_online)`
			`db.check_domain(domain)`
			`else:`
initial 2020-05-07 14:09:45 +00:00			`db.check_domain(domain)`
zz 2020-05-09 09:50:50 +00:00			`return True`

			`def process_domains(domains,visit,parser,db,queue):`
fix 2021-01-20 11:06:03 +00:00			`"""`
			`Visit all domains in list.`
			`if queue is true, then queue domain instead immediate visit`
			`"""`
zz 2020-05-09 09:50:50 +00:00			`print("Websucker Agenda>>")`
zz 2020-06-08 14:09:47 +00:00			`random.shuffle(domains)`
zz 2020-05-09 09:50:50 +00:00			`for domain in domains:`
zz 2020-06-04 11:44:22 +00:00			`assert len(domain[0]) > 1`
zz 2020-05-09 09:50:50 +00:00			`print(domain)`
			`if queue is not None:`
			`print("Queuing:")`
			`for domain in domains:`
			`print(domain)`
			`queue.put(domain[0])`
zz 2020-06-04 11:44:22 +00:00			`queue.close()`
zz 2020-05-09 09:50:50 +00:00			`if visit:`
			`print("Visiting:")`
			`for domain in domains:`
			`print(domain)`
			`visit_domain(domain[0],parser,db)`

			`def work_domains(parser,db,queue):`
fix 2021-01-20 11:06:03 +00:00			`"""`
			`Poll the queue and visit`
			`"""`
zz 2020-05-09 09:50:50 +00:00			`while True:`
			`print("Waiting for a new job:")`
			`job = queue.reserve()`
			`domain = job.body`
			`queue.bury(job)`
			`print("Visiting:")`
			`visit_domain(domain,parser,db)`
			`queue.delete(job)`