fix
This commit is contained in:
parent e287236033
commit fb0c1d6eb7

@@ -283,6 +283,9 @@ class Connection:
        return res

class ParsedDocument:
    """
    One document in the database
    """
    def __init__(self, parser,work_link):
        self.parser = parser
        self.work_link = work_link
@@ -304,6 +307,9 @@ class ParsedDocument:
        self.current_time = datetime.date.today()

    def extract(self,content,bs):
        """
        Parse content and fill the object
        """
        self.content = content
        self.bs = bs

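For context, a hypothetical usage sketch (not part of this commit): it assumes bs is a BeautifulSoup tree built from the same HTML, which is what extract() stores alongside the raw content.

    import bs4

    html = "<html><body><a href='/next'>next</a></body></html>"
    doc = ParsedDocument(parser, "http://example.com/page")    # parser comes from this project
    doc.extract(html, bs4.BeautifulSoup(html, "html.parser"))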
@@ -336,9 +342,15 @@ class ParsedDocument:
            pass

    def get_links(self):
        """
        @return all links
        """
        return self.link_set

    def get_follow_links(self):
        """
        @return good normalized links
        """
        follow_links = set()
        for l in self.link_set:
            if self.parser.is_link_good(l):
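The hunk cuts off inside the loop. A hedged sketch of how such a filter typically completes; only is_link_good appears in this diff, the normalize call is an assumption:

    follow_links = set()
    for l in self.link_set:
        if self.parser.is_link_good(l):
            follow_links.add(self.parser.normalize(l))  # hypothetical normalizer
    return follow_links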
@@ -360,6 +372,12 @@ class ParsedDocument:


def get_domains(arg):
    """
    Get domains from the argument or from stdin:
    if arg is "-", read from stdin, else split arg on commas.
    @param arg - dash, or domains separated by commas
    @return list of domains
    """
    domains = []
    if arg == "-":
        for l in sys.stdin:
@@ -373,6 +391,13 @@ def get_domains(arg):
    return domains
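A quick illustration of the contract described in the docstring:

    get_domains("example.com,example.org")  # -> ["example.com", "example.org"]
    get_domains("-")                        # reads one domain per line from sys.stdin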

def parse_and_index(work_link,parser,responses,db):
    """
    Take all responses from the work link, parse them and store in db
    @param work_link - final link from the downloader
    @param parser - parser to use
    @param responses - responses from the downloader
    @param db - database to store the parsed document in
    """
    target_link = work_link
    links = []
    if len(responses) > 0:
@@ -387,6 +412,9 @@ def parse_and_index(work_link,parser,responses,db):
    return target_link,links

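A minimal usage sketch, based only on the signature and the return statement above; responses is whatever the downloader produced for work_link:

    target_link, links = parse_and_index(work_link, parser, responses, db)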
def visit_sitemap(domain,connection,parser,db):
    """
    Get links from the sitemap of the domain
    """
    link = "http://" + domain
    print("Sitemap visit: " + link)
    responses = connection.html_download2(link)
@@ -403,7 +431,9 @@ def visit_sitemap(domain,connection,parser,db):


def visit_links(links,connection,parser,db,is_online):
    # if it is not online, then just check the links
    """
    If the site is not online, then just check the links
    """
    outlinks = []
    junklinks = []
    badrobotlinks = []
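The body of the loop is outside this hunk. A hedged sketch of one plausible way the three buckets declared above get filled; check_robot is a hypothetical helper, only is_link_good is visible in this diff:

    for link in links:
        if not parser.is_link_good(link):
            junklinks.append(link)
        elif not check_robot(link):          # hypothetical robots.txt check
            badrobotlinks.append(link)
        else:
            outlinks.append(link)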
@@ -423,6 +453,14 @@ def visit_links(links,connection,parser,db,is_online):
    db.index_follow_links(parser,outlinks,connection)

def visit_domain(domain,parser,db):
    """
    One visit of the domain

    1. Get links from the frontpage
    2. Visit links and extract new links
    3. Get new links to visit
    4. Repeat the visit for parser.crawl_rounds rounds (sketched below)
    """
    c = Connection()
    p = parser
    # Get links from the frontpage
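A hedged sketch of how steps 1-4 of the docstring could drive the crawl; get_visit_links stands in for whatever step 3 actually calls, and the return value of visit_sitemap is an assumption:

    links = visit_sitemap(domain, c, p, db)     # 1. seed links from the frontpage
    for _ in range(p.crawl_rounds):             # 4. repeat for parser.crawl_rounds
        visit_links(links, c, p, db, True)      # 2. visit links, extract new ones
        links = get_visit_links(domain, db)     # 3. hypothetical: new links to visit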
@@ -442,6 +480,10 @@ def visit_domain(domain,parser,db):
    return True

def process_domains(domains,visit,parser,db,queue):
    """
    Visit all domains in the list.
    If queue is true, queue each domain instead of visiting it immediately.
    """
    print("Websucker Agenda>>")
    random.shuffle(domains)
    for domain in domains:
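A hedged sketch of the per-domain branch the docstring describes; queue.put is an assumption (consistent with the beanstalk-style queue.reserve() later in this diff), and domain[0] mirrors the call visible in the next hunk:

    for domain in domains:
        if queue:
            queue.put(domain[0])                 # hypothetical enqueue of the domain name
        else:
            visit_domain(domain[0], parser, db)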
@@ -460,6 +502,9 @@ def process_domains(domains,visit,parser,db,queue):
        visit_domain(domain[0],parser,db)

def work_domains(parser,db,queue):
    """
    Poll the queue and visit the reserved domains
    """
    while True:
        print("Waiting for a new job:")
        job = queue.reserve()
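The hunk ends at the reserve call. A hedged sketch of how a beanstalk-style worker loop typically finishes; job.body and job.delete() are assumptions based on that API, not shown in this commit:

    while True:
        print("Waiting for a new job:")
        job = queue.reserve()
        visit_domain(job.body, parser, db)   # assumed: the job body holds the domain
        job.delete()                         # acknowledge so the job is not redelivered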