unvisited strategy

parent 3687403184
commit 75e1b0cd6d
@@ -40,9 +40,12 @@ class Response:
         self.redirects = redirects
         self.visited_time = datetime.date.today()
         self.bs = None
-        self.link_status = link_status
         if content is not None and link_status == "good":
-            self.bs = bs4.BeautifulSoup(content, "lxml")
+            try:
+                self.bs = bs4.BeautifulSoup(content, "lxml")
+            except ValueError:
+                link_status = "bad_parse"
+        self.link_status = link_status
 
     def __str__(self):
         return "{} {} {}".format(self.url,self.get_canonical(),self.link_status)
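
Note: the hunk above defers the link_status assignment in Response.__init__ until after parsing, so a parse failure can downgrade the status to "bad_parse". A minimal standalone sketch of the same pattern (the function name is hypothetical; as in the diff, only bs4 with the lxml backend is assumed):

    import bs4

    def parse_if_good(content, link_status):
        # Mirrors the changed constructor: parse only content that arrived with a
        # "good" status, and downgrade the status if BeautifulSoup raises the
        # ValueError the commit anticipates.
        bs = None
        if content is not None and link_status == "good":
            try:
                bs = bs4.BeautifulSoup(content, "lxml")
            except ValueError:
                link_status = "bad_parse"
        return bs, link_status
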
@@ -374,7 +377,7 @@ def parse_and_index(work_link,parser,responses,db):
     if len(responses) > 0:
         db.index_responses(work_link,responses)
         lr = responses[-1]
-        if lr.content is not None:
+        if lr.bs is not None:
             target_link = lr.get_canonical()
             parsed = ParsedDocument(parser,target_link)
             parsed.extract(lr.content, lr.bs)
@@ -384,11 +387,12 @@ def parse_and_index(work_link,parser,responses,db):
 
 def visit_sitemap(domain,connection,parser,db):
     link = "http://" + domain
+    print("Sitemap visit: " + link)
     responses = connection.html_download2(link)
     if len(responses) == 0:
         return False
     lr = responses[-1]
-    if lr.link_status.startswith("bad_"):
+    if lr.bs is None:
         return False
 
     target_link,outlinks = parse_and_index(link,parser,responses,db)
@@ -397,17 +401,18 @@ def visit_sitemap(domain,connection,parser,db):
     return True
 
 
-def visit_links(links,connection,parser,db):
+def visit_links(links,connection,parser,db,is_online):
+    # if it is not online, then just check links
     outlinks = []
     junklinks = []
     badrobotlinks = []
     for work_link in links:
         responses = []
         if not parser.is_link_good(work_link):
-            db.update_link_status(parser,work_link,"junk")
-        elif connection.is_robot_good(work_link):
-            db.update_link_status(parser,work_link,"bad_robot")
-        else:
+            db.update_link_status(work_link,"bad_link")
+        elif is_online and not connection.is_robot_good(work_link):
+            db.update_link_status(work_link,"bad_robot")
+        elif is_online:
             responses = connection.html_download2(work_link)
             target_link,links = parse_and_index(work_link,parser,responses,db)
             nl = normalize_link(target_link)
@@ -421,13 +426,14 @@ def visit_domain(domain,parser,db):
     p = parser
     # Get links from frontpage
     # TODO Sitemap
-    res = visit_sitemap(domain,c,parser,db)
-    if not res:
-        return False
+    is_online = False
+    if parser.is_domain_good(domain):
+        # Is domain online?
+        is_online = visit_sitemap(domain,c,parser,db)
     for i in range(p.crawl_rounds):
         # Visit links from frontpage
         links = db.get_visit_links(domain,p.recent_links,p.old_links,p.random_links)
-        visit_links(links,c,p,db)
+        visit_links(links,c,p,db,is_online)
         db.check_domain(domain)
     return True
 
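
Note: the two hunks above thread a new is_online flag from visit_domain into visit_links, so a domain whose name fails is_domain_good or whose sitemap visit fails is no longer downloaded; its stored links are only re-classified. A condensed, self-contained restatement of the new branching (the helper name is hypothetical; parser, connection and db stand for the objects used in the diff):

    def classify_and_maybe_fetch(work_link, parser, connection, db, is_online):
        # Junk links are re-labelled even when the domain is offline.
        if not parser.is_link_good(work_link):
            db.update_link_status(work_link, "bad_link")
            return []
        # robots.txt is consulted only for online domains.
        if is_online and not connection.is_robot_good(work_link):
            db.update_link_status(work_link, "bad_robot")
            return []
        # Downloading happens only when the domain is online.
        if is_online:
            return connection.html_download2(work_link)
        return []
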
@@ -5,6 +5,8 @@ import os
 import pkg_resources
 import datetime
 from websucker.parser import normalize_link,urlunparse
+import collections
+import math
 
 VERSION = "sucker6"
 
@@ -140,14 +142,16 @@ INSERT INTO content(
     def index_responses(self,source_link,responses):
         # Redirect links
         pl = normalize_link(source_link)
+        domain = pl[1]
+        npl = urlunparse(pl)
         for response in responses:
             tl = response.get_canonical()
-            if pl != tl:
-                self.update_link_status(source_link,"redirect",tl)
+            if npl != tl:
+                self.update_link_status(npl,"redirect",tl)
             d = (
-                pl[1],
-                source_link,
-                response.get_canonical(),
+                domain,
+                npl,
+                tl,
                 response.redirects,
                 response.status,
                 response.headers,
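
Note: index_responses now builds npl, the re-serialized form of the normalized source link, and uses it for both the redirect check and the stored row. An illustration of the distinction, assuming normalize_link behaves like urllib's urlparse plus cleanup and that element [1] of its result is the host (which is how the diff uses it):

    from urllib.parse import urlparse, urlunparse

    # Stand-ins for pl = normalize_link(source_link) and npl = urlunparse(pl).
    pl = urlparse("http://example.com/a")   # tuple-like parse result
    domain = pl[1]                          # "example.com"
    npl = urlunparse(pl)                    # "http://example.com/a", a plain string
    # Comparing npl (a string) with the response's canonical URL is meaningful;
    # the old comparison of the tuple pl against that string would always come
    # out unequal, so every response would have been flagged as a redirect.
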
@@ -166,8 +170,16 @@ INSERT INTO content(
         fd = 0
         jd = 0
         rows = self.session.execute("SELECT domain_name,good_size,content_size, fetched_count,seen_count FROM domain_quality PER PARTITION LIMIT 1")
+        # TODO subdomain analysis
+        #dd = collections.defaultdict(set)
         for row in rows:
-            if not parser.is_domain_good(row[0]):
+            domain = row[0]
+            #subdomains = domain.split(".")
+            #d2 = subdomains[-2] + "." + subdomains[-1]
+            #if len(subdomains) > 2:
+            #    d3 = ".".join(subdomains[0:-2])
+            #    dd[d2].add(d3)
+            if not parser.is_domain_good(domain):
                 jd += 1
             if row[1] is not None:
                 gs += row[1]
@@ -191,6 +203,9 @@ INSERT INTO content(
         print("Junk domains: {}".format(jd))
         print("New links : {}".format(sl))
         print("Finished domains : {}".format(fd))
+        #for d,sd in dd.items():
+        #    if len(sd) > 1:
+        #        print(d + " " + ",".join(sd))
 
     def daily_report(self):
         #rows = self.session.execute(self.daily_links_select)
@@ -222,8 +237,8 @@ INSERT INTO content(
 
         print("{} domains, {} documents, {} characters ".format(len(domains),total_count,total_size))
 
-    def update_link_status(self,parser,links,status,redirect_target=None):
-        pl = normalize_link(source_link)
+    def update_link_status(self,link,status,redirect_target=None):
+        pl = normalize_link(link)
         r = (
             status,
             redirect_target,
@@ -244,6 +259,7 @@ INSERT INTO content(
                 follow_links.add(urlunparse(link))
 
         newlinkdomains = set()
+        newlinkcount = 0
         for link in follow_links:
             value = []
             nl = normalize_link(link)
@@ -253,8 +269,10 @@ INSERT INTO content(
             row = rows.one()
             if row.applied:
                 newlinkdomains.add(nl[1])
+                newlinkcount += 1
         for domain in newlinkdomains:
             self.check_domain(domain)
+        print("{} new links, {} new domains".format(newlinkcount,len(newlinkdomains)))
 
 
     def index_content(self,target_link,parsed_document):
@@ -306,7 +324,6 @@ INSERT INTO content(
             originality = self.check_document(pd.paragraph_checksums,pd.paragraph_sizes)
             if originality < 0.8:
                 link_status = "bad_copy"
-        print(nl)
         self.session.execute(self.index_content_links_update,(link_status,originality,tsz,nl[0],nl[1],nl[2],nl[3]))
         content_future.result()
         print("<<<< " + link_status + " " + str(originality))
@@ -428,7 +445,6 @@ INSERT INTO content(
         domain)
         if fetched_count > 0 or seen_count > 0:
             self.session.execute(self.domain_quality_update,uv)
-        print(uv)
         return average_fetched_good_characters
 
     def all_domains(self,count):
@@ -476,16 +492,34 @@ INSERT INTO content(
         # get all domains
         rows = self.session.execute(self.domains_select)
         domains = []
+        # Analyze third level domains
+        dd = collections.defaultdict(set)
+        third_count = 0
         for row in rows:
             domain = row[0]
             seen_count = row[1]
             fetched_count = row[2]
             gain_ratio = row[3]
             afg = row[4]
-            if seen_count and not fetched_count and parser.is_domain_good(domain):
-                domains.append((domain,0))
-        ss = min(len(domains),count)
-        return random.sample(domains,ss)
+            if seen_count and not fetched_count:
+                subdomains = domain.split(".")
+                d2 = subdomains[-2] + "." + subdomains[-1]
+                dd[d2].add(domain)
+        # Select second level first
+        result = []
+        # then select third level
+        ll = list(dd.items())
+        random.shuffle(ll)
+        domain_weight = count / len(ll)
+        for domain,subdomains in ll:
+            dl = list(subdomains)
+            link_weight = domain_weight / len(dl)
+            random.shuffle(dl)
+            for d in dl:
+                r = random.random()
+                if r < link_weight:
+                    result.append((d,0))
+        return result
 
     def get_visit_links(self,domain,recent_count,old_count,random_count):
         dblinks = []
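
Note: the hunk above implements the "unvisited strategy" of the commit title. Never-fetched domains are grouped by their second-level domain, and each host is then kept with probability (count / number of groups) / (hosts in its group), so one second-level domain with many third-level subdomains cannot dominate the sample and the expected result size stays roughly count. A standalone sketch of the same scheme (function name and example hosts are hypothetical):

    import collections
    import random

    def sample_unvisited(hosts, count):
        # Group hosts by second-level domain: "blog.example.com" -> "example.com".
        dd = collections.defaultdict(set)
        for host in hosts:
            parts = host.split(".")
            dd[parts[-2] + "." + parts[-1]].add(host)
        if not dd:
            return []  # guard added for the standalone sketch
        groups = list(dd.items())
        random.shuffle(groups)
        domain_weight = count / len(groups)        # budget per second-level domain
        result = []
        for _, members in groups:
            members = list(members)
            random.shuffle(members)
            link_weight = domain_weight / len(members)   # budget per host
            for host in members:
                if random.random() < link_weight:
                    result.append((host, 0))
        return result

    # Example: the two example.com hosts together carry the same expected
    # weight as the single other.org host.
    print(sample_unvisited(["a.example.com", "b.example.com", "other.org"], 2))
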
@@ -113,7 +113,7 @@ class BaseParser:
             r = "Port in domain"
         elif len(domain) < 4:
             r = "Too short domain"
-        elif len(domain) > 50:
+        elif len(domain) > 127:
             r = "Too long location"
         elif domain.startswith(".") or domain.endswith("."):
             r = "Malformed domain"
@@ -152,16 +152,8 @@ class BaseParser:
                 return False
             for c in link:
                 if ord(c) >= 128:
-                    r = "Bad domain character"
+                    r = "Bad link character"
                     break
-            for p in self.skipdomains:
-                if domain.endswith(p):
-                    r = "Bad domain"
-                    break
-            if ".b-" in domain:
-                r = "Bad domain"
-            if len(domain) > 127:
-                r = "Too long path"
             # Path
             for t in self.skiptypes:
                 if path.lower().endswith(t):