Merge branch 'master' of git.kemt.fei.tuke.sk:dano/websucker-pip
This commit is contained in:
commit 9bc2771e24
@@ -15,10 +15,13 @@ import logging as LOGGER
import os
import pprint
import re
import time
import collections
import math

LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
BATCHSIZE=os.getenv("SUCKER_BATCHSIZE",10)
BATCHSIZE=os.getenv("SUCKER_BATCHSIZE",100)
CONNECTION=os.getenv("SUCKER_CONNECTION","mongodb://root:example@localhost:27017/")
DBNAME=os.getenv("SUCKER_DBNAME","crawler")
MINFILESIZE=300
@@ -107,6 +110,7 @@ def fetch_pages(link_batch):
        print(link)
        final_link = link
        response = trafilatura.fetch_url(link,decode=False)
        time.sleep(2)
        html = None
        if response is not None :
            good = True
@@ -256,6 +260,8 @@ def extract_links(link_batch,responses,hostname,rules,default_status="frontlink"
def index_links(db,extracted_links):
    linkcol=db["links"]
    for link,status in extracted_links:
        if not is_link_good(link):
            continue
        doc = get_link_doc(link,status)
        try:
            linkcol.insert_one(doc)
@@ -264,63 +270,142 @@ def index_links(db,extracted_links):

def get_link_features(link):
    a, urlpath = courlan.get_host_and_path(link)
    features = urlpath.split("/?-_")
    if len(features) < 2:
    features = re.split("[/?&]",urlpath)
    #features = re.split("[/?-_=]",urlpath)
    res = []
    for feature in features:
        if len(feature) < 1:
            continue
        if feature.isdigit():
            feature = "<NUM>"
        res.append(feature)
    if len(res) < 2:
        return None
    # drop last part
    features = features[:-1]
    return features
    res = res[:-1]
    print(res)
    return res

class LinkClassifier:
    def __init__(self):

def link_classifier(db,hostname,batch_size):
    res = linkcol.aggregate([
        { "$match": { "status": {"$not":{"$in":["frontlink","backlink"]}},"host":hostname } },
        { "$sample": { "size": 2000 } }
    ])
    goodcounter = collections.Counter()
    badcounter = collections.Counter()
    for item in res:
        link = res["url"]
        state = res["status"]
        self.goodcounter = collections.Counter()
        self.badcounter = collections.Counter()
        self.good_count = 0
        self.bad_count = 0
        self.alpha = 0.001

    def train(self,db,hostname):
        linkcol = db["links"]
        res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink","backlink"]}}})
        testset = []
        for i,item in enumerate(res):
            link = item["url"]
            state = item["status"]
            cl = 0
            if state == "good":
                cl = 1
            print(cl,state,link)
            if i % 10 == 1:
                testset.append((link,cl))
                continue
            features = get_link_features(link)
            if features is None:
                continue
            lf = len(features)
        for feature in features:
            if state == "good":
                goodcounter[feature] += 1/lf
                for feature in features:
                    self.good_count += 1
                    self.goodcounter[feature] += 1
            else:
                badcounter[feature] += 1/lf
        tf = goodcounter.keys() + bacounter.keys()
        allcounter = collections.Counter()
        for key in tf:
            gc = goodcounter[key]
            bc = badcounter[key]
            p = gc / (gc + bc)
            allcounter[key] = p
        return allcounter
                for feature in features:
                    self.bad_count += 1
                    self.badcounter[feature] += 1
        self.bdictsize = len(self.badcounter)
        self.gdictsize = len(self.goodcounter)
        # eval
        gg = 0
        for l,cl in testset:
            pcp = self.classify(l)
            r = 0
            if pcp > 0:
                r = 1
            if r == cl:
                gg += 1
            else:
                print("MISS",l,cl,pcp)
        print("Accuracy:")
        print(len(testset))
        print(gg / len(testset))

    def classify(self,link):
        features = get_link_features(link)
        res = 0
        gp = math.log(self.good_count) - math.log(self.good_count + self.bad_count)
        bp = math.log(self.bad_count) - math.log(self.good_count + self.bad_count)
        if features is None:
            return math.exp(gp) - math.exp(bp)
        gcc = math.log(self.gdictsize * self.alpha + self.good_count)
        bcc = math.log(self.bdictsize * self.alpha + self.bad_count)
        goodprob = 0
        badprob = 0
        for feature in features:
            g = math.log((self.goodcounter[feature] + self.alpha)) - gcc
            goodprob += g
            b = math.log(self.badcounter[feature] + self.alpha) - bcc
            badprob += b
            print(feature,g,b)
        if (goodprob + gp)  > (badprob + bp):
        #if goodprob > badprob:
            res = 1
        pa = math.exp(goodprob + gp)
        pb = math.exp(badprob + bp)
        return pa - pb



def get_links(db,hostname,status,batch_size):
    linkcol = db["links"]
    # count downloaded links
    res = linkcol.aggregate([
        { "$match": { "status": {"$not":{"$in":["frontlink","backlink"]}},"host":hostname } },
        {"$group":{"_id":None,
                   "count":{"$count":{}},
                   }
         },
    ])
    links = set()
    out = list(res)
    if len(out) == 0:
        return list()
    if out[0]["count"] < 200:
    #res  = linkcol.find({"status":status,"host":hostname},{"url":1},limit=batch_size)
        # get random links
        res = linkcol.aggregate([
            { "$match": { "status": status,"host":hostname } },
            { "$sample": { "size": batch_size } }
        ])
    links = set()
        for i,doc in enumerate(res):
            #print(">>>>>" + status)
            #print(doc);
            links.add(doc["url"])
            if i >= batch_size:
                break
    else:
        cl = LinkClassifier()
        cl.train(db,hostname)
        res = linkcol.aggregate([
            { "$match": { "status": status,"host":hostname } },
            { "$sample": { "size": batch_size * 100 } }
        ])
        outlinks = []
        for i,doc in enumerate(res):
            #print(">>>>>" + status)
            #print(doc);
            link = doc["url"]
            outlinks.append((doc["url"],cl.classify(link)))
        outlinks = sorted(outlinks, key=lambda x: x[1],reverse=True)
        links = [l[0] for l in outlinks[0:batch_size]]
        # todo remove very bad links
    return list(links)


@@ -352,8 +437,19 @@ def link_summary(db,hostname):
        {"$match":{"host":hostname}},
        {"$group":{"_id":"$status","count":{"$sum":1}}},
    ])
    badcount = 0
    goodcount = 0
    out = ["good","frontlink","backlink"]
    info = {}
    for item in res:
        print(item)
        if item["_id"] not in out:
            badcount += item["count"]
        if item["_id"] == "good":
            goodcount = item["count"]
        info[item["_id"]] = item["count"]
    good_prob = goodcount / (goodcount + badcount)
    info["good_prob"] = good_prob
    info["bad_documents"] = badcount
    print(">>>Domain Content")
    contentcol = db["content"]
    res = contentcol.aggregate([
@@ -364,8 +460,17 @@ def link_summary(db,hostname):
                   }
         },
    ])
    text_size = 0
    for item in res:
        print(item)
        text_size = item["text_size_sum"]
    good_document_characters = text_size / goodcount
    fetch_average_characters = text_size / (goodcount +  badcount)
    info["total_good_characters"] = text_size
    info["average_good_characters"] = good_document_characters
    info["average_fetch_characters"] = fetch_average_characters
    domaincol = db["domain"]
    print(json.dumps(info))
    domaincol.update_one({"host":domain},{"$set":info},usert=True)

def domain_summary(db,hostname):
    linkcol = db["links"]
@@ -395,6 +500,8 @@ def createdb():
    contentcol.create_index("host")
    htmlcol = db["html"]
    htmlcol.create_index("url",unique=True)
    domaincol = db["domains"]
    domaincol.create_index("host",unique=True)

@cli.command()
@click.argument("link")
@@ -427,6 +534,14 @@ def externaldomains(link):
    for d in domains:
        print(d)

@cli.command()
@click.argument("start_link")
def classify(start_link):
    myclient = pymongo.MongoClient(CONNECTION)
    db=myclient[DBNAME]
    start_link,hostname = courlan.check_url(start_link)
    cl = LinkClassifier()
    cl.train(db,hostname)

@cli.command()
@click.argument("start_link")
@@ -443,15 +558,19 @@ def visit(start_link):
        print("Fetching sitemap links")
        sitemap_links = fetch_sitemap_links(start_link)
        index_links(db,sitemap_links)
    links.append(start_link)
        links = get_links(db,hostname,"frontlink",batch_size)
    links.insert(0,start_link)
    if len(links) < batch_size:
        back_links = get_links(db,hostname,"backlink",batch_size - len(links))
        links += back_links

    print("Processing frontlinks")
    print("Processing links")
    rules = fetch_robot(hostname)
    process_links(db,hostname,"frontlink",links,rules)
    print("Getting  backlinks")
    back_links = get_links(db,hostname,"backlink",batch_size)
    print("Processing backlinks")
    process_links(db,hostname,"backlink",back_links,rules=rules)
    responses = fetch_pages(links)
    extracted_pages = extract_pages(links,responses)
    extracted_links = extract_links(links,responses,hostname,rules,"backlink")
    index_links(db,extracted_links)
    index_pages(db,hostname,extracted_pages)
    link_summary(db,hostname)

if __name__ == "__main__":
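Below is a brief usage sketch, not part of the commit, showing how the LinkClassifier added in this diff can be exercised on its own. It assumes the names defined in the changed file (LinkClassifier, CONNECTION, DBNAME) are in scope, a MongoDB instance is reachable at CONNECTION, and the chosen host already has crawled links with "good"/"bad" statuses in the "links" collection; the hostname and URL below are illustrative.

import pymongo

# Connect the same way the new `classify` CLI command does.
myclient = pymongo.MongoClient(CONNECTION)
db = myclient[DBNAME]

# Train the Naive Bayes link classifier on one host's already-labelled links
# (train() also prints a held-out accuracy estimate), then score an unseen URL.
cl = LinkClassifier()
cl.train(db, "www.example.sk")
score = cl.classify("https://www.example.sk/clanok/123")
print(score)  # classify() returns good-score minus bad-score; positive means "good" is more likely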