Merge branch 'master' of git.kemt.fei.tuke.sk:dano/websucker-pip
commit 9bc2771e24
@@ -15,10 +15,13 @@ import logging as LOGGER
 import os
 import pprint
 import re
+import time
+import collections
+import math
 
 LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
 DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
-BATCHSIZE=os.getenv("SUCKER_BATCHSIZE",10)
+BATCHSIZE=os.getenv("SUCKER_BATCHSIZE",100)
 CONNECTION=os.getenv("SUCKER_CONNECTION","mongodb://root:example@localhost:27017/")
 DBNAME=os.getenv("SUCKER_DBNAME","crawler")
 MINFILESIZE=300
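A note on the changed default above: os.getenv returns a string whenever the variable is set in the environment, so BATCHSIZE is only an int when the 100 fallback applies. A minimal defensive sketch (an observation, not part of this commit):

    BATCHSIZE = int(os.getenv("SUCKER_BATCHSIZE", 100))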
@@ -107,6 +110,7 @@ def fetch_pages(link_batch):
         print(link)
         final_link = link
         response = trafilatura.fetch_url(link,decode=False)
+        time.sleep(2)
         html = None
         if response is not None :
             good = True
@@ -256,6 +260,8 @@ def extract_links(link_batch,responses,hostname,rules,default_status="frontlink"
 def index_links(db,extracted_links):
     linkcol=db["links"]
     for link,status in extracted_links:
+        if not is_link_good(link):
+            continue
         doc = get_link_doc(link,status)
         try:
             linkcol.insert_one(doc)
@@ -264,63 +270,142 @@ def index_links(db,extracted_links):
 
 def get_link_features(link):
     a, urlpath = courlan.get_host_and_path(link)
-    features = urlpath.split("/?-_")
-    if len(features) < 2:
-        return None
-    # drop last part
-    features = features[:-1]
-    return features
-
-
-def link_classifier(db,hostname,batch_size):
-    res = linkcol.aggregate([
-        { "$match": { "status": {"$not":{"$in":["frontlink","backlink"]}},"host":hostname } },
-        { "$sample": { "size": 2000 } }
-    ])
-    goodcounter = collections.Counter()
-    badcounter = collections.Counter()
-    for item in res:
-        link = res["url"]
-        state = res["status"]
-        cl = 0
-        if state == "good":
-            cl = 1
-        features = get_link_features(link)
-        if features is None:
-            continue
-        lf = len(features)
-        for feature in features:
-            if state == "good":
-                goodcounter[feature] += 1/lf
-            else:
-                badcounter[feature] += 1/lf
-        tf = goodcounter.keys() + bacounter.keys()
-        allcounter = collections.Counter()
-        for key in tf:
-            gc = goodcounter[key]
-            bc = badcounter[key]
-            p = gc / (gc + bc)
-            allcounter[key] = p
-        return allcounter
+    features = re.split("[/?&]",urlpath)
+    #features = re.split("[/?-_=]",urlpath)
+    res = []
+    for feature in features:
+        if len(feature) < 1:
+            continue
+        if feature.isdigit():
+            feature = "<NUM>"
+        res.append(feature)
+    if len(res) < 2:
+        return None
+    res = res[:-1]
+    print(res)
+    return res
+
+
+class LinkClassifier:
+    def __init__(self):
+
+        self.goodcounter = collections.Counter()
+        self.badcounter = collections.Counter()
+        self.good_count = 0
+        self.bad_count = 0
+        self.alpha = 0.001
+
+    def train(self,db,hostname):
+        linkcol = db["links"]
+        res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink","backlink"]}}})
+        testset = []
+        for i,item in enumerate(res):
+            link = item["url"]
+            state = item["status"]
+            cl = 0
+            if state == "good":
+                cl = 1
+            print(cl,state,link)
+            if i % 10 == 1:
+                testset.append((link,cl))
+                continue
+            features = get_link_features(link)
+            if features is None:
+                continue
+            lf = len(features)
+            if state == "good":
+                for feature in features:
+                    self.good_count += 1
+                    self.goodcounter[feature] += 1
+            else:
+                for feature in features:
+                    self.bad_count += 1
+                    self.badcounter[feature] += 1
+        self.bdictsize = len(self.badcounter)
+        self.gdictsize = len(self.goodcounter)
+        # eval
+        gg = 0
+        for l,cl in testset:
+            pcp = self.classify(l)
+            r = 0
+            if pcp > 0:
+                r = 1
+            if r == cl:
+                gg += 1
+            else:
+                print("MISS",l,cl,pcp)
+        print("Accuracy:")
+        print(len(testset))
+        print(gg / len(testset))
+
+    def classify(self,link):
+        features = get_link_features(link)
+        res = 0
+        gp = math.log(self.good_count) - math.log(self.good_count + self.bad_count)
+        bp = math.log(self.bad_count) - math.log(self.good_count + self.bad_count)
+        if features is None:
+            return math.exp(gp) - math.exp(bp)
+        gcc = math.log(self.gdictsize * self.alpha + self.good_count)
+        bcc = math.log(self.bdictsize * self.alpha + self.bad_count)
+        goodprob = 0
+        badprob = 0
+        for feature in features:
+            g = math.log((self.goodcounter[feature] + self.alpha)) - gcc
+            goodprob += g
+            b = math.log(self.badcounter[feature] + self.alpha) - bcc
+            badprob += b
+            print(feature,g,b)
+        if (goodprob + gp) > (badprob + bp):
+        #if goodprob > badprob:
+            res = 1
+        pa = math.exp(goodprob + gp)
+        pb = math.exp(badprob + bp)
+        return pa - pb
 
 
 def get_links(db,hostname,status,batch_size):
     linkcol = db["links"]
-    #res  = linkcol.find({"status":status,"host":hostname},{"url":1},limit=batch_size)
-    # get random links
+    # count downloaded links
     res = linkcol.aggregate([
-        { "$match": { "status": status,"host":hostname } },
-        { "$sample": { "size": batch_size } }
+        { "$match": { "status": {"$not":{"$in":["frontlink","backlink"]}},"host":hostname } },
+        {"$group":{"_id":None,
+                   "count":{"$count":{}},
+                   }
+         },
     ])
     links = set()
-    for i,doc in enumerate(res):
-        #print(">>>>>" + status)
-        #print(doc);
-        links.add(doc["url"])
-        if i >= batch_size:
-            break
+    out = list(res)
+    if len(out) == 0:
+        return list()
+    if out[0]["count"] < 200:
+        #res  = linkcol.find({"status":status,"host":hostname},{"url":1},limit=batch_size)
+        # get random links
+        res = linkcol.aggregate([
+            { "$match": { "status": status,"host":hostname } },
+            { "$sample": { "size": batch_size } }
+        ])
+        for i,doc in enumerate(res):
+            #print(">>>>>" + status)
+            #print(doc);
+            links.add(doc["url"])
+            if i >= batch_size:
+                break
+    else:
+        cl = LinkClassifier()
+        cl.train(db,hostname)
+        res = linkcol.aggregate([
+            { "$match": { "status": status,"host":hostname } },
+            { "$sample": { "size": batch_size * 100 } }
+        ])
+        outlinks = []
+        for i,doc in enumerate(res):
+            #print(">>>>>" + status)
+            #print(doc);
+            link = doc["url"]
+            outlinks.append((doc["url"],cl.classify(link)))
+        outlinks = sorted(outlinks, key=lambda x: x[1],reverse=True)
+        links = [l[0] for l in outlinks[0:batch_size]]
+        # todo remove very bad links
     return list(links)
 
 
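Taken together, the hunk above replaces the ad-hoc link_classifier with a small Naive Bayes model over URL-path tokens: get_link_features splits the path on /, ? and & and maps digit-only segments to "<NUM>", train counts tokens of links with a known good or bad status, and classify compares smoothed log-likelihoods plus class log-priors, returning the difference of the two posterior probabilities. A self-contained sketch of the same scoring on made-up counts (illustrative values only, not taken from the crawler):

    import collections
    import math

    # Toy counts standing in for LinkClassifier.goodcounter / badcounter.
    goodcounter = collections.Counter({"clanok": 8, "sport": 3, "<NUM>": 5})
    badcounter = collections.Counter({"tag": 6, "login": 4, "<NUM>": 2})
    good_count = sum(goodcounter.values())
    bad_count = sum(badcounter.values())
    alpha = 0.001  # additive smoothing, same constant as in the commit

    def score(features):
        # class log-priors
        gp = math.log(good_count) - math.log(good_count + bad_count)
        bp = math.log(bad_count) - math.log(good_count + bad_count)
        # smoothed per-token log-likelihoods, normalised as in classify()
        gcc = math.log(len(goodcounter) * alpha + good_count)
        bcc = math.log(len(badcounter) * alpha + bad_count)
        goodprob = sum(math.log(goodcounter[f] + alpha) - gcc for f in features)
        badprob = sum(math.log(badcounter[f] + alpha) - bcc for f in features)
        # positive means "good" is the more likely class, negative means "bad"
        return math.exp(goodprob + gp) - math.exp(badprob + bp)

    print(score(["clanok", "<NUM>"]))   # positive
    print(score(["login"]))             # negative

A link that never hit the threshold of 200 downloaded documents still falls back to random sampling in get_links; only above that threshold does the classifier rank a 100x oversampled batch and keep the best batch_size URLs.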
@@ -352,8 +437,19 @@ def link_summary(db,hostname):
         {"$match":{"host":hostname}},
         {"$group":{"_id":"$status","count":{"$sum":1}}},
     ])
+    badcount = 0
+    goodcount = 0
+    out = ["good","frontlink","backlink"]
+    info = {}
     for item in res:
-        print(item)
+        if item["_id"] not in out:
+            badcount += item["count"]
+        if item["_id"] == "good":
+            goodcount = item["count"]
+        info[item["_id"]] = item["count"]
+    good_prob = goodcount / (goodcount + badcount)
+    info["good_prob"] = good_prob
+    info["bad_documents"] = badcount
     print(">>>Domain Content")
     contentcol = db["content"]
     res = contentcol.aggregate([
@@ -364,8 +460,17 @@ def link_summary(db,hostname):
                    }
          },
     ])
+    text_size = 0
     for item in res:
-        print(item)
+        text_size = item["text_size_sum"]
+    good_document_characters = text_size / goodcount
+    fetch_average_characters = text_size / (goodcount + badcount)
+    info["total_good_characters"] = text_size
+    info["average_good_characters"] = good_document_characters
+    info["average_fetch_characters"] = fetch_average_characters
+    domaincol = db["domains"]
+    print(json.dumps(info))
+    domaincol.update_one({"host":hostname},{"$set":info},upsert=True)
 
 def domain_summary(db,hostname):
     linkcol = db["links"]
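The rewritten link_summary thus builds one info document per host, combining status counts, good_prob, and character statistics from the content collection, and upserts it into the domains collection. A small sketch of the derived fields on hypothetical counts (numbers chosen only to illustrate the arithmetic):

    # Hypothetical counts for one host, only to show how the fields are derived.
    goodcount, badcount = 80, 20   # good documents vs. other fetched statuses
    text_size = 240000             # total characters extracted from good documents

    info = {
        "good_prob": goodcount / (goodcount + badcount),                 # 0.8
        "bad_documents": badcount,
        "total_good_characters": text_size,
        "average_good_characters": text_size / goodcount,                # 3000.0
        "average_fetch_characters": text_size / (goodcount + badcount),  # 2400.0
    }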
@@ -395,6 +500,8 @@ def createdb():
     contentcol.create_index("host")
     htmlcol = db["html"]
     htmlcol.create_index("url",unique=True)
+    domaincol = db["domains"]
+    domaincol.create_index("host",unique=True)
 
 @cli.command()
 @click.argument("link")
@@ -427,6 +534,14 @@ def externaldomains(link):
     for d in domains:
         print(d)
 
+@cli.command()
+@click.argument("start_link")
+def classify(start_link):
+    myclient = pymongo.MongoClient(CONNECTION)
+    db=myclient[DBNAME]
+    start_link,hostname = courlan.check_url(start_link)
+    cl = LinkClassifier()
+    cl.train(db,hostname)
 
 @cli.command()
 @click.argument("start_link")
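The new classify subcommand trains the classifier for one host straight from the stored links. A sketch of driving it programmatically through click's test runner; the websucker.cli import path is an assumption, adjust it to wherever this cli group actually lives:

    from click.testing import CliRunner

    # Assumption: the click group defined in this file is importable as `cli`
    # from websucker.cli; the real module path may differ.
    from websucker.cli import cli

    runner = CliRunner()
    result = runner.invoke(cli, ["classify", "https://www.example.org/"])
    print(result.output)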
@@ -443,15 +558,19 @@ def visit(start_link):
         print("Fetching sitemap links")
         sitemap_links = fetch_sitemap_links(start_link)
         index_links(db,sitemap_links)
-    links.append(start_link)
+        links = get_links(db,hostname,"frontlink",batch_size)
+    links.insert(0,start_link)
+    if len(links) < batch_size:
+        back_links = get_links(db,hostname,"backlink",batch_size - len(links))
+        links += back_links
 
-    print("Processing frontlinks")
+    print("Processing links")
     rules = fetch_robot(hostname)
-    process_links(db,hostname,"frontlink",links,rules)
-    print("Getting  backlinks")
-    back_links = get_links(db,hostname,"backlink",batch_size)
-    print("Processing backlinks")
-    process_links(db,hostname,"backlink",back_links,rules=rules)
+    responses = fetch_pages(links)
+    extracted_pages = extract_pages(links,responses)
+    extracted_links = extract_links(links,responses,hostname,rules,"backlink")
+    index_links(db,extracted_links)
+    index_pages(db,hostname,extracted_pages)
     link_summary(db,hostname)
 
 if __name__ == "__main__":