diff --git a/mongo/mongocwarler.py b/mongo/mongocwarler.py index 1e5882e..17e4553 100644 --- a/mongo/mongocwarler.py +++ b/mongo/mongocwarler.py @@ -18,10 +18,11 @@ import re import time import collections import math +import json LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk") DOMAIN = os.getenv("SUCKER_DOMAIN","sk") -BATCHSIZE=os.getenv("SUCKER_BATCHSIZE",100) +BATCHSIZE=int(os.getenv("SUCKER_BATCHSIZE",10)) CONNECTION=os.getenv("SUCKER_CONNECTION","mongodb://root:example@localhost:27017/") DBNAME=os.getenv("SUCKER_DBNAME","crawler") MINFILESIZE=300 @@ -405,7 +406,7 @@ def get_links(db,hostname,status,batch_size): outlinks.append((doc["url"],cl.classify(link))) outlinks = sorted(outlinks, key=lambda x: x[1],reverse=True) links = [l[0] for l in outlinks[0:batch_size]] - # todo remove very bad links + # todo remove very bad links from database return list(links) @@ -463,14 +464,35 @@ def link_summary(db,hostname): text_size = 0 for item in res: text_size = item["text_size_sum"] - good_document_characters = text_size / goodcount + good_document_characters = 0 + if goodcount > 0: + good_document_characters = text_size / goodcount fetch_average_characters = text_size / (goodcount + badcount) info["total_good_characters"] = text_size info["average_good_characters"] = good_document_characters info["average_fetch_characters"] = fetch_average_characters domaincol = db["domain"] - print(json.dumps(info)) - domaincol.update_one({"host":domain},{"$set":info},usert=True) + if goodcount + badcount > 100: + cl = LinkClassifier() + cl.train(db,hostname) + res = linkcol.aggregate([ + { "$match": { "status": "backlink","host":hostname } }, + { "$sample": { "size": BATCHSIZE * 100 } } + ]) + predicted_good = 0 + predicted_bad = 0 + for item in res: + cll = cl.classify(item["url"]) + if cll > 0: + predicted_good += 1 + else: + predicted_bad += 1 + predicted_good_prob = 0 + if predicted_good + predicted_bad > 0: + predicted_good_prob = predicted_good / (predicted_good + predicted_bad) + 
info["predicted_good_prob"] = predicted_good_prob + print(info) + domaincol.update_one({"host":hostname},{"$set":info},upsert=True) def domain_summary(db,hostname): linkcol = db["links"]