diff --git a/mongo/mongocwarler.py b/mongo/mongocwarler.py index f22903e..db781a0 100644 --- a/mongo/mongocwarler.py +++ b/mongo/mongocwarler.py @@ -19,6 +19,7 @@ import time import collections import math import random +import hashlib LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk") DOMAIN = os.getenv("SUCKER_DOMAIN","sk") @@ -32,17 +33,15 @@ CHECK_PARAGRAPH_SIZE=150 TEXT_TRASH_SIZE=200 TEXT_TRASH_RATIO=0.6 -def put_queue(db,channel,message): - queuecol = db["queue"] - queuecol.insert_one({"channel":channel,"message":message,"created_at":datetime.utcnow(),"started_at":None}) - -def reserve_queue(db,channel,message): - queuecol = db["queue"] - r = queuecol.find_one_and_delete({"channel":channel},sort={"created_at":-1}) - -def delete_queue(db,channel): - queuecol = db["queue"] - pass +def split_train(res): + trainset = [] + testset = [] + for i,item in enumerate(res): + if i % 10 == 0: + testset.append(item) + else: + trainset.append(item) + return trainset,testset def calculate_checksums(text): """ @@ -181,6 +180,7 @@ def index_pages(db,hostname,extracted_pages): text = doc["text"] checksums,sizes = calculate_checksums(text) doc["text_size"] = len(text) + doc["text_md5"] = hashlib.md5(text.encode("utf8")).hexdigest() doc["paragraph_checksums"] = checksums doc["paragraph_sizes"] = sizes goodsz = sum(sizes) @@ -209,6 +209,7 @@ def index_pages(db,hostname,extracted_pages): htdoc = get_link_doc(link,state) htdoc["html"] = html htdoc["html_size"] = len(html) + htdoc["html_md5"]= hashlib.md5(html.encode("utf8")).hexdigest() # can be revisited - upsert del htdoc["url"] htmlcol.update_one({"url":link},{"$set":htdoc},upsert=True) @@ -296,7 +297,6 @@ class LinkClassifier: self.alpha = 0.001 def train(self,links): - testset = [] for i,item in enumerate(links): link = item["url"] state = item["status"] @@ -304,9 +304,6 @@ class LinkClassifier: if state == "good": cl = 1 print(cl,state,link) - if i % 10 == 1: - testset.append((link,cl)) - continue features = get_link_features(link) if features is None: continue @@ -321,9 +318,15 @@ class LinkClassifier: self.badcounter[feature] += 1 self.bdictsize = len(self.badcounter) self.gdictsize = len(self.goodcounter) + + def test(self,testset): # eval gg = 0 - for l,cl in testset: + for item in testset: + l = item["url"] + cl = 0 + if item["status"] == "good": + cl = 1 pcp = self.classify(l) r = 0 if pcp > 0: @@ -339,7 +342,7 @@ class LinkClassifier: return acc def classify(self,link): - if self.good_count + self.bad_count == 0: + if self.good_count == 0 or self.bad_count == 0: return random.uniform(-0.1,0.1) features = get_link_features(link) res = 0 @@ -352,17 +355,14 @@ class LinkClassifier: goodprob = 0 badprob = 0 for feature in features: - g = math.log((self.goodcounter[feature] + self.alpha)) - gcc + g = math.log((self.goodcounter[feature] + self.alpha)) - gcc goodprob += g b = math.log(self.badcounter[feature] + self.alpha) - bcc badprob += b print(feature,g,b) - if (goodprob + gp) > (badprob + bp): - #if goodprob > badprob: - res = 1 pa = math.exp(goodprob + gp) pb = math.exp(badprob + bp) - return pa - pb + return pa - pb #+ random.uniform(-0.001,0.001) def get_links(db,hostname,status,batch_size): @@ -445,8 +445,9 @@ def link_summary(db,hostname): info["average_good_characters"] = good_document_characters info["average_fetch_characters"] = fetch_average_characters domaincol = db["domain"] - print(info) domaincol.update_one({"host":hostname},{"$set":info},upsert=True) + res = domaincol.find_one({"host":hostname}) + print(res) def sample_links(db,hostname,status,batch_size): print("Getting backlinks") @@ -455,11 +456,12 @@ def sample_links(db,hostname,status,batch_size): cl = LinkClassifier() crawled_links = list(res) crawled_count = len(crawled_links) - min_train_size = 200 prediction_accuracy = 0 - if crawled_count > min_train_size: + if crawled_count > 200: # train on crawled links - prediction_accuracy = cl.train(crawled_links) + trainset,testset = split_train(crawled_links) + cl.train(trainset) + prediction_accuracy = cl.test(testset) sample_set_size = 10000 res = linkcol.find({"host":hostname,"status": status},limit = sample_set_size) sample_links = [] @@ -467,12 +469,15 @@ def sample_links(db,hostname,status,batch_size): for item in res: for item in res: cll = cl.classify(item["url"]) + cll += random.uniform(-0.1,0.1) sample_links.append((item["url"],cll)) if cll > 0: predicted_good += 1 # TODO frontlinks are not unique! sample_links.sort(key=lambda x: x[1],reverse=True) - predicted_good_prob = predicted_good / len(sample_links) + predicted_good_prob = 0 + if len(sample_links) > 0: + predicted_good_prob = predicted_good / len(sample_links) domaincol = db["domain"] info = { "predicted_good_prob":predicted_good_prob, @@ -507,11 +512,13 @@ def createdb(): linkcol.create_index("url",unique=True) linkcol.create_index("host") contentcol = db["content"] - contentcol.create_index("url",unique=True) + contentcol.create_index("url") + contentcol.create_index("text_md5",unique=True) #contentcol.create_index({"paragraph_checksums":1}) contentcol.create_index("host") htmlcol = db["html"] - htmlcol.create_index("url",unique=True) + htmlcol.create_index("url") + htmlcol.create_index("html_md5",unique=True) domaincol = db["domains"] domaincol.create_index("host",unique=True) @@ -553,7 +560,12 @@ def classify(start_link): db=myclient[DBNAME] start_link,hostname = courlan.check_url(start_link) cl = LinkClassifier() - cl.train(db,hostname) + linkcol = db["links"] + res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink","backlink"]}}}) + trainset, testset = split_train(res) + + cl.train(trainset) + cl.test(testset) @cli.command() @click.argument("start_link") diff --git a/mongo/mongoindexer.py b/mongo/mongoindexer.py index 3b5f6e5..323e321 100644 --- a/mongo/mongoindexer.py +++ b/mongo/mongoindexer.py @@ -8,3 +8,18 @@ mycol = mydb["customers"] mydict = {"text":"ahoj svet"} x = mycol.insert_one(mydict) + +def createdb(): + myclient = pymongo.MongoClient(CONNECTION) + db=myclient[DBNAME] + linkcol = db["links"] + linkcol.create_index("url",unique=True) + linkcol.create_index("host") + contentcol = db["content"] + contentcol.create_index("url",unique=True) + #contentcol.create_index({"paragraph_checksums":1}) + contentcol.create_index("host") + htmlcol = db["html"] + htmlcol.create_index("url") + domaincol = db["domains"] + domaincol.create_index("host",unique=True)