1 changed files with 6 additions and 28 deletions
--- a/mongo/mongocwarler.py
+++ b/mongo/mongocwarler.py
@ -18,11 +18,10 @@ import re
 import time
 import collections
 import math
 import json
 LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
 DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
-BATCHSIZE=os.getenv("SUCKER_BATCHSIZE",10)
+BATCHSIZE=os.getenv("SUCKER_BATCHSIZE",100)
 CONNECTION=os.getenv("SUCKER_CONNECTION","mongodb://root:example@localhost:27017/")
 DBNAME=os.getenv("SUCKER_DBNAME","crawler")
 MINFILESIZE=300
@ -56,7 +55,7 @@ def calculate_checksums(text):
    for c in text:
        cv = ord(c)
        sz += 1
-        if cv > 64:  # ignore non-ascii
+        if cv > 64:
            hval += (hval << 3) + cv
            zv = hval >> 31
            hval &= 0x7fffffff
@ -406,7 +405,7 @@ def get_links(db,hostname,status,batch_size):
            outlinks.append((doc["url"],cl.classify(link)))
        outlinks = sorted(outlinks, key=lambda x: x[1],reverse=True)
        links = [l[0] for l in outlinks[0:batch_size]]
-        # todo remove very bad links from database
+        # todo remove very bad links
    return list(links)
@ -464,35 +463,14 @@ def link_summary(db,hostname):
    text_size = 0
    for item in res:
        text_size = item["text_size_sum"]
-    good_document_characters = 0
+    good_document_characters = text_size / goodcount
    if goodcount > 0:
        good_document_characters = text_size / goodcount
    fetch_average_characters = text_size / (goodcount +  badcount)
    info["total_good_characters"] = text_size
    info["average_good_characters"] = good_document_characters
    info["average_fetch_characters"] = fetch_average_characters
    domaincol = db["domain"]
-    if goodcount + badcount > 100: 
+    print(json.dumps(info))
-        cl = LinkClassifier()
+    domaincol.update_one({"host":domain},{"$set":info},usert=True)
        cl.train(db,hostname)
        res = linkcol.aggregate([
            { "$match": { "status": "backlink","host":hostname } },
            { "$sample": { "size": BATCHSIZE * 100 } }
        ])
        predicted_good = 0
        predicted_bad = 0
        for item in res:
            cll = cl.classify(item["url"])
            if cll > 0:
                predicted_good += 1
            else:
                predicted_bad += 1
        predicted_good_prob = 0
        if predicted_good + predicted_bad > 0:
            predicted_good_prob = predicted_good / (predicted_good + predicted_bad)
        info["predicted_good_prob"] = predicted_good_prob
    print(info)
    domaincol.update_one({"host":hostname},{"$set":info},upsert=True)
 def domain_summary(db,hostname):
    linkcol = db["links"]