This commit is contained in:
Daniel Hládek 2023-04-05 05:44:09 +02:00
parent 9bc2771e24
commit deced51d48

View File

@ -18,10 +18,11 @@ import re
import time import time
import collections import collections
import math import math
import json
# Runtime configuration, overridable through SUCKER_* environment variables.
LANGUAGE = os.getenv("SUCKER_LANGUAGE", "sk")
DOMAIN = os.getenv("SUCKER_DOMAIN", "sk")
# os.getenv() returns a str whenever the variable is set, so coerce to int:
# BATCHSIZE is multiplied (e.g. BATCHSIZE * 100 for the $sample size) and used
# as a slice bound, both of which misbehave with a string value.
BATCHSIZE = int(os.getenv("SUCKER_BATCHSIZE", "10"))
# NOTE(review): the default URI embeds credentials; presumably meant for a
# local development database only -- confirm before deploying.
CONNECTION = os.getenv("SUCKER_CONNECTION", "mongodb://root:example@localhost:27017/")
DBNAME = os.getenv("SUCKER_DBNAME", "crawler")
# NOTE(review): unit (bytes vs. characters) is not visible here -- confirm.
MINFILESIZE = 300
@ -405,7 +406,7 @@ def get_links(db,hostname,status,batch_size):
outlinks.append((doc["url"],cl.classify(link))) outlinks.append((doc["url"],cl.classify(link)))
outlinks = sorted(outlinks, key=lambda x: x[1],reverse=True) outlinks = sorted(outlinks, key=lambda x: x[1],reverse=True)
links = [l[0] for l in outlinks[0:batch_size]] links = [l[0] for l in outlinks[0:batch_size]]
# todo remove very bad links # todo remove very bad links from database
return list(links) return list(links)
@ -463,14 +464,35 @@ def link_summary(db,hostname):
text_size = 0 text_size = 0
for item in res: for item in res:
text_size = item["text_size_sum"] text_size = item["text_size_sum"]
good_document_characters = 0
if goodcount > 0:
good_document_characters = text_size / goodcount good_document_characters = text_size / goodcount
fetch_average_characters = text_size / (goodcount + badcount) fetch_average_characters = text_size / (goodcount + badcount)
info["total_good_characters"] = text_size info["total_good_characters"] = text_size
info["average_good_characters"] = good_document_characters info["average_good_characters"] = good_document_characters
info["average_fetch_characters"] = fetch_average_characters info["average_fetch_characters"] = fetch_average_characters
domaincol = db["domain"] domaincol = db["domain"]
print(json.dumps(info)) if goodcount + badcount > 100:
domaincol.update_one({"host":domain},{"$set":info},usert=True) cl = LinkClassifier()
cl.train(db,hostname)
res = linkcol.aggregate([
{ "$match": { "status": "backlink","host":hostname } },
{ "$sample": { "size": BATCHSIZE * 100 } }
])
predicted_good = 0
predicted_bad = 0
for item in res:
cll = cl.classify(item["url"])
if cll > 0:
predicted_good += 1
else:
predicted_bad += 1
predicted_good_prob = 0
if predicted_good + predicted_bad > 0:
predicted_good_prob = predicted_good / (predicted_good + predicted_bad)
info["predicted_good_prob"] = predicted_good_prob
print(info)
domaincol.update_one({"host":hostname},{"$set":info},upsert=True)
def domain_summary(db,hostname): def domain_summary(db,hostname):
linkcol = db["links"] linkcol = db["links"]