Daniel Hládek 2023-04-06 12:15:33 +02:00
parent a26613ebb1
commit 9a9e8da4cf
2 changed files with 57 additions and 30 deletions

View File

@@ -19,6 +19,7 @@ import time
 import collections
 import math
 import random
+import hashlib
 
 LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
 DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
@@ -32,17 +33,15 @@ CHECK_PARAGRAPH_SIZE=150
 TEXT_TRASH_SIZE=200
 TEXT_TRASH_RATIO=0.6
 
-def put_queue(db,channel,message):
-    queuecol = db["queue"]
-    queuecol.insert_one({"channel":channel,"message":message,"created_at":datetime.utcnow(),"started_at":None})
-
-def reserve_queue(db,channel,message):
-    queuecol = db["queue"]
-    r = queuecol.find_one_and_delete({"channel":channel},sort={"created_at":-1})
-
-def delete_queue(db,channel):
-    queuecol = db["queue"]
-    pass
+def split_train(res):
+    trainset = []
+    testset = []
+    for i,item in enumerate(res):
+        if i % 10 == 0:
+            testset.append(item)
+        else:
+            trainset.append(item)
+    return trainset,testset
 
 def calculate_checksums(text):
     """
@@ -181,6 +180,7 @@ def index_pages(db,hostname,extracted_pages):
         text = doc["text"]
         checksums,sizes = calculate_checksums(text)
         doc["text_size"] = len(text)
+        doc["text_md5"] = hashlib.md5(text.encode("utf8")).hexdigest()
         doc["paragraph_checksums"] = checksums
         doc["paragraph_sizes"] = sizes
         goodsz = sum(sizes)
@@ -209,6 +209,7 @@ def index_pages(db,hostname,extracted_pages):
         htdoc = get_link_doc(link,state)
         htdoc["html"] = html
         htdoc["html_size"] = len(html)
+        htdoc["html_md5"]= hashlib.md5(html.encode("utf8")).hexdigest()
         # can be revisited - upsert
         del htdoc["url"]
         htmlcol.update_one({"url":link},{"$set":htdoc},upsert=True)
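Both new fields are plain MD5 digests of the document body, later used as deduplication keys in createdb(). A minimal sketch of the digest computation (the sample string is only illustrative):

import hashlib

text = "Ahoj svet"  # illustrative page text
text_md5 = hashlib.md5(text.encode("utf8")).hexdigest()
print(text_md5)  # 32-character hex string; identical text always yields the identical digest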
@@ -296,7 +297,6 @@ class LinkClassifier:
         self.alpha = 0.001
 
     def train(self,links):
-        testset = []
         for i,item in enumerate(links):
             link = item["url"]
             state = item["status"]
@@ -304,9 +304,6 @@ class LinkClassifier:
             if state == "good":
                 cl = 1
             print(cl,state,link)
-            if i % 10 == 1:
-                testset.append((link,cl))
-                continue
             features = get_link_features(link)
             if features is None:
                 continue
@@ -321,9 +318,15 @@ class LinkClassifier:
                 self.badcounter[feature] += 1
         self.bdictsize = len(self.badcounter)
         self.gdictsize = len(self.goodcounter)
+
+    def test(self,testset):
         # eval
         gg = 0
-        for l,cl in testset:
+        for item in testset:
+            l = item["url"]
+            cl = 0
+            if item["status"] == "good":
+                cl = 1
             pcp = self.classify(l)
             r = 0
             if pcp > 0:
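This hunk splits the old train() into training proper and a separate test() method that reads labels directly from link records instead of pre-built (link, class) tuples. The rest of the evaluation loop sits outside the hunk; a compressed, self-contained sketch of the accuracy computation it implements, where evaluate and classifier are illustrative stand-ins rather than repository names:

def evaluate(classifier, testset):
    # accuracy: share of test links whose predicted sign matches the stored "good"/"bad" status
    correct = 0
    for item in testset:
        expected = 1 if item["status"] == "good" else 0
        predicted = 1 if classifier.classify(item["url"]) > 0 else 0
        if predicted == expected:
            correct += 1
    return correct / len(testset) if testset else 0.0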
@@ -339,7 +342,7 @@ class LinkClassifier:
         return acc
 
     def classify(self,link):
-        if self.good_count + self.bad_count == 0:
+        if self.good_count == 0 or self.bad_count == 0:
             return random.uniform(-0.1,0.1)
         features = get_link_features(link)
         res = 0
@@ -352,17 +355,14 @@ class LinkClassifier:
         goodprob = 0
         badprob = 0
         for feature in features:
             g = math.log((self.goodcounter[feature] + self.alpha)) - gcc
             goodprob += g
             b = math.log(self.badcounter[feature] + self.alpha) - bcc
             badprob += b
             print(feature,g,b)
-        if (goodprob + gp) > (badprob + bp):
-        #if goodprob > badprob:
-            res = 1
         pa = math.exp(goodprob + gp)
         pb = math.exp(badprob + bp)
-        return pa - pb
+        return pa - pb #+ random.uniform(-0.001,0.001)
 
 
 def get_links(db,hostname,status,batch_size):
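classify() now returns the difference of two smoothed class probabilities instead of committing to a hard 0/1 decision, and it falls back to a small random score when either class has no training examples. The surrounding definitions of gcc, bcc, gp and bp are not part of this hunk, so the sketch below fills them in with standard naive-Bayes log-count estimates; treat those formulas, and the sample counters, as assumptions rather than the repository's exact code:

import math
from collections import Counter

def score(features, goodcounter, badcounter, good_count, bad_count, alpha=0.001):
    # smoothed log-likelihood of the feature set under the "good" and "bad" models
    gcc = math.log(sum(goodcounter.values()) + alpha * len(goodcounter))
    bcc = math.log(sum(badcounter.values()) + alpha * len(badcounter))
    gp = math.log(good_count / (good_count + bad_count))  # class priors; caller must ensure non-zero counts
    bp = math.log(bad_count / (good_count + bad_count))
    goodprob = sum(math.log(goodcounter[f] + alpha) - gcc for f in features)
    badprob = sum(math.log(badcounter[f] + alpha) - bcc for f in features)
    return math.exp(goodprob + gp) - math.exp(badprob + bp)  # positive means "good" is more likely

# illustrative counters built from two-word URL features
good = Counter({"example com": 3, "com article": 2})
bad = Counter({"example com": 1, "com login": 4})
print(score(["example com", "com article"], good, bad, good_count=5, bad_count=5))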
@@ -445,8 +445,9 @@ def link_summary(db,hostname):
     info["average_good_characters"] = good_document_characters
     info["average_fetch_characters"] = fetch_average_characters
     domaincol = db["domain"]
-    print(info)
     domaincol.update_one({"host":hostname},{"$set":info},upsert=True)
+    res = domaincol.find_one({"host":hostname})
+    print(res)
 
 def sample_links(db,hostname,status,batch_size):
     print("Getting backlinks")
@@ -455,11 +456,12 @@ def sample_links(db,hostname,status,batch_size):
     cl = LinkClassifier()
     crawled_links = list(res)
     crawled_count = len(crawled_links)
-    min_train_size = 200
     prediction_accuracy = 0
-    if crawled_count > min_train_size:
+    if crawled_count > 200:
         # train on crawled links
-        prediction_accuracy = cl.train(crawled_links)
+        trainset,testset = split_train(crawled_links)
+        cl.train(trainset)
+        prediction_accuracy = cl.test(testset)
     sample_set_size = 10000
     res = linkcol.find({"host":hostname,"status": status},limit = sample_set_size)
     sample_links = []
@@ -467,12 +469,15 @@ def sample_links(db,hostname,status,batch_size):
     for item in res:
     for item in res:
         cll = cl.classify(item["url"])
+        cll += random.uniform(-0.1,0.1)
         sample_links.append((item["url"],cll))
         if cll > 0:
             predicted_good += 1
     # TODO frontlinks are not unique!
     sample_links.sort(key=lambda x: x[1],reverse=True)
-    predicted_good_prob = predicted_good / len(sample_links)
+    predicted_good_prob = 0
+    if len(sample_links) > 0:
+        predicted_good_prob = predicted_good / len(sample_links)
     domaincol = db["domain"]
     info = {
         "predicted_good_prob":predicted_good_prob,
@@ -507,11 +512,13 @@ def createdb():
     linkcol.create_index("url",unique=True)
     linkcol.create_index("host")
     contentcol = db["content"]
-    contentcol.create_index("url",unique=True)
+    contentcol.create_index("url")
+    contentcol.create_index("text_md5",unique=True)
     #contentcol.create_index({"paragraph_checksums":1})
     contentcol.create_index("host")
     htmlcol = db["html"]
-    htmlcol.create_index("url",unique=True)
+    htmlcol.create_index("url")
+    htmlcol.create_index("html_md5",unique=True)
     domaincol = db["domains"]
     domaincol.create_index("host",unique=True)
@@ -553,7 +560,12 @@ def classify(start_link):
     db=myclient[DBNAME]
     start_link,hostname = courlan.check_url(start_link)
     cl = LinkClassifier()
-    cl.train(db,hostname)
+    linkcol = db["links"]
+    res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink","backlink"]}}})
+    trainset, testset = split_train(res)
+    cl.train(trainset)
+    cl.test(testset)
+
 
 @cli.command()
 @click.argument("start_link")

View File

@ -8,3 +8,18 @@ mycol = mydb["customers"]
mydict = {"text":"ahoj svet"} mydict = {"text":"ahoj svet"}
x = mycol.insert_one(mydict) x = mycol.insert_one(mydict)
def createdb():
myclient = pymongo.MongoClient(CONNECTION)
db=myclient[DBNAME]
linkcol = db["links"]
linkcol.create_index("url",unique=True)
linkcol.create_index("host")
contentcol = db["content"]
contentcol.create_index("url",unique=True)
#contentcol.create_index({"paragraph_checksums":1})
contentcol.create_index("host")
htmlcol = db["html"]
htmlcol.create_index("url")
domaincol = db["domains"]
domaincol.create_index("host",unique=True)