zz

parent a26613ebb1
commit 9a9e8da4cf
@@ -19,6 +19,7 @@ import time
 import collections
 import math
 import random
+import hashlib
 
 LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
 DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
@@ -32,17 +33,15 @@ CHECK_PARAGRAPH_SIZE=150
 TEXT_TRASH_SIZE=200
 TEXT_TRASH_RATIO=0.6
 
-def put_queue(db,channel,message):
-    queuecol = db["queue"]
-    queuecol.insert_one({"channel":channel,"message":message,"created_at":datetime.utcnow(),"started_at":None})
-
-def reserve_queue(db,channel,message):
-    queuecol = db["queue"]
-    r = queuecol.find_one_and_delete({"channel":channel},sort={"created_at":-1})
-
-def delete_queue(db,channel):
-    queuecol = db["queue"]
-    pass
+def split_train(res):
+    trainset = []
+    testset = []
+    for i,item in enumerate(res):
+        if i % 10 == 0:
+            testset.append(item)
+        else:
+            trainset.append(item)
+    return trainset,testset
 
 def calculate_checksums(text):
     """
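
Note on the new split_train helper above: it deals every tenth item (i % 10 == 0) to the test set and the rest to the training set, a deterministic 90/10 split. A minimal sketch of its behaviour, with made-up link dicts (the real items come from MongoDB):

    def split_train(res):
        trainset = []
        testset = []
        for i, item in enumerate(res):
            if i % 10 == 0:
                testset.append(item)   # every 10th item held out for testing
            else:
                trainset.append(item)
        return trainset, testset

    links = [{"url": "https://example.com/%d" % i, "status": "good"} for i in range(20)]
    trainset, testset = split_train(links)
    print(len(trainset), len(testset))  # 18 2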
@@ -181,6 +180,7 @@ def index_pages(db,hostname,extracted_pages):
         text = doc["text"]
         checksums,sizes = calculate_checksums(text)
         doc["text_size"] = len(text)
+        doc["text_md5"] = hashlib.md5(text.encode("utf8")).hexdigest()
         doc["paragraph_checksums"] = checksums
         doc["paragraph_sizes"] = sizes
         goodsz = sum(sizes)
@@ -209,6 +209,7 @@ def index_pages(db,hostname,extracted_pages):
         htdoc = get_link_doc(link,state)
         htdoc["html"] = html
         htdoc["html_size"] = len(html)
+        htdoc["html_md5"]= hashlib.md5(html.encode("utf8")).hexdigest()
         # can be revisited - upsert
         del htdoc["url"]
         htmlcol.update_one({"url":link},{"$set":htdoc},upsert=True)
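
The two new *_md5 fields in the hunks above hash the full text and HTML so exact duplicates can be detected cheaply. A quick sketch of the idea: identical input always yields the same 32-character hex digest, so the digest can serve as a dedup key.

    import hashlib

    a = hashlib.md5("ahoj svet".encode("utf8")).hexdigest()
    b = hashlib.md5("ahoj svet".encode("utf8")).hexdigest()
    assert a == b      # same text, same digest -> usable as a dedup key
    print(len(a))      # 32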
@@ -296,7 +297,6 @@ class LinkClassifier:
         self.alpha = 0.001
 
     def train(self,links):
-        testset = []
         for i,item in enumerate(links):
             link = item["url"]
             state = item["status"]
@@ -304,9 +304,6 @@ class LinkClassifier:
             if state == "good":
                 cl = 1
             print(cl,state,link)
-            if i % 10 == 1:
-                testset.append((link,cl))
-                continue
             features = get_link_features(link)
             if features is None:
                 continue
@@ -321,9 +318,15 @@ class LinkClassifier:
                     self.badcounter[feature] += 1
         self.bdictsize = len(self.badcounter)
         self.gdictsize = len(self.goodcounter)
+
+    def test(self,testset):
         # eval
         gg = 0
-        for l,cl in testset:
+        for item in testset:
+            l = item["url"]
+            cl = 0
+            if item["status"] == "good":
+                cl = 1
             pcp = self.classify(l)
             r = 0
             if pcp > 0:
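
The evaluation now lives in the new test() method rather than inside train(): the score from classify() is thresholded at zero and compared with a 0/1 label derived from item["status"]. A standalone sketch of that accuracy computation (the function name and sample data are mine, not from the diff):

    def accuracy(scored):
        gg = 0                       # count of correct predictions
        for pcp, cl in scored:
            r = 1 if pcp > 0 else 0  # threshold the classifier score at zero
            if r == cl:
                gg += 1
        return gg / len(scored)

    print(accuracy([(0.4, 1), (-0.2, 0), (0.1, 0)]))  # 2 of 3 correct -> 0.666...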
@@ -339,7 +342,7 @@ class LinkClassifier:
         return acc
 
     def classify(self,link):
-        if self.good_count + self.bad_count == 0:
+        if self.good_count == 0 or self.bad_count == 0:
             return random.uniform(-0.1,0.1)
         features = get_link_features(link)
         res = 0
@@ -352,17 +355,14 @@ class LinkClassifier:
         goodprob = 0
         badprob = 0
         for feature in features:
             g = math.log((self.goodcounter[feature] + self.alpha)) - gcc
             goodprob += g
             b = math.log(self.badcounter[feature] + self.alpha) - bcc
             badprob += b
             print(feature,g,b)
-        if (goodprob + gp) > (badprob + bp):
-        #if goodprob > badprob:
-            res = 1
         pa = math.exp(goodprob + gp)
         pb = math.exp(badprob + bp)
-        return pa - pb
+        return pa - pb #+ random.uniform(-0.001,0.001)
 
 
 def get_links(db,hostname,status,batch_size):
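
classify() is a Laplace-smoothed naive Bayes scorer: each feature contributes log(count + alpha) minus a per-class normalizer, and the change above returns the raw score difference pa - pb instead of a hard 0/1 decision. A self-contained sketch, under the assumption (from the surrounding code) that gcc/bcc are the log totals of the class counters and gp/bp are log class priors; the feature strings are invented:

    import math
    from collections import Counter

    alpha = 0.001
    goodcounter = Counter({"wiki": 5, "clanok": 3})   # invented feature counts
    badcounter = Counter({"login": 4, "img": 6})
    gcc = math.log(sum(goodcounter.values()) + alpha * len(goodcounter))
    bcc = math.log(sum(badcounter.values()) + alpha * len(badcounter))

    features = ["wiki", "clanok"]
    goodprob = sum(math.log(goodcounter[f] + alpha) - gcc for f in features)
    badprob = sum(math.log(badcounter[f] + alpha) - bcc for f in features)
    print(goodprob > badprob)   # True: both features occur only in the good class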
@@ -445,8 +445,9 @@ def link_summary(db,hostname):
     info["average_good_characters"] = good_document_characters
     info["average_fetch_characters"] = fetch_average_characters
     domaincol = db["domain"]
-    print(info)
     domaincol.update_one({"host":hostname},{"$set":info},upsert=True)
+    res = domaincol.find_one({"host":hostname})
+    print(res)
 
 def sample_links(db,hostname,status,batch_size):
     print("Getting backlinks")
@@ -455,11 +456,12 @@ def sample_links(db,hostname,status,batch_size):
     cl = LinkClassifier()
     crawled_links = list(res)
     crawled_count = len(crawled_links)
-    min_train_size = 200
     prediction_accuracy = 0
-    if crawled_count > min_train_size:
+    if crawled_count > 200:
         # train on crawled links
-        prediction_accuracy = cl.train(crawled_links)
+        trainset,testset = split_train(crawled_links)
+        cl.train(trainset)
+        prediction_accuracy = cl.test(testset)
     sample_set_size = 10000
     res = linkcol.find({"host":hostname,"status": status},limit = sample_set_size)
     sample_links = []
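
Taken together, the retraining path above becomes: split the crawled links 90/10, fit on the training part, and report held-out accuracy instead of training-set accuracy. Run inside this module (it relies on the file's own split_train and LinkClassifier), the flow would look roughly like this sketch with invented statuses:

    crawled_links = [{"url": "https://example.com/%d" % i,
                      "status": "good" if i % 3 else "bad"}   # invented labels
                     for i in range(300)]
    trainset, testset = split_train(crawled_links)
    cl = LinkClassifier()
    cl.train(trainset)
    prediction_accuracy = cl.test(testset)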
@@ -467,12 +469,15 @@ def sample_links(db,hostname,status,batch_size):
     for item in res:
         for item in res:
             cll = cl.classify(item["url"])
+            cll += random.uniform(-0.1,0.1)
             sample_links.append((item["url"],cll))
             if cll > 0:
                 predicted_good += 1
     # TODO frontlinks are not unique!
     sample_links.sort(key=lambda x: x[1],reverse=True)
-    predicted_good_prob = predicted_good / len(sample_links)
+    predicted_good_prob = 0
+    if len(sample_links) > 0:
+        predicted_good_prob = predicted_good / len(sample_links)
     domaincol = db["domain"]
     info = {
         "predicted_good_prob":predicted_good_prob,
@@ -507,11 +512,13 @@ def createdb():
     linkcol.create_index("url",unique=True)
     linkcol.create_index("host")
     contentcol = db["content"]
-    contentcol.create_index("url",unique=True)
+    contentcol.create_index("url")
+    contentcol.create_index("text_md5",unique=True)
     #contentcol.create_index({"paragraph_checksums":1})
     contentcol.create_index("host")
     htmlcol = db["html"]
-    htmlcol.create_index("url",unique=True)
+    htmlcol.create_index("url")
+    htmlcol.create_index("html_md5",unique=True)
     domaincol = db["domains"]
     domaincol.create_index("host",unique=True)
 
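
The index changes move uniqueness from url to the new digest fields, so deduplication happens at the database level: a second document with the same text_md5 is rejected outright. A hedged sketch with pymongo (connection string, database, and document values are illustrative, not from the diff):

    import pymongo
    from pymongo.errors import DuplicateKeyError

    client = pymongo.MongoClient("mongodb://localhost:27017/")
    col = client["testdb"]["content"]
    col.create_index("text_md5", unique=True)
    col.insert_one({"url": "https://example.com/a", "text_md5": "abc"})
    try:
        col.insert_one({"url": "https://example.com/b", "text_md5": "abc"})
    except DuplicateKeyError:
        print("duplicate text rejected by the unique index")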
@@ -553,7 +560,12 @@ def classify(start_link):
     db=myclient[DBNAME]
     start_link,hostname = courlan.check_url(start_link)
     cl = LinkClassifier()
-    cl.train(db,hostname)
+    linkcol = db["links"]
+    res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink","backlink"]}}})
+    trainset, testset = split_train(res)
+
+    cl.train(trainset)
+    cl.test(testset)
 
 @cli.command()
 @click.argument("start_link")

@@ -8,3 +8,18 @@ mycol = mydb["customers"]
 mydict = {"text":"ahoj svet"}
 
 x = mycol.insert_one(mydict)
+
+def createdb():
+    myclient = pymongo.MongoClient(CONNECTION)
+    db=myclient[DBNAME]
+    linkcol = db["links"]
+    linkcol.create_index("url",unique=True)
+    linkcol.create_index("host")
+    contentcol = db["content"]
+    contentcol.create_index("url",unique=True)
+    #contentcol.create_index({"paragraph_checksums":1})
+    contentcol.create_index("host")
+    htmlcol = db["html"]
+    htmlcol.create_index("url")
+    domaincol = db["domains"]
+    domaincol.create_index("host",unique=True)