Compare commits
No commits in common. "deced51d48592d59f3015a77abc8d32efc090173" and "22ae282674f2fbbaadc88cf25e39b9d92c62cf9e" have entirely different histories.
deced51d48
...
22ae282674
@ -18,11 +18,10 @@ import re
|
|||||||
import time
|
import time
|
||||||
import collections
|
import collections
|
||||||
import math
|
import math
|
||||||
import json
|
|
||||||
|
|
||||||
LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
|
LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
|
||||||
DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
|
DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
|
||||||
# os.getenv returns a str whenever SUCKER_BATCHSIZE is set, while the fallback
# is an int; convert so BATCHSIZE is always an int (e.g. for `BATCHSIZE * 100`).
# A non-numeric value raises ValueError at import time instead of failing later.
BATCHSIZE=int(os.getenv("SUCKER_BATCHSIZE",10))
|
# os.getenv returns a str whenever SUCKER_BATCHSIZE is set, while the fallback
# is an int; convert so BATCHSIZE always has a single, numeric type.
# A non-numeric value raises ValueError at import time instead of failing later.
BATCHSIZE=int(os.getenv("SUCKER_BATCHSIZE",100))
|
||||||
CONNECTION=os.getenv("SUCKER_CONNECTION","mongodb://root:example@localhost:27017/")
|
CONNECTION=os.getenv("SUCKER_CONNECTION","mongodb://root:example@localhost:27017/")
|
||||||
DBNAME=os.getenv("SUCKER_DBNAME","crawler")
|
DBNAME=os.getenv("SUCKER_DBNAME","crawler")
|
||||||
MINFILESIZE=300
|
MINFILESIZE=300
|
||||||
@ -56,7 +55,7 @@ def calculate_checksums(text):
|
|||||||
for c in text:
|
for c in text:
|
||||||
cv = ord(c)
|
cv = ord(c)
|
||||||
sz += 1
|
sz += 1
|
||||||
if cv > 64: # ignore non-ascii
|
if cv > 64:
|
||||||
hval += (hval << 3) + cv
|
hval += (hval << 3) + cv
|
||||||
zv = hval >> 31
|
zv = hval >> 31
|
||||||
hval &= 0x7fffffff
|
hval &= 0x7fffffff
|
||||||
@ -406,7 +405,7 @@ def get_links(db,hostname,status,batch_size):
|
|||||||
outlinks.append((doc["url"],cl.classify(link)))
|
outlinks.append((doc["url"],cl.classify(link)))
|
||||||
outlinks = sorted(outlinks, key=lambda x: x[1],reverse=True)
|
outlinks = sorted(outlinks, key=lambda x: x[1],reverse=True)
|
||||||
links = [l[0] for l in outlinks[0:batch_size]]
|
links = [l[0] for l in outlinks[0:batch_size]]
|
||||||
# todo remove very bad links from database
|
# todo remove very bad links
|
||||||
return list(links)
|
return list(links)
|
||||||
|
|
||||||
|
|
||||||
@ -464,35 +463,14 @@ def link_summary(db,hostname):
|
|||||||
text_size = 0
|
text_size = 0
|
||||||
for item in res:
|
for item in res:
|
||||||
text_size = item["text_size_sum"]
|
text_size = item["text_size_sum"]
|
||||||
good_document_characters = 0
|
|
||||||
if goodcount > 0:
|
|
||||||
good_document_characters = text_size / goodcount
|
good_document_characters = text_size / goodcount
|
||||||
fetch_average_characters = text_size / (goodcount + badcount)
|
fetch_average_characters = text_size / (goodcount + badcount)
|
||||||
info["total_good_characters"] = text_size
|
info["total_good_characters"] = text_size
|
||||||
info["average_good_characters"] = good_document_characters
|
info["average_good_characters"] = good_document_characters
|
||||||
info["average_fetch_characters"] = fetch_average_characters
|
info["average_fetch_characters"] = fetch_average_characters
|
||||||
domaincol = db["domain"]
|
domaincol = db["domain"]
|
||||||
if goodcount + badcount > 100:
|
print(json.dumps(info))
|
||||||
cl = LinkClassifier()
|
domaincol.update_one({"host":domain},{"$set":info},usert=True)
|
||||||
cl.train(db,hostname)
|
|
||||||
res = linkcol.aggregate([
|
|
||||||
{ "$match": { "status": "backlink","host":hostname } },
|
|
||||||
{ "$sample": { "size": BATCHSIZE * 100 } }
|
|
||||||
])
|
|
||||||
predicted_good = 0
|
|
||||||
predicted_bad = 0
|
|
||||||
for item in res:
|
|
||||||
cll = cl.classify(item["url"])
|
|
||||||
if cll > 0:
|
|
||||||
predicted_good += 1
|
|
||||||
else:
|
|
||||||
predicted_bad += 1
|
|
||||||
predicted_good_prob = 0
|
|
||||||
if predicted_good + predicted_bad > 0:
|
|
||||||
predicted_good_prob = predicted_good / (predicted_good + predicted_bad)
|
|
||||||
info["predicted_good_prob"] = predicted_good_prob
|
|
||||||
print(info)
|
|
||||||
domaincol.update_one({"host":hostname},{"$set":info},upsert=True)
|
|
||||||
|
|
||||||
def domain_summary(db,hostname):
|
def domain_summary(db,hostname):
|
||||||
linkcol = db["links"]
|
linkcol = db["links"]
|
||||||
|
Loading…
Reference in New Issue
Block a user