commit ff5b2f1513
parent a8f5b149f2
Author: Daniel Hládek
Date:   2023-05-18 10:23:13 +02:00


@@ -241,7 +241,7 @@ def set_content_checksums(doc):
             sentences += 1
     doc["sentences_count"] = sentences
 
-def index_page(db,original_link,final_link,html,doc,filter_content=True):
+def index_page(db,original_link:str,final_link:str,html:bytes,doc,filter_content=True):
     linkcol = db["links"]
     htmlcol = db["html"]
     contentcol = db["content"]
@@ -277,7 +277,7 @@ def index_page(db,original_link,final_link,html,doc,filter_content=True):
         htdoc = get_link_doc(link,state)
         htdoc["html"] = html
         htdoc["html_size"] = len(html)
-        htdoc["html_md5"]= hashlib.md5(html.encode("utf8")).hexdigest()
+        htdoc["html_md5"]= hashlib.md5(html).hexdigest()
         # can be revisited - upsert
         del htdoc["url"]
         htmlcol.update_one({"url":link},{"$set":htdoc},upsert=True)
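A quick note on the hashing change above: hashlib.md5 operates on bytes, so once the html parameter is typed as bytes it can be hashed directly, while the old .encode("utf8") call only made sense when html was a str. A minimal sketch (the sample value is illustrative, not from the crawler):

    import hashlib

    html_bytes = b"<html><body>ahoj</body></html>"   # raw page body as bytes
    html_text = html_bytes.decode("utf8")            # the same content as str

    # bytes hash directly; a str has to be encoded first, otherwise md5() raises TypeError
    assert hashlib.md5(html_bytes).hexdigest() == hashlib.md5(html_text.encode("utf8")).hexdigest()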
@@ -347,7 +347,7 @@ def index_links(db,extracted_links):
     for link,status in extracted_links:
         if not is_link_good(link):
             continue
-        if status == "frontlink" or status == "backlink":
+        if status == "frontlink" :
             doc = get_link_doc(link,status)
             try:
                 linkcol.insert_one(doc)
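The insert_one call above sits inside a try block, which suggests duplicate links are rejected by the database rather than checked in Python; that pattern only works if the links collection has a unique index on url. A small sketch of the idea, assuming such an index exists (the helper name is hypothetical):

    import pymongo

    def insert_if_new(linkcol, doc):
        # assumes linkcol.create_index("url", unique=True) was run at setup time
        try:
            linkcol.insert_one(doc)
            return True
        except pymongo.errors.DuplicateKeyError:
            # the link is already queued; the unique index enforces deduplication
            return False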
@@ -509,7 +509,7 @@ def link_summary(db,hostname):
         print(st,count)
         if st == "good":
             goodcount += count
-        if st != "frontlink" and st != "backlink":
+        if st != "frontlink":
             crawled_count += count
             if st != "good":
                 bad_crawl_count += count
@@ -517,8 +517,6 @@ def link_summary(db,hostname):
     info["crawled_count"] = crawled_count
     info["bad_crawl_count"] = bad_crawl_count
     baclink_cout = 0
-    if "backlink" in info:
-        backlink_count = info["backlink"]
     good_prob= 0
     if crawled_count > 0:
         good_prob = goodcount / crawled_count
@@ -552,7 +550,7 @@ def link_summary(db,hostname):
 def sample_links(db,hostname,status,batch_size):
     print("Sampling links")
     linkcol = db["links"]
-    res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink","backlink"]}}})
+    res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink"]}}})
     cl = LinkClassifier()
     crawled_links = list(res)
     crawled_count = len(crawled_links)
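On the query above: the status filter excludes links that are still in the frontlink state, i.e. it keeps only links that have already been crawled. The same exclusion is usually written with the $nin operator; a sketch with hypothetical connection values:

    import pymongo

    client = pymongo.MongoClient("mongodb://localhost:27017")   # placeholder URI
    linkcol = client["crawler"]["links"]

    # select links for a host whose status is anything except "frontlink"
    for doc in linkcol.find({"host": "example.com", "status": {"$nin": ["frontlink"]}}):
        print(doc["url"], doc["status"])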
@@ -678,7 +676,7 @@ def classify(start_link):
     start_link,hostname = courlan.check_url(start_link)
     cl = LinkClassifier()
     linkcol = db["links"]
-    res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink","backlink"]}}})
+    res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink"]}}})
     trainset, testset = split_train(res)
     cl.train(trainset)
@@ -764,7 +762,7 @@ def crawl_summary():
         values = [str(item[x]) for x in headers]
         print("\t".join(values))
 
-def extr(hdoc):
+def _extr(hdoc):
     url = hdoc["url"]
     html = binascii.a2b_qp(hdoc["quoted_html"])
     doc = extract_page(url,html)
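The _extr worker above rebuilds the raw HTML from a quoted-printable field, which is how binary page bodies survive being stored as JSON lines on stdin. A round-trip sketch of that encoding (the sample string is illustrative):

    import binascii

    raw = "vitajte na stránke".encode("utf8")        # HTML bytes with non-ASCII content

    # quoted-printable keeps the payload as plain ASCII, so it fits in a JSON string
    quoted = binascii.b2a_qp(raw).decode("ascii")

    # decoding reverses it exactly; this is what gets passed on to extract_page
    assert binascii.a2b_qp(quoted) == raw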
@@ -773,29 +771,30 @@ def extr(hdoc):
 def import_html():
     myclient= pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
-    contentcol = db["content"]
+    linkscol = db["links"]
     buffer = []
     counter = 0
     for i,l in enumerate(sys.stdin):
         hdoc = json.loads(l)
         url = hdoc["url"]
-        r = contentcol.find_one({"url":url},projection=["_id"])
-        if r is not None:
+        r = linkscol.find_one({"url":url})
+        if r is not None and r["status"] != "frontlink":
             print(">>>>" + str(i) + " copy: " + url)
             continue
         buffer.append(hdoc)
         if len(buffer) < 128:
             continue
         from multiprocessing import Pool
+        outs = []
         with Pool(8) as p:
-            outs = p.map(extr,buffer)
+            outs = p.map(_extr,buffer)
         for hdoc,doc in zip(buffer,outs):
             if doc is None:
                 print("bad html" + hdoc["url"])
                 continue
-            status = index_page(db,hdoc["url"],hdoc["url"],hdoc["quoted_html"],doc)
+            status = index_page(db,hdoc["url"],hdoc["url"], binascii.a2b_qp(hdoc["quoted_html"]),doc)
             counter += 1
-            print( ">>> " + str(counter) + " " + hdoc["url"] + " " + status)
+            print( ">>> " + str(counter) + " " + str(i) + " " + hdoc["url"] + " " + status)
         del buffer[:]
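One constraint behind the worker used with the pool above: multiprocessing.Pool.map pickles the worker function by reference, so it must be a plain module-level function (a lambda or nested function would fail to pickle); the leading underscore in _extr only marks it as internal. A minimal sketch of that usage:

    from multiprocessing import Pool

    def _double(x):
        # the worker has to live at module level so child processes can import it
        return 2 * x

    if __name__ == "__main__":
        with Pool(4) as p:
            print(p.map(_double, [1, 2, 3]))   # [2, 4, 6]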