zz
parent a8f5b149f2
commit ff5b2f1513
@@ -241,7 +241,7 @@ def set_content_checksums(doc):
             sentences += 1
     doc["sentences_count"] = sentences
 
-def index_page(db,original_link,final_link,html,doc,filter_content=True):
+def index_page(db,original_link:str,final_link:str,html:bytes,doc,filter_content=True):
     linkcol = db["links"]
     htmlcol = db["html"]
     contentcol = db["content"]
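The annotated signature makes explicit that index_page receives the raw page body as bytes rather than a decoded string, so a caller holding text has to encode it first. A minimal caller-side sketch (the variables are illustrative, not from the repository):

    # hypothetical caller-side preparation for the new html: bytes parameter
    page_text = "<html><body>Ahoj svet</body></html>"   # decoded HTML as str
    page_bytes = page_text.encode("utf8")               # what index_page now expects
    assert isinstance(page_bytes, bytes)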
@@ -277,7 +277,7 @@ def index_page(db,original_link,final_link,html,doc,filter_content=True):
     htdoc = get_link_doc(link,state)
     htdoc["html"] = html
     htdoc["html_size"] = len(html)
-    htdoc["html_md5"]= hashlib.md5(html.encode("utf8")).hexdigest()
+    htdoc["html_md5"]= hashlib.md5(html).hexdigest()
     # can be revisited - upsert
     del htdoc["url"]
     htmlcol.update_one({"url":link},{"$set":htdoc},upsert=True)
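Dropping .encode("utf8") follows from the bytes annotation above: hashlib.md5 hashes a bytes-like object directly, and hashing the payload as received avoids a decode/re-encode step that could alter the digest for non-UTF-8 pages (len(html) likewise now counts bytes). A small sketch with a throwaway byte string:

    import hashlib

    raw = b"<html><body>test</body></html>"    # html is already bytes under the new signature
    print(hashlib.md5(raw).hexdigest())         # digest of the payload exactly as received
    # the old code required a str and re-encoded it: hashlib.md5(text.encode("utf8")).hexdigest()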
@@ -347,7 +347,7 @@ def index_links(db,extracted_links):
     for link,status in extracted_links:
         if not is_link_good(link):
             continue
-        if status == "frontlink" or status == "backlink":
+        if status == "frontlink" :
             doc = get_link_doc(link,status)
             try:
                 linkcol.insert_one(doc)
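Only newly discovered frontlinks are inserted now; backlinks are no longer stored as separate link documents. The insertion itself keeps the usual pymongo duplicate-key pattern; a minimal sketch, assuming a unique index on "url" (the helper and its fields are illustrative):

    import pymongo

    def insert_frontlink(linkcol, link, host):
        # hypothetical helper mirroring the try/insert_one pattern above
        doc = {"url": link, "host": host, "status": "frontlink"}
        try:
            linkcol.insert_one(doc)
        except pymongo.errors.DuplicateKeyError:
            pass  # already known link: keep whatever status it has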
@@ -509,7 +509,7 @@ def link_summary(db,hostname):
         print(st,count)
         if st == "good":
             goodcount += count
-        if st != "frontlink" and st != "backlink":
+        if st != "frontlink":
             crawled_count += count
             if st != "good":
                 bad_crawl_count += count
@@ -517,8 +517,6 @@ def link_summary(db,hostname):
     info["crawled_count"] = crawled_count
     info["bad_crawl_count"] = bad_crawl_count
     baclink_cout = 0
-    if "backlink" in info:
-        backlink_count = info["backlink"]
     good_prob= 0
     if crawled_count > 0:
         good_prob = goodcount / crawled_count
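With the backlink bookkeeping gone, crawled_count covers every non-frontlink status and good_prob is simply the share of good links among crawled ones. A short worked sketch with made-up status counts:

    # made-up per-status counts, as the aggregation over the links collection might return
    counts = {"good": 80, "bad_parse": 15, "bad_type": 5, "frontlink": 200}
    crawled_count = sum(c for st, c in counts.items() if st != "frontlink")    # 100
    goodcount = counts.get("good", 0)                                          # 80
    good_prob = goodcount / crawled_count if crawled_count > 0 else 0          # 0.8
    print(crawled_count, good_prob)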
@@ -552,7 +550,7 @@ def link_summary(db,hostname):
 def sample_links(db,hostname,status,batch_size):
     print("Sampling links")
     linkcol = db["links"]
-    res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink","backlink"]}}})
+    res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink"]}}})
     cl = LinkClassifier()
     crawled_links = list(res)
     crawled_count = len(crawled_links)
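Sampling now only has to exclude frontlinks when collecting already-crawled links for the classifier. The same filter can also be written with MongoDB's $nin operator; a sketch of that equivalent form (field names follow the diff, the wrapper is illustrative):

    def crawled_links_query(hostname):
        # equivalent filter written with $nin, which means "not in this list"
        return {"host": hostname, "status": {"$nin": ["frontlink"]}}

    # usage (illustrative): res = linkcol.find(crawled_links_query("example.com"))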
@@ -678,7 +676,7 @@ def classify(start_link):
     start_link,hostname = courlan.check_url(start_link)
     cl = LinkClassifier()
     linkcol = db["links"]
-    res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink","backlink"]}}})
+    res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink"]}}})
     trainset, testset = split_train(res)
 
     cl.train(trainset)
@@ -764,7 +762,7 @@ def crawl_summary():
         values = [str(item[x]) for x in headers]
         print("\t".join(values))
 
-def extr(hdoc):
+def _extr(hdoc):
     url = hdoc["url"]
     html = binascii.a2b_qp(hdoc["quoted_html"])
     doc = extract_page(url,html)
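The leading underscore marks extr as a module-private helper; it still has to remain a picklable top-level function so multiprocessing can hand it to Pool.map. The stored HTML is quoted-printable, which binascii turns back into bytes; a small round-trip sketch with throwaway data:

    import binascii

    html = b"<html><body>Qu\xc3\xa9 tal</body></html>"   # raw page bytes
    quoted = binascii.b2a_qp(html)                        # quoted-printable, safe to keep as text
    restored = binascii.a2b_qp(quoted)                    # back to the original bytes
    assert restored == html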
@@ -773,30 +771,31 @@ def extr(hdoc):
 def import_html():
     myclient= pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
-    contentcol = db["content"]
+    linkscol = db["links"]
     buffer = []
     counter = 0
     for i,l in enumerate(sys.stdin):
         hdoc = json.loads(l)
         url = hdoc["url"]
-        r = contentcol.find_one({"url":url},projection=["_id"])
-        if r is not None:
+        r = linkscol.find_one({"url":url})
+        if r is not None and r["status"] != "frontlink":
             print(">>>>" + str(i) + " copy: " + url)
             continue
         buffer.append(hdoc)
         if len(buffer) < 128:
             continue
         from multiprocessing import Pool
+        outs = []
         with Pool(8) as p:
-            outs = p.map(extr,buffer)
+            outs = p.map(_extr,buffer)
         for hdoc,doc in zip(buffer,outs):
             if doc is None:
                 print("bad html" + hdoc["url"])
                 continue
-            status = index_page(db,hdoc["url"],hdoc["url"],hdoc["quoted_html"],doc)
+            status = index_page(db,hdoc["url"],hdoc["url"], binascii.a2b_qp(hdoc["quoted_html"]),doc)
             counter += 1
-            print( ">>> " + str(counter) + " " + hdoc["url"] + " " + status)
+            print( ">>> " + str(counter) + " " + str(i) + " " + hdoc["url"] + " " + status)
         del buffer[:]
 
 
 def sample_domains():
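import_html now deduplicates against the links collection (skipping anything already promoted past frontlink), decodes the quoted-printable HTML before handing it to index_page, and keeps the batch-of-128 Pool.map structure. A stripped-down sketch of just that batching pattern, independent of MongoDB (worker, batch size and output are illustrative):

    import sys, json
    from multiprocessing import Pool

    def _work(line):
        # hypothetical stand-in for _extr: parse one JSON line into a result
        return json.loads(line).get("url")

    def process_stdin(batch_size=128, workers=8):
        buffer = []
        with Pool(workers) as pool:
            for line in sys.stdin:
                buffer.append(line)
                if len(buffer) < batch_size:
                    continue
                for url in pool.map(_work, buffer):   # workers must be top-level functions
                    print(url)
                del buffer[:]
            # flush any remaining partial batch
            for url in pool.map(_work, buffer):
                print(url)

    if __name__ == "__main__":
        process_stdin()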