diff --git a/mongo/mongocrawler.py b/mongo/mongocrawler.py
index 608b0b7..2e077b3 100644
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@@ -241,7 +241,7 @@ def set_content_checksums(doc):
             sentences += 1
     doc["sentences_count"] = sentences
 
-def index_page(db,original_link,final_link,html,doc,filter_content=True):
+def index_page(db,original_link:str,final_link:str,html:bytes,doc,filter_content=True):
     linkcol = db["links"]
     htmlcol = db["html"]
     contentcol = db["content"]
@@ -277,7 +277,7 @@ def index_page(db,original_link,final_link,html,doc,filter_content=True):
         htdoc = get_link_doc(link,state)
         htdoc["html"] = html
         htdoc["html_size"] = len(html)
-        htdoc["html_md5"]= hashlib.md5(html.encode("utf8")).hexdigest()
+        htdoc["html_md5"]= hashlib.md5(html).hexdigest()
         # can be revisited - upsert
         del htdoc["url"]
         htmlcol.update_one({"url":link},{"$set":htdoc},upsert=True)
@@ -347,7 +347,7 @@ def index_links(db,extracted_links):
     for link,status in extracted_links:
         if not is_link_good(link):
             continue
-        if status == "frontlink" or status == "backlink":
+        if status == "frontlink" :
             doc = get_link_doc(link,status)
             try:
                 linkcol.insert_one(doc)
@@ -509,7 +509,7 @@ def link_summary(db,hostname):
         print(st,count)
         if st == "good":
             goodcount += count
-        if st != "frontlink" and st != "backlink":
+        if st != "frontlink":
             crawled_count += count
         if st != "good":
             bad_crawl_count += count
@@ -517,8 +517,6 @@ def link_summary(db,hostname):
     info["crawled_count"] = crawled_count
     info["bad_crawl_count"] = bad_crawl_count
     baclink_cout = 0
-    if "backlink" in info:
-        backlink_count = info["backlink"]
     good_prob= 0
     if crawled_count > 0:
         good_prob = goodcount / crawled_count
@@ -552,7 +550,7 @@ def link_summary(db,hostname):
 def sample_links(db,hostname,status,batch_size):
     print("Sampling links")
     linkcol = db["links"]
-    res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink","backlink"]}}})
+    res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink"]}}})
     cl = LinkClassifier()
     crawled_links = list(res)
     crawled_count = len(crawled_links)
@@ -678,7 +676,7 @@ def classify(start_link):
     start_link,hostname = courlan.check_url(start_link)
     cl = LinkClassifier()
     linkcol = db["links"]
-    res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink","backlink"]}}})
+    res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink"]}}})
     trainset, testset = split_train(res)
     cl.train(trainset)
 
@@ -764,7 +762,7 @@ def crawl_summary():
         values = [str(item[x]) for x in headers]
         print("\t".join(values))
 
-def extr(hdoc):
+def _extr(hdoc):
     url = hdoc["url"]
     html = binascii.a2b_qp(hdoc["quoted_html"])
     doc = extract_page(url,html)
@@ -773,30 +771,31 @@ def import_html():
     myclient= pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
-    contentcol = db["content"]
+    linkscol = db["links"]
     buffer = []
     counter = 0
 
     for i,l in enumerate(sys.stdin):
         hdoc = json.loads(l)
         url = hdoc["url"]
-        r = contentcol.find_one({"url":url},projection=["_id"])
-        if r is not None:
+        r = linkscol.find_one({"url":url})
+        if r is not None and r["status"] != "frontlink":
             print(">>>>" + str(i) + " copy: " + url)
             continue
         buffer.append(hdoc)
         if len(buffer) < 128:
             continue
         from multiprocessing import Pool
+        outs = []
         with Pool(8) as p:
-            outs = p.map(extr,buffer)
-            for hdoc,doc in zip(buffer,outs):
-                if doc is None:
-                    print("bad html" + hdoc["url"])
-                    continue
-                status = index_page(db,hdoc["url"],hdoc["url"],hdoc["quoted_html"],doc)
-                counter += 1
-                print( ">>> " + str(counter) + " " + hdoc["url"] + " " + status)
-        del buffer[:]
+            outs = p.map(_extr,buffer)
+        for hdoc,doc in zip(buffer,outs):
+            if doc is None:
+                print("bad html" + hdoc["url"])
+                continue
+            status = index_page(db,hdoc["url"],hdoc["url"], binascii.a2b_qp(hdoc["quoted_html"]),doc)
+            counter += 1
+            print( ">>> " + str(counter) + " " + str(i) + " " + hdoc["url"] + " " + status)
+        del buffer[:]
 
 
 def sample_domains():