diff --git a/mongo/cli.py b/mongo/cli.py
index 093dcc9..89538ef 100644
--- a/mongo/cli.py
+++ b/mongo/cli.py
@@ -1,6 +1,8 @@
 import click
 import mongocrawler
 import rq
+import redis
+import sys
 import os
 
 REDIS_URL= os.getenv("REDIS_URL","redis://localhost:6379/")
diff --git a/mongo/mongocrawler.py b/mongo/mongocrawler.py
index 7179f82..6da2061 100644
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@@ -154,21 +154,16 @@ def fetch_robot(base_url:str)->urllib.robotparser.RobotFileParser:
     # exceptions happening here
     return rules
 
+def extract_page(final_link,html):
+    doc = None
+    if html is not None:
+        doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=False,target_language=LANGUAGE,favor_precision=True)
+        if doc is not None:
+            if not "text" in doc or len(doc["text"]) < MINTEXTSIZE:
+                # text too small
+                doc = None
+    return doc
 
-def extract_pages(link_batch:list,responses:list)->list:
-    out = []
-    for original_link,(final_link,html) in zip(link_batch,responses):
-        doc = None
-        assert original_link is not None
-        if html is not None:
-            doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=False,target_language=LANGUAGE,favor_precision=True)
-            if doc is not None:
-                if not "text" in doc or len(doc["text"]) < MINTEXTSIZE:
-                    # text too small
-                    doc = None
-
-        out.append((original_link,final_link,html,doc))
-    return out
 
 def set_content_checksums(doc):
     text = doc["text"]
@@ -186,85 +181,82 @@ def set_content_checksums(doc):
             sentences += 1
     doc["sentences_count"] = sentences
 
-def index_pages(db,hostname,extracted_pages):
+def index_page(db,original_link,final_link,html,doc):
     linkcol = db["links"]
     htmlcol = db["html"]
     contentcol = db["content"]
     checkcol = db["check"]
-    links = []
-    # stats of the batch
-    good_document_count = 0
-    document_count = 0
-    text_size = 0
-    good_text_size = 0
-    original_text_size = 0
-    for original_link,final_link,html,doc in extracted_pages:
-        document_count += 1
-        state = "good"
-        link = original_link
-        if original_link != final_link:
-            print(original_link,final_link)
-            linkcol.update_one({"url":original_link},{"$set":{"status":"redirect"}})
-            link = final_link
-        if html is None:
-            state = "html_error"
-        elif doc is None:
-            state = "content_error"
-        if doc is not None:
-            set_content_checksums(doc)
-            tsz = doc["text_size"]
-            text_size += tsz
-            psz = doc["paragraph_sizes_sum"]
-            if tsz < TEXT_TRASH_SIZE or psz/tsz < TEXT_TRASH_RATIO:
-                state = "small"
-        # check copy
-        if state == "good":
-            origsz = 0
-            for chs,paragraph_size in zip(doc["paragraph_checksums"],doc["paragraph_sizes"]):
-                # index paragraph checksums
-                nd = checkcol.find_one({"_id":chs})
-                if nd is None:
-                    origsz += paragraph_size
+    state = "good"
+    link = original_link
+    if original_link != final_link:
+        print(original_link,final_link)
+        linkcol.update_one({"url":original_link},{"$set":{"status":"redirect"}})
+        link = final_link
+    if html is None:
+        state = "html_error"
+    elif doc is None:
+        state = "content_error"
+    if doc is not None:
+        set_content_checksums(doc)
+        tsz = doc["text_size"]
+        psz = doc["paragraph_sizes_sum"]
+        if tsz < TEXT_TRASH_SIZE or psz/tsz < TEXT_TRASH_RATIO:
+            state = "small"
+    # check copy
+    if state == "good":
+        origsz = 0
+        for chs,paragraph_size in zip(doc["paragraph_checksums"],doc["paragraph_sizes"]):
+            # index paragraph checksums
+            nd = checkcol.find_one({"_id":chs})
+            if nd is None:
+                origsz += paragraph_size
+        doc["original_text_size"] = origsz
 
-            if (1 - (origsz / tsz)) > TEXT_TRASH_RATIO:
-                state = "copy"
-            original_text_size += origsz
-            print(origsz)
+        if (1 - (origsz / tsz)) > TEXT_TRASH_RATIO:
+            state = "copy"
+        print(origsz)
+    if state == "good":
+        htdoc = get_link_doc(link,state)
+        htdoc["html"] = html
+        htdoc["html_size"] = len(html)
+        htdoc["html_md5"]= hashlib.md5(html.encode("utf8")).hexdigest()
+        # can be revisited - upsert
+        del htdoc["url"]
+        htmlcol.update_one({"url":link},{"$set":htdoc},upsert=True)
+        doc.update(get_link_doc(link,"good"))
+        # todo extract links
+        print(link,doc)
+        del doc["url"]
+        contentcol.update_one({"url":link},{"$set":doc},upsert=True)
+        for chs in doc["paragraph_checksums"]:
+            try:
+                checkcol.insert_one({"_id":chs})
+            except pymongo.errors.DuplicateKeyError as err:
+                pass
+
+    linkdoc = get_link_doc(link,state)
+    del linkdoc["url"]
+    linkcol.update_one({"url":link},{"$set":linkdoc})
+    return state
+
+def save_batch_info(db,host,states,docs):
+    good_document_count = 0
+    original_text_size = 0
+    batch_size = 0
+    _,domain = courlan.get_hostinfo(host)
+    for state,doc in zip(states,docs):
+        batch_size += 1
         if state == "good":
             good_document_count += 1
-            good_text_size += doc["text_size"]
-            htdoc = get_link_doc(link,state)
-            htdoc["html"] = html
-            htdoc["html_size"] = len(html)
-            htdoc["html_md5"]= hashlib.md5(html.encode("utf8")).hexdigest()
-            # can be revisited - upsert
-            del htdoc["url"]
-            htmlcol.update_one({"url":link},{"$set":htdoc},upsert=True)
-            doc.update(get_link_doc(link,"good"))
-            # todo extract links
-            print(link,doc)
-            del doc["url"]
-            contentcol.update_one({"url":link},{"$set":doc},upsert=True)
-            for chs in doc["paragraph_checksums"]:
-                try:
-                    checkcol.insert_one({"_id":chs})
-                except pymongo.errors.DuplicateKeyError as err:
-                    pass
-
-        linkdoc = get_link_doc(link,state)
-        del linkdoc["url"]
-        linkcol.update_one({"url":link},{"$set":linkdoc})
+            original_text_size += doc["original_text_size"]
     batchdoc = {
-        "host": linkdoc["host"],
-        "domain": linkdoc["domain"],
+        "host": host,
+        "domain": domain,
         "created_at": dat.utcnow(),
         "good_document_count":good_document_count,
-        "document_count":document_count,
-        "text_size":text_size,
-        "good_text_size":good_text_size,
         "original_text_size":original_text_size,
-        "batch_size": BATCHSIZE,
-        "average_fetch_characters": text_size / BATCHSIZE,
+        "good_prob": good_document_count / batch_size,
+        "batch_size": batch_size,
     }
     db["batches"].insert_one(batchdoc)
     print(batchdoc)
@@ -699,10 +691,23 @@ def visit(hostname):
     responses = []
     for link in links:
         responses.append(fetch_page(link))
-    extracted_pages = extract_pages(links,responses)
+
+    extracted_pages = []
+    for original_link,(final_link,html) in zip(links,responses):
+        doc = None
+        assert original_link is not None
+        doc = extract_page(final_link,html)
+        extracted_pages.append((original_link,final_link,html,doc))
+
     extracted_links = extract_links(links,responses,hostname,rules,"frontlink")
     index_links(db,extracted_links)
-    index_pages(db,hostname,extracted_pages)
+    final_states = []
+    docs = []
+    for original_link,final_link,html,doc in extracted_pages:
+        status = index_page(db,original_link,final_link,html,doc)
+        final_states.append(status)
+        docs.append(doc)
+    save_batch_info(db,hostname,final_states,docs)
     link_summary(db,hostname)
 
 def crawl_summary():
@@ -719,17 +724,29 @@ def crawl_summary():
             "batch_count":{"$sum":"$batch_size"},
             "text_size":{"$sum":"$text_size"},
            "original_text_size":{"$sum":"$original_text_size"},
-            "count":{"$sum":1},
             }
         },
+        {"$sort":{"original_text_size":-1}},
     ])
     print(">>>> Batches")
-    headers = ["_id","document_count","good_document_count","count","batch_count","text_size","original_text_size"]
+    headers = ["_id","document_count","good_document_count","batch_count","text_size","original_text_size"]
     print("\t".join(headers))
     for item in res:
         values = [str(item[x]) for x in headers]
         print("\t".join(values))
 
+import binascii
+
+def import_html():
+    myclient = pymongo.MongoClient(CONNECTION)
+    db = myclient[DBNAME]
+    for l in sys.stdin:
+        hdoc = json.loads(l)
+        url = hdoc["url"]
+        # quoted_html is quoted-printable encoded, so decode it before parsing
+        html = bs4.BeautifulSoup(binascii.a2b_qp(hdoc["quoted_html"])).prettify()
+        doc = extract_page(url,html)
+        index_page(db,url,url,html,doc)
+
 def sample_domains():
     myclient = pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
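For context: the cli.py hunk only adds the redis and sys imports next to the existing rq import and keeps the REDIS_URL default untouched. A minimal sketch of how an rq queue is typically wired to that URL so that mongocrawler.visit can run as a background job; the queue name and the enqueue call are assumptions for illustration, not part of this patch:

    import os
    import redis
    import rq
    import mongocrawler

    REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/")

    # Assumed wiring: open a Redis connection and push one visit job onto an rq queue.
    queue = rq.Queue("crawler", connection=redis.from_url(REDIS_URL))
    queue.enqueue(mongocrawler.visit, "www.example.com")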
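The new import_html() reads JSON lines from stdin, each carrying a url field and a quoted-printable encoded quoted_html field that is decoded with binascii before parsing. A minimal companion sketch, assuming that input format, of how such a dump could be produced; dump_page is a hypothetical helper, not part of the crawler:

    import binascii
    import json
    import sys

    # Hypothetical exporter: emit one JSON object per line in the shape
    # import_html() consumes, with the raw HTML quoted-printable encoded.
    def dump_page(url, html):
        record = {
            "url": url,
            "quoted_html": binascii.b2a_qp(html.encode("utf8")).decode("ascii"),
        }
        sys.stdout.write(json.dumps(record) + "\n")

    dump_page("https://www.example.com/", "<html><body><p>Hello</p></body></html>")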