From a8f5b149f202a7f2a9830688926a077c41395f71 Mon Sep 17 00:00:00 2001
From: Daniel Hladek
Date: Tue, 16 May 2023 15:18:01 +0200
Subject: [PATCH] Filter table rows from extracted text, add crawl summary
 stats, and batch the stdin HTML import
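
The extraction step now drops lines that trafilatura renders from HTML
tables or pagination widgets before the MIN_TEXT_SIZE check,
crawl_summary() additionally reports the total text size and per-status
link counts, and import_html() skips already-indexed URLs and extracts
pages in batches of 128 through a multiprocessing Pool.

A minimal standalone sketch of the new table filter in extract_page()
(drop_table_lines is an illustrative name, not part of the patch):

    def drop_table_lines(text):
        # drop table rows ("| a | b |") and flattened pagination
        # leftovers ("1 2 3 4 ...", "12345...")
        good_lines = []
        for line in text.split("\n"):
            if line.startswith("|") or line.startswith("1 2 3 4") or line.startswith("12345"):
                continue
            good_lines.append(line)
        return "\n".join(good_lines)

    assert drop_table_lines("title\n| a | b |\nbody") == "title\nbody"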
---
 mongo/mongocrawler.py | 98 ++++++++++++++++++++++++++++++++++++--------
 1 file changed, 74 insertions(+), 24 deletions(-)

diff --git a/mongo/mongocrawler.py b/mongo/mongocrawler.py
index 880e267..608b0b7 100644
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@@ -172,7 +172,6 @@ def fetch_page(link:str)->(str,str):
     html = None
     if response is not None :
         good = True
-        print(response)
         if response.status != 200:
             good = False
             LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
@@ -212,8 +211,16 @@ def extract_page(final_link,html):
     if html is not None:
         doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=False,target_language=LANGUAGE,favor_precision=True)
         if doc is not None:
+            lines = doc.get("text","").split("\n")
+            # filter out table rows and pagination leftovers
+            good_lines = []
+            for line in lines:
+                if line.startswith("|") or line.startswith("1 2 3 4") or line.startswith("12345"):
+                    continue
+                good_lines.append(line)
+            doc["text"] = "\n".join(good_lines)
+            # text too small
             if not "text" in doc or len(doc["text"]) < MIN_TEXT_SIZE:
-                # text too small
                 doc = None
     return doc
 
@@ -642,12 +649,13 @@ def parseurl(link):
     print(rules.crawl_delay("*"))
     html = trafilatura.fetch_url(link,decode=True)
     get_bs_links(link,html)
-    doc = trafilatura.bare_extraction(html)
-    import pprint
-    pprint.pprint(doc)
-    internal_links, external_links = get_bs_links(link,html)
-    print(internal_links)
-    print(external_links)
+    doc = extract_page(link,html)
+    if doc is not None:
+        import pprint
+        pprint.pprint(doc)
+        internal_links, external_links = get_bs_links(link,html)
+        print(internal_links)
+        print(external_links)
 
 
 
@@ -719,6 +727,21 @@ def visit(hostname,filter_content=True):
 def crawl_summary():
     myclient = pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
+    contentcol = db["content"]
+    res = contentcol.aggregate([
+        {"$group":{"_id":None,"total_text_size":{"$sum":"$text_size"}}}
+    ])
+    print(">>>>> Total text size in content")
+    for item in res:
+        print(item)
+    linkscol = db["links"]
+    # find counts of link statuses
+    res = linkscol.aggregate([
+        {"$group":{"_id":"$status","count":{"$sum":1}}}
+    ])
+    print(">>>>> Link status counts")
+    for item in res:
+        print(item["_id"],item["count"])
     batchcol = db["batches"]
     yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
     print(yesterday)
@@ -732,6 +755,7 @@ def crawl_summary():
         }
         },
         {"$sort":{"original_text_size":-1}},
+        {"$limit":100},
     ])
     print(">>>> Batches")
     headers = ["_id","document_count","good_document_count","batch_size","original_text_size"]
@@ -739,29 +763,55 @@ def crawl_summary():
     for item in res:
         values = [str(item[x]) for x in headers]
         print("\t".join(values))
-    contentcol = db["content"]
-    res = contentcol.aggregate([
-        {"$group":{"_id":None,total_text_size:{"$sum":"$text_size"}}}
-    ])
-    print(">>>>> Total text size in content")
-    for item in res:
-        print(res)
+
+def extr(hdoc):
+    # decode the quoted-printable HTML and extract the text
+    url = hdoc["url"]
+    html = binascii.a2b_qp(hdoc["quoted_html"])
+    doc = extract_page(url,html)
+    return doc
 
 def import_html():
     myclient= pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
-    for l in sys.stdin:
+    contentcol = db["content"]
+    buffer = []
+    counter = 0
+    from multiprocessing import Pool
+    for i,l in enumerate(sys.stdin):
         hdoc = json.loads(l)
         url = hdoc["url"]
-        # beautifusoup is to unify encoding
-        html = BeautifulSoup(binascii.a2b_qp(hdoc["quoted_html"])).prettify()
-        doc = extract_page(url,html)
-        if doc is not None:
-            print("------=====-")
-            print(doc)
-            status = index_page(db,url,url,html,doc)
-            print(status)
+        # skip URLs that are already in the content collection
+        r = contentcol.find_one({"url":url},projection=["_id"])
+        if r is not None:
+            print(">>>> " + str(i) + " copy: " + url)
+            continue
+        buffer.append(hdoc)
+        if len(buffer) < 128:
+            continue
+        # extract a full batch in parallel, then index sequentially
+        with Pool(8) as p:
+            outs = p.map(extr,buffer)
+        for hdoc,doc in zip(buffer,outs):
+            if doc is None:
+                print("bad html " + hdoc["url"])
+                continue
+            status = index_page(db,hdoc["url"],hdoc["url"],hdoc["quoted_html"],doc)
+            counter += 1
+            print(">>> " + str(counter) + " " + hdoc["url"] + " " + status)
+        del buffer[:]
+    # flush the final partial batch left when stdin is exhausted
+    if len(buffer) > 0:
+        with Pool(8) as p:
+            outs = p.map(extr,buffer)
+        for hdoc,doc in zip(buffer,outs):
+            if doc is None:
+                print("bad html " + hdoc["url"])
+                continue
+            status = index_page(db,hdoc["url"],hdoc["url"],hdoc["quoted_html"],doc)
+            counter += 1
+            print(">>> " + str(counter) + " " + hdoc["url"] + " " + status)
+
 
 def sample_domains():
     myclient = pymongo.MongoClient(CONNECTION)
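
Note on the import_html() input format: each stdin line is a JSON object
carrying the page HTML as quoted-printable text under "quoted_html". A
sketch of a producer for that format, assuming UTF-8 pages (encode_record
is a hypothetical helper, not part of the crawler):

    import binascii
    import json

    def encode_record(url, html):
        # quoted-printable keeps the JSON line ASCII-safe;
        # extr() reverses this with binascii.a2b_qp()
        quoted = binascii.b2a_qp(html.encode("utf-8")).decode("ascii")
        return json.dumps({"url": url, "quoted_html": quoted})

    print(encode_record("https://example.com/", "<html><body>text</body></html>"))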