Daniel Hládek 2023-05-16 15:18:01 +02:00
parent a7d048c952
commit a8f5b149f2

@@ -172,7 +172,6 @@ def fetch_page(link:str)->(str,str):
     html = None
     if response is not None :
         good = True
-        print(response)
         if response.status != 200:
             good = False
             LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
@@ -212,8 +211,16 @@ def extract_page(final_link,html):
     if html is not None:
         doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=False,target_language=LANGUAGE,favor_precision=True)
         if doc is not None:
-            if not "text" in doc or len(doc["text"]) < MIN_TEXT_SIZE:
+            lines = doc["text"].split("\n")
+            # filter out tables
+            good_lines = []
+            for line in lines:
+                if line.startswith("|") or line.startswith("1 2 3 4") or line.startswith("12345"):
+                    continue
+                good_lines.append(line)
+            doc["text"] = "\n".join(good_lines)
             # text too small
+            if not "text" in doc or len(doc["text"]) < MIN_TEXT_SIZE:
                 doc = None
     return doc
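
The new step in extract_page() drops lines that look like table rows or pagination runs before the minimum-size check is applied, so a page whose extracted text is mostly tabular can fall under MIN_TEXT_SIZE and be discarded. A minimal standalone sketch of the same idea (the helper name and sample text are illustrative, not part of the source):

def filter_table_lines(text: str) -> str:
    # drop Markdown-style table rows ("|...") and pagination-like digit runs
    good_lines = []
    for line in text.split("\n"):
        if line.startswith("|") or line.startswith("1 2 3 4") or line.startswith("12345"):
            continue
        good_lines.append(line)
    return "\n".join(good_lines)

print(filter_table_lines("Real paragraph text\n| col1 | col2 |\n12345"))
# prints only "Real paragraph text"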
@@ -642,7 +649,8 @@ def parseurl(link):
     print(rules.crawl_delay("*"))
     html = trafilatura.fetch_url(link,decode=True)
     get_bs_links(link,html)
-    doc = trafilatura.bare_extraction(html)
+    doc = extract_page(link,html)
+    if doc is not None:
         import pprint
         pprint.pprint(doc)
     internal_links, external_links = get_bs_links(link,html)
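
parseurl() now routes its debug extraction through the shared extract_page() instead of calling trafilatura.bare_extraction() directly, so the same table filter and size check apply. A hedged sketch of that flow, mirroring the calls in the diff and assuming extract_page and the module constants are in scope (debug_extract is an illustrative name):

import pprint
import trafilatura

def debug_extract(link):
    html = trafilatura.fetch_url(link, decode=True)
    doc = extract_page(link, html)  # returns None for missing, short or table-only text
    if doc is not None:
        pprint.pprint(doc)
    return doc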
@ -719,6 +727,21 @@ def visit(hostname,filter_content=True):
def crawl_summary(): def crawl_summary():
myclient = pymongo.MongoClient(CONNECTION) myclient = pymongo.MongoClient(CONNECTION)
db=myclient[DBNAME] db=myclient[DBNAME]
contentcol = db["content"]
res = contentcol.aggregate([
{"$group":{"_id":None,"total_text_size":{"$sum":"$text_size"}}}
])
print(">>>>> Total text size in content")
for item in res:
print(item)
linkscol = db["links"]
# find counts of link statuses
res = linkscol.aggregate([
{"$group":{"_id":"$status","count":{"$sum":1}}}
])
print(">>>>> Link status counts")
for item in res:
print(item["_id"],item["count"])
batchcol = db["batches"] batchcol = db["batches"]
yesterday = datetime.datetime.today() - datetime.timedelta(days=1) yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
print(yesterday) print(yesterday)
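
crawl_summary() now reports the total extracted text size and a per-status count of links before printing the batch table. A standalone sketch of the same aggregations, assuming the collection layout used by the crawler ("content" documents carrying a text_size field, "links" documents carrying a status field); the function name is illustrative:

import pymongo

def print_totals(connection, dbname):
    db = pymongo.MongoClient(connection)[dbname]
    # total size of all extracted text
    for item in db["content"].aggregate([
        {"$group": {"_id": None, "total_text_size": {"$sum": "$text_size"}}}
    ]):
        print("total text size:", item["total_text_size"])
    # number of links per crawl status
    for item in db["links"].aggregate([
        {"$group": {"_id": "$status", "count": {"$sum": 1}}}
    ]):
        print(item["_id"], item["count"])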
@@ -732,6 +755,7 @@ def crawl_summary():
             }
         },
         {"$sort":{"original_text_size":-1}},
+        {"$limit":100},
     ])
     print(">>>> Batches")
     headers = ["_id","document_count","good_document_count","batch_size","original_text_size"]
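
The added {"$limit":100} stage caps the batch report at the 100 batches with the largest original_text_size; placed after $sort, it keeps the top of the sorted order. A sketch of the stage order (the $group fields and grouping key are assumptions based on the headers list, since the full pipeline is outside this hunk):

pipeline = [
    {"$group": {
        "_id": "$batch_id",                                   # assumed grouping key
        "document_count": {"$sum": 1},
        "original_text_size": {"$sum": "$original_text_size"},
    }},
    {"$sort": {"original_text_size": -1}},   # largest batches first
    {"$limit": 100},                         # report only the top 100
]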
@ -739,29 +763,41 @@ def crawl_summary():
for item in res: for item in res:
values = [str(item[x]) for x in headers] values = [str(item[x]) for x in headers]
print("\t".join(values)) print("\t".join(values))
contentcol = db["content"]
res = contentcol.aggregate([
{"$group":{"_id":None,total_text_size:{"$sum":"$text_size"}}}
])
print(">>>>> Total text size in content")
for item in res:
print(res)
def extr(hdoc):
url = hdoc["url"]
html = binascii.a2b_qp(hdoc["quoted_html"])
doc = extract_page(url,html)
return doc
def import_html(): def import_html():
myclient= pymongo.MongoClient(CONNECTION) myclient= pymongo.MongoClient(CONNECTION)
db=myclient[DBNAME] db=myclient[DBNAME]
for l in sys.stdin: contentcol = db["content"]
buffer = []
counter = 0
for i,l in enumerate(sys.stdin):
hdoc = json.loads(l) hdoc = json.loads(l)
url = hdoc["url"] url = hdoc["url"]
# beautifusoup is to unify encoding r = contentcol.find_one({"url":url},projection=["_id"])
html = BeautifulSoup(binascii.a2b_qp(hdoc["quoted_html"])).prettify() if r is not None:
doc = extract_page(url,html) print(">>>>" + str(i) + " copy: " + url)
if doc is not None: continue
print("------=====-") buffer.append(hdoc)
print(doc) if len(buffer) < 128:
status = index_page(db,url,url,html,doc) continue
print(status) from multiprocessing import Pool
with Pool(8) as p:
outs = p.map(extr,buffer)
for hdoc,doc in zip(buffer,outs):
if doc is None:
print("bad html" + hdoc["url"])
continue
status = index_page(db,hdoc["url"],hdoc["url"],hdoc["quoted_html"],doc)
counter += 1
print( ">>> " + str(counter) + " " + hdoc["url"] + " " + status)
del buffer[:]
def sample_domains(): def sample_domains():
myclient = pymongo.MongoClient(CONNECTION) myclient = pymongo.MongoClient(CONNECTION)
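
import_html() now skips URLs already present in the content collection, buffers 128 documents at a time, and extracts them in parallel through extr() with a multiprocessing Pool of 8 workers. A self-contained sketch of the same buffer-and-Pool pattern (extract_doc and the sample items are illustrative stand-ins for extr() and the JSON lines read from stdin):

from multiprocessing import Pool

def extract_doc(item):
    # stand-in for extr(): return None when extraction fails
    return item.upper() if item else None

def process(items, batch_size=128, workers=8):
    buffer = []
    for item in items:
        buffer.append(item)
        if len(buffer) < batch_size:
            continue  # keep filling until a full batch is ready
        with Pool(workers) as p:
            results = p.map(extract_doc, buffer)
        for item, doc in zip(buffer, results):
            if doc is None:
                continue  # skip failed extractions
            print(item, doc)
        del buffer[:]  # start the next batch

if __name__ == "__main__":
    process(["a", "b", "", "c"], batch_size=2, workers=2)

As in the diff, items still sitting in the buffer when the input ends are not flushed; a final drain pass would be needed if every remaining document must be indexed.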