commit a8f5b149f2
parent a7d048c952
Author: Daniel Hládek
Date: 2023-05-16 15:18:01 +02:00

@@ -172,7 +172,6 @@ def fetch_page(link:str)->(str,str):
html = None
if response is not None:
good = True
print(response)
if response.status != 200:
good = False
LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
@@ -212,8 +211,16 @@ def extract_page(final_link,html):
if html is not None:
doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=False,target_language=LANGUAGE,favor_precision=True)
if doc is not None:
lines = doc["text"].split("\n")
# filter out table-like lines (markdown table pipes and pagination digit runs)
good_lines = []
for line in lines:
if line.startswith("|") or line.startswith("1 2 3 4") or line.startswith("12345"):
continue
good_lines.append(line)
doc["text"] = "\n".join(good_lines)
# text too small
if "text" not in doc or len(doc["text"]) < MIN_TEXT_SIZE:
doc = None
return doc
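
The filter added in this hunk strips table-like lines before the minimum-size check. A minimal sketch of the same heuristic pulled out of extract_page (the prefix list is the one from this commit; the function name filter_table_lines and the min_size parameter standing in for MIN_TEXT_SIZE are illustrative only):

def filter_table_lines(text, min_size):
    # drop lines that look like markdown tables ("|") or pagination digit runs
    good_lines = []
    for line in text.split("\n"):
        if line.startswith("|") or line.startswith("1 2 3 4") or line.startswith("12345"):
            continue
        good_lines.append(line)
    text = "\n".join(good_lines)
    # mirror the size check that follows: too little text means the page is dropped
    return text if len(text) >= min_size else None
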
@@ -642,12 +649,13 @@ def parseurl(link):
print(rules.crawl_delay("*"))
html = trafilatura.fetch_url(link,decode=True)
get_bs_links(link,html)
doc = trafilatura.bare_extraction(html)
import pprint
pprint.pprint(doc)
internal_links, external_links = get_bs_links(link,html)
print(internal_links)
print(external_links)
doc = extract_page(link,html)
if doc is not None:
import pprint
pprint.pprint(doc)
internal_links, external_links = get_bs_links(link,html)
print(internal_links)
print(external_links)
@@ -719,6 +727,21 @@ def visit(hostname,filter_content=True):
def crawl_summary():
myclient = pymongo.MongoClient(CONNECTION)
db=myclient[DBNAME]
contentcol = db["content"]
res = contentcol.aggregate([
{"$group":{"_id":None,"total_text_size":{"$sum":"$text_size"}}}
])
print(">>>>> Total text size in content")
for item in res:
print(item)
linkscol = db["links"]
# find counts of link statuses
res = linkscol.aggregate([
{"$group":{"_id":"$status","count":{"$sum":1}}}
])
print(">>>>> Link status counts")
for item in res:
print(item["_id"],item["count"])
batchcol = db["batches"]
yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
print(yesterday)
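
The aggregations added in this hunk compute the totals on the MongoDB server. A client-side equivalent of the link-status histogram, shown only to make the $group stage explicit (noticeably slower on a large links collection; the concrete status values depend on what the crawler stores and are not assumed here):

from collections import Counter

# linkscol is the links collection opened a few lines above
status_counts = Counter(doc["status"] for doc in linkscol.find({}, projection=["status"]))
for status, count in status_counts.items():
    print(status, count)
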
@@ -732,6 +755,7 @@ def crawl_summary():
}
},
{"$sort":{"original_text_size":-1}},
{"$limit":100},
])
print(">>>> Batches")
headers = ["_id","document_count","good_document_count","batch_size","original_text_size"]
@@ -739,29 +763,41 @@ def crawl_summary():
for item in res:
values = [str(item[x]) for x in headers]
print("\t".join(values))
contentcol = db["content"]
res = contentcol.aggregate([
{"$group":{"_id":None,total_text_size:{"$sum":"$text_size"}}}
])
print(">>>>> Total text size in content")
for item in res:
print(res)
def extr(hdoc):
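# worker for the multiprocessing Pool in import_html: decode the quoted-printable HTML and run extract_page on it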
url = hdoc["url"]
html = binascii.a2b_qp(hdoc["quoted_html"])
doc = extract_page(url,html)
return doc
def import_html():
myclient = pymongo.MongoClient(CONNECTION)
db=myclient[DBNAME]
for l in sys.stdin:
contentcol = db["content"]
buffer = []
counter = 0
for i,l in enumerate(sys.stdin):
hdoc = json.loads(l)
url = hdoc["url"]
# BeautifulSoup is used to unify the encoding
html = BeautifulSoup(binascii.a2b_qp(hdoc["quoted_html"])).prettify()
doc = extract_page(url,html)
if doc is not None:
print("------=====-")
print(doc)
status = index_page(db,url,url,html,doc)
print(status)
r = contentcol.find_one({"url":url},projection=["_id"])
if r is not None:
print(">>>>" + str(i) + " copy: " + url)
continue
buffer.append(hdoc)
if len(buffer) < 128:
continue
from multiprocessing import Pool
with Pool(8) as p:
outs = p.map(extr,buffer)
for hdoc,doc in zip(buffer,outs):
if doc is None:
print("bad html" + hdoc["url"])
continue
status = index_page(db,hdoc["url"],hdoc["url"],hdoc["quoted_html"],doc)
counter += 1
print( ">>> " + str(counter) + " " + hdoc["url"] + " " + status)
del buffer[:]
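
import_html now skips URLs that are already in the content collection, buffers 128 records from stdin and extracts them with a pool of 8 worker processes. A stripped-down sketch of that buffering pattern, with the pool created once up front rather than inside the loop and with a final flush for a partially filled buffer (neither of which is in the diff as written); extr and index_page are the functions from this file, db the open database handle:

import json
import sys
from multiprocessing import Pool

BATCH_SIZE = 128   # buffer size used in this commit
POOL_SIZE = 8      # worker process count used in this commit

def flush(pool, db, buffer):
    # extract the whole batch in parallel, then index the documents that survived extraction
    for hdoc, doc in zip(buffer, pool.map(extr, buffer)):
        if doc is None:
            print("bad html " + hdoc["url"])
            continue
        index_page(db, hdoc["url"], hdoc["url"], hdoc["quoted_html"], doc)
    del buffer[:]

with Pool(POOL_SIZE) as pool:
    buffer = []
    for line in sys.stdin:
        buffer.append(json.loads(line))
        if len(buffer) >= BATCH_SIZE:
            flush(pool, db, buffer)
    if buffer:
        flush(pool, db, buffer)
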
def sample_domains():
myclient = pymongo.MongoClient(CONNECTION)