Daniel Hládek 2023-04-13 16:11:19 +02:00
parent 37a115fb94
commit 8e8d4b9625
2 changed files with 104 additions and 86 deletions

Changed file 1 of 2:

@@ -1,6 +1,8 @@
 import click
 import mongocrawler
 import rq
+import redis
+import sys
 import os

 REDIS_URL= os.getenv("REDIS_URL","redis://localhost:6379/")
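
The new redis and sys imports suggest the CLI now talks to the RQ queue directly over the configured REDIS_URL. A minimal sketch of how that wiring typically looks; the queue name "crawler" and the choice to enqueue mongocrawler.visit are assumptions for illustration, not taken from this diff:

import os
import redis
import rq
import mongocrawler

REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/")

# open a connection from the configured URL and bind an RQ queue to it
connection = redis.from_url(REDIS_URL)
queue = rq.Queue("crawler", connection=connection)  # queue name is hypothetical

# enqueue one crawl job; visit(hostname) exists in mongocrawler per the second file below
job = queue.enqueue(mongocrawler.visit, "example.com")
print(job.id)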

Changed file 2 of 2:

@@ -154,21 +154,16 @@ def fetch_robot(base_url:str)->urllib.robotparser.RobotFileParser:
         # exceptions happening here
     return rules

-def extract_pages(link_batch:list,responses:list)->list:
-    out = []
-    for original_link,(final_link,html) in zip(link_batch,responses):
-        doc = None
-        assert original_link is not None
-        if html is not None:
-            doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=False,target_language=LANGUAGE,favor_precision=True)
-        if doc is not None:
-            if not "text" in doc or len(doc["text"]) < MINTEXTSIZE:
-                # text too small
-                doc = None
-        out.append((original_link,final_link,html,doc))
-    return out
+def extract_page(final_link,html):
+    doc = None
+    if html is not None:
+        doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=False,target_language=LANGUAGE,favor_precision=True)
+    if doc is not None:
+        if not "text" in doc or len(doc["text"]) < MINTEXTSIZE:
+            # text too small
+            doc = None
+    return doc

 def set_content_checksums(doc):
     text = doc["text"]
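
The new extract_page works on one page at a time instead of a whole batch. A minimal usage sketch; the URL is arbitrary, and it assumes fetch_page returns a (final_link, html) pair, as it does in visit() further down:

final_link, html = fetch_page("https://example.org/article")
doc = extract_page(final_link, html)
if doc is None:
    print("no usable content")
else:
    # bare_extraction returns a plain dict; "text" is guaranteed to be at least MINTEXTSIZE here
    print(doc.get("title"), len(doc["text"]))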
@@ -186,85 +181,82 @@ def set_content_checksums(doc):
             sentences += 1
     doc["sentences_count"] = sentences

-def index_pages(db,hostname,extracted_pages):
-    linkcol = db["links"]
-    htmlcol = db["html"]
-    contentcol = db["content"]
-    checkcol = db["check"]
-    links = []
-    # stats of the batch
-    good_document_count = 0
-    document_count = 0
-    text_size = 0
-    good_text_size = 0
-    original_text_size = 0
-    for original_link,final_link,html,doc in extracted_pages:
-        document_count += 1
-        state = "good"
-        link = original_link
-        if original_link != final_link:
-            print(original_link,final_link)
-            linkcol.update_one({"url":original_link},{"$set":{"status":"redirect"}})
-            link = final_link
-        if html is None:
-            state = "html_error"
-        elif doc is None:
-            state = "content_error"
-        if doc is not None:
-            set_content_checksums(doc)
-            tsz = doc["text_size"]
-            text_size += tsz
-            psz = doc["paragraph_sizes_sum"]
-            if tsz < TEXT_TRASH_SIZE or psz/tsz < TEXT_TRASH_RATIO:
-                state = "small"
-            # check copy
-            if state == "good":
-                origsz = 0
-                for chs,paragraph_size in zip(doc["paragraph_checksums"],doc["paragraph_sizes"]):
-                    # index paragraph checksums
-                    nd = checkcol.find_one({"_id":chs})
-                    if nd is None:
-                        origsz += paragraph_size
-                if (1 - (origsz / tsz)) > TEXT_TRASH_RATIO:
-                    state = "copy"
-                original_text_size += origsz
-                print(origsz)
-        if state == "good":
-            good_document_count += 1
-            good_text_size += doc["text_size"]
-            htdoc = get_link_doc(link,state)
-            htdoc["html"] = html
-            htdoc["html_size"] = len(html)
-            htdoc["html_md5"]= hashlib.md5(html.encode("utf8")).hexdigest()
-            # can be revisited - upsert
-            del htdoc["url"]
-            htmlcol.update_one({"url":link},{"$set":htdoc},upsert=True)
-            doc.update(get_link_doc(link,"good"))
-            # todo extract links
-            print(link,doc)
-            del doc["url"]
-            contentcol.update_one({"url":link},{"$set":doc},upsert=True)
-            for chs in doc["paragraph_checksums"]:
-                try:
-                    checkcol.insert_one({"_id":chs})
-                except pymongo.errors.DuplicateKeyError as err:
-                    pass
-        linkdoc = get_link_doc(link,state)
-        del linkdoc["url"]
-        linkcol.update_one({"url":link},{"$set":linkdoc})
-    batchdoc = {
-        "host": linkdoc["host"],
-        "domain": linkdoc["domain"],
-        "created_at": dat.utcnow(),
-        "good_document_count":good_document_count,
-        "document_count":document_count,
-        "text_size":text_size,
-        "good_text_size":good_text_size,
-        "original_text_size":original_text_size,
-        "batch_size": BATCHSIZE,
-        "average_fetch_characters": text_size / BATCHSIZE,
-    }
-    db["batches"].insert_one(batchdoc)
-    print(batchdoc)
+def index_page(db,original_link,final_link,html,doc):
+    linkcol = db["links"]
+    htmlcol = db["html"]
+    contentcol = db["content"]
+    checkcol = db["check"]
+    state = "good"
+    link = original_link
+    if original_link != final_link:
+        print(original_link,final_link)
+        linkcol.update_one({"url":original_link},{"$set":{"status":"redirect"}})
+        link = final_link
+    if html is None:
+        state = "html_error"
+    elif doc is None:
+        state = "content_error"
+    if doc is not None:
+        set_content_checksums(doc)
+        tsz = doc["text_size"]
+        psz = doc["paragraph_sizes_sum"]
+        if tsz < TEXT_TRASH_SIZE or psz/tsz < TEXT_TRASH_RATIO:
+            state = "small"
+        # check copy
+        if state == "good":
+            origsz = 0
+            for chs,paragraph_size in zip(doc["paragraph_checksums"],doc["paragraph_sizes"]):
+                # index paragraph checksums
+                nd = checkcol.find_one({"_id":chs})
+                if nd is None:
+                    origsz += paragraph_size
+            doc["original_text_size"] = origsz
+            if (1 - (origsz / tsz)) > TEXT_TRASH_RATIO:
+                state = "copy"
+            print(origsz)
+    if state == "good":
+        htdoc = get_link_doc(link,state)
+        htdoc["html"] = html
+        htdoc["html_size"] = len(html)
+        htdoc["html_md5"]= hashlib.md5(html.encode("utf8")).hexdigest()
+        # can be revisited - upsert
+        del htdoc["url"]
+        htmlcol.update_one({"url":link},{"$set":htdoc},upsert=True)
+        doc.update(get_link_doc(link,"good"))
+        # todo extract links
+        print(link,doc)
+        del doc["url"]
+        contentcol.update_one({"url":link},{"$set":doc},upsert=True)
+        for chs in doc["paragraph_checksums"]:
+            try:
+                checkcol.insert_one({"_id":chs})
+            except pymongo.errors.DuplicateKeyError as err:
+                pass
+    linkdoc = get_link_doc(link,state)
+    del linkdoc["url"]
+    linkcol.update_one({"url":link},{"$set":linkdoc})
+    return state
+
+def save_batch_info(db,host,states,docs):
+    good_document_count = 0
+    original_text_size = 0
+    batch_size = 0
+    _,domain = courlan.get_hostinfo(host)
+    for state,doc in zip(states,docs):
+        batch_size += 1
+        if state == "good":
+            good_document_count += 1
+            original_text_size += doc["original_text_size"]
+    batchdoc = {
+        "host": host,
+        "domain": domain,
+        "created_at": dat.utcnow(),
+        "good_document_count":good_document_count,
+        "original_text_size":original_text_size,
+        "good_prob": good_document_count / batch_size,
+        "batch_size": batch_size,
+    }
+    db["batches"].insert_one(batchdoc)
+    print(batchdoc)
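
The copy check in index_page above compares how much of a page's text consists of paragraphs whose checksums have not been seen before. A small worked sketch of that decision with invented sizes and an assumed TEXT_TRASH_RATIO of 0.6 (the real constant is defined elsewhere in the module and not shown in this diff):

TEXT_TRASH_RATIO = 0.6        # assumed value, for illustration only

tsz = 5000                    # total text size of the page
paragraph_sizes = [1200, 800, 3000]
already_seen = [True, True, False]   # True = checksum already present in the "check" collection

# origsz counts only paragraphs whose checksum was not found, as in index_page
origsz = sum(size for size, dup in zip(paragraph_sizes, already_seen) if not dup)   # 3000

duplicate_share = 1 - (origsz / tsz)                         # 0.4
state = "copy" if duplicate_share > TEXT_TRASH_RATIO else "good"
print(origsz, duplicate_share, state)                        # 3000 0.4 good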
@@ -699,10 +691,23 @@ def visit(hostname):
     responses = []
     for link in links:
         responses.append(fetch_page(link))
-    extracted_pages = extract_pages(links,responses)
+    extracted_pages = []
+    for original_link,(final_link,html) in zip(links,responses):
+        doc = None
+        assert original_link is not None
+        doc = extract_page(final_link,html)
+        extracted_pages.append((original_link,final_link,html,doc))
     extracted_links = extract_links(links,responses,hostname,rules,"frontlink")
     index_links(db,extracted_links)
-    index_pages(db,hostname,extracted_pages)
+    final_states = []
+    docs = []
+    for original_link,final_link,html,doc in extracted_pages:
+        status = index_page(db,original_link,final_link,html,doc)
+        final_states.append(status)
+        docs.append(doc)
+    save_batch_info(db,hostname,final_states,docs)
     link_summary(db,hostname)

 def crawl_summary():
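
The hunk above collects one state per page and hands the lists to save_batch_info, which reduces them to a single batch document. A sketch of that reduction with invented states and sizes (database access and courlan.get_hostinfo omitted):

states = ["good", "small", "good", "html_error", "copy"]
docs = [{"original_text_size": 1800}, None, {"original_text_size": 2400}, None, {"original_text_size": 0}]

good_document_count = sum(1 for s in states if s == "good")                                      # 2
original_text_size = sum(d["original_text_size"] for s, d in zip(states, docs) if s == "good")   # 4200
batch_size = len(states)                                                                         # 5
good_prob = good_document_count / batch_size                                                     # 0.4
print(good_document_count, original_text_size, good_prob)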
@@ -719,17 +724,28 @@ def crawl_summary():
         "batch_count":{"$sum":"$batch_size"},
         "text_size":{"$sum":"$text_size"},
         "original_text_size":{"$sum":"$original_text_size"},
-        "count":{"$sum":1},
         }
        },
+       {"$sort":{"original_text_size":-1}},
     ])
     print(">>>> Batches")
-    headers = ["_id","document_count","good_document_count","count","batch_count","text_size","original_text_size"]
+    headers = ["_id","document_count","good_document_count","batch_count","text_size","original_text_size"]
     print("\t".join(headers))
     for item in res:
         values = [str(item[x]) for x in headers]
         print("\t".join(values))
+
+import binascii
+
+def import_html():
+    myclient = pymongo.MongoClient(CONNECTION)
+    db = myclient[DBNAME]
+    for l in sys.stdin:
+        hdoc = json.loads(l)
+        url = hdoc["url"]
+        # "quoted_html" carries quoted-printable encoded HTML; decode it before parsing
+        html = bs4.BeautifulSoup(binascii.a2b_qp(hdoc["quoted_html"])).prettify()
+        doc = extract_page(url,html)
+        index_page(db,url,url,html,doc)
 def sample_domains():
     myclient = pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
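
import_html above reads one JSON object per line from stdin, each carrying the page URL and the quoted-printable encoded HTML. A sketch of how such a line might be produced for testing; the field names match the reader, everything else is invented:

import json, binascii

raw_html = "<html><body><p>Sample article text</p></body></html>"
line = json.dumps({
    "url": "https://example.org/article",
    "quoted_html": binascii.b2a_qp(raw_html.encode("utf8")).decode("ascii"),
})
print(line)   # lines like this can be piped into import_html() via stdin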