commit 3bdac3642b
Author: Daniel Hládek
Date: 2024-03-06 18:44:12 +01:00

@@ -167,14 +167,14 @@ def fetch_page(link:str)->(str,str):
     print("fetching:::::")
     print(link)
     final_link = link
-    response = trafilatura.fetch_url(link,decode=False)
+    response = trafilatura.fetch_response(link,decode=False)
     time.sleep(2)
     html = None
     if response is not None :
         good = True
         if response.status != 200:
             good = False
-            LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
+            LOGGER.error('not a 200 response: %s for URL %s', response.status, link)
         elif response.data is None or len(response.data) < MIN_FILE_SIZE:
             LOGGER.error('too small/incorrect for URL %s', link)
             good = False
@@ -183,7 +183,7 @@ def fetch_page(link:str)->(str,str):
             good = False
             LOGGER.error('too large: length %s for URL %s', len(response.data), link)
         if good:
-            html = trafilatura.utils.decode_response(response)
+            html = trafilatura.utils.decode_file(response.data)
             if html is not None:
                 html, final_link = trafilatura.spider.refresh_detection(html, final_link)
                 # is there a meta-refresh on the page?
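The fetch path now asks trafilatura for a raw response object and decodes the body in a separate step. A minimal standalone sketch of that flow, relying only on the calls the patched code itself uses (fetch_response, decode_file); the size limits below are illustrative placeholders, not the crawler's configured MIN_FILE_SIZE/MAX_FILE_SIZE:

import trafilatura
from trafilatura.utils import decode_file

MIN_FILE_SIZE = 300          # placeholder lower bound on the body size
MAX_FILE_SIZE = 10_000_000   # placeholder upper bound on the body size

def fetch_html(link: str):
    # fetch raw bytes without decoding, as in fetch_response(link, decode=False)
    response = trafilatura.fetch_response(link, decode=False)
    if response is None or response.status != 200:
        return None
    body = response.data
    if body is None or not (MIN_FILE_SIZE <= len(body) <= MAX_FILE_SIZE):
        return None
    # detect the encoding and decode to text, as the patched code does with decode_file
    return decode_file(body)

html = fetch_html("https://example.com")
print(html is not None)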
@@ -241,21 +241,21 @@ def set_content_checksums(doc):
         sentences += 1
     doc["sentences_count"] = sentences

-def index_page(db,original_link:str,final_link:str,html:bytes,doc,filter_content=True):
+def index_page(db,original_link:str,html:bytes,doc,filter_content=True):
     linkcol = db["links"]
     htmlcol = db["html"]
     contentcol = db["content"]
     checkcol = db["check"]
     state = "good"
     link = original_link
-    if original_link != final_link:
-        linkcol.update_one({"url":original_link},{"$set":{"status":"redirect"}})
-        link = final_link
     if html is None:
         state = "html_error"
     elif doc is None:
         state = "content_error"
     if doc is not None:
+        if original_link != doc["url"]:
+            linkcol.update_one({"url":original_link},{"$set":{"status":"redirect"}})
+            link = doc["url"]
         set_content_checksums(doc)
         tsz = doc["text_size"]
         psz = doc["paragraph_sizes_sum"]
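With final_link gone from the signature, index_page detects redirects by comparing the queued URL against the URL recorded in the extracted document. The same bookkeeping as a standalone helper (resolve_link is a hypothetical name; linkcol is the pymongo "links" collection and doc["url"] is assumed to hold the final URL set by the extractor):

def resolve_link(linkcol, original_link: str, doc) -> str:
    # keep indexing under the final URL, but mark the queued URL as a redirect
    link = original_link
    if doc is not None and original_link != doc["url"]:
        linkcol.update_one({"url": original_link}, {"$set": {"status": "redirect"}})
        link = doc["url"]
    return link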
@@ -277,7 +277,7 @@ def index_page(db,original_link:str,final_link:str,html:bytes,doc,filter_content
     htdoc = get_link_doc(link,state)
     htdoc["html"] = html
     htdoc["html_size"] = len(html)
-    htdoc["html_md5"]= hashlib.md5(html).hexdigest()
+    htdoc["html_md5"]= hashlib.md5(html.encode("utf8")).hexdigest()
     # can be revisited - upsert
     del htdoc["url"]
     htmlcol.update_one({"url":link},{"$set":htdoc},upsert=True)
@@ -317,7 +317,7 @@ def save_batch_info(db,host,states,docs):
     db["batches"].insert_one(batchdoc)

-def extract_links(link_batch:list,responses:list,hostname:str,rules,default_status="frontlink")->list:
+def extract_links(link_batch:list,responses:list,rules,default_status="frontlink")->list:
     links = {}
     badrobot = 0
     for original_link,(final_link,html) in zip(link_batch,responses):
@@ -482,7 +482,7 @@ def fetch_sitemap_links(start_link):

 def fetch_front_links(start_link,rules):
     start_link,hostname = courlan.check_url(start_link)
     response = fetch_page(start_link)
-    extracted_links = extract_links([start_link],[response],hostname,rules,"frontlink")
+    extracted_links = extract_links([start_link],[response],rules,"frontlink")
     print("Fetched {} frontlinks".format(len(extracted_links)))
     return extracted_links
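After the signature change, the frontlink pass only threads the courlan rules through; the hostname returned by courlan.check_url stays local to fetch_front_links. A small sketch of that call shape (fetch_page, extract_links and rules refer to the module's own definitions and are left commented out):

import courlan

# check_url returns a (cleaned_url, domain) pair for an acceptable URL, else None
checked = courlan.check_url("https://example.com/index.html")
if checked is not None:
    start_link, hostname = checked
    print(start_link, hostname)
    # response = fetch_page(start_link)
    # extracted_links = extract_links([start_link], [response], rules, "frontlink")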
@@ -682,6 +682,16 @@ def classify(start_link):
     cl.test(testset)

+def index_pages(db,hostname,extracted_pages,filter_content):
+    final_states = []
+    docs = []
+    for original_link,html,doc in extracted_pages:
+        status = index_page(db,original_link,html,doc,filter_content)
+        final_states.append(status)
+        docs.append(doc)
+    save_batch_info(db,hostname,final_states,docs)
+
 def visit(hostname,filter_content=True):
     myclient = pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
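The new index_pages helper expects (original_link, html, doc) triples, with the final URL carried inside doc["url"] rather than as an extra tuple element. A usage sketch with made-up values (db would be the pymongo database handle opened in visit, and a real doc would carry the full set of keys produced by extract_page):

# shape of one crawl batch as index_pages now consumes it
extracted_pages = [
    ("https://example.com/a", "<html><body>ok</body></html>", {"url": "https://example.com/a"}),
    ("https://example.com/b", None, None),  # fetch or extraction failed for this link
]
# index_pages(db, "example.com", extracted_pages, filter_content=True)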
@@ -703,14 +713,14 @@ def visit(hostname,filter_content=True):
     responses = []
     for link in links:
         responses.append(fetch_page(link))
     extracted_pages = []
     for original_link,(final_link,html) in zip(links,responses):
         doc = None
         assert original_link is not None
         doc = extract_page(final_link,html)
-        extracted_pages.append((original_link,final_link,html,doc))
+        extracted_pages.append((original_link,html,doc))
-<<<<<<< HEAD
     extracted_links = extract_links(links,responses,hostname,rules,"frontlink")
     index_links(db,extracted_links)
@@ -721,6 +731,9 @@ def visit(hostname,filter_content=True):
         final_states.append(status)
         docs.append(doc)
     save_batch_info(db,hostname,final_states,docs)
+    index_pages(db,hostname,extracted_pages,filter_content)
+    extracted_links = extract_links(links,responses,rules,"frontlink")
+    index_links(db, extracted_links)
     link_summary(db,hostname)

 def crawl_summary():
@@ -793,7 +806,7 @@ def import_html():
         if doc is None:
             print("bad html" + hdoc["url"])
             continue
-        status = index_page(db,hdoc["url"],hdoc["url"], binascii.a2b_qp(hdoc["quoted_html"]),doc)
+        status = index_page(db,hdoc["url"], binascii.a2b_qp(hdoc["quoted_html"]),doc)
         counter += 1
         print( ">>> " + str(counter) + " " + str(i) + " " + hdoc["url"] + " " + status)
         del buffer[:]
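In import_html, the stored quoted-printable HTML is decoded back to bytes and handed straight to the reduced index_page signature. A small sketch of that decoding step (the quoted string and the commented-out call are illustrative; db and doc come from the surrounding import loop):

import binascii

quoted_html = "=3Chtml=3E=3Cbody=3Ehello=3C/body=3E=3C/html=3E"
raw = binascii.a2b_qp(quoted_html)   # b'<html><body>hello</body></html>'
print(raw)
# status = index_page(db, "https://example.com/page", raw, doc, filter_content=True)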