This commit is contained in:
Daniel Hládek 2024-03-06 12:11:31 +01:00
parent 3413b1a190
commit b8850819b9

View File

@ -175,7 +175,7 @@ def fetch_page(link:str)->(str,str):
good = True
if response.status != 200:
good = False
LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
LOGGER.error('not a 200 response: %s for URL %s', response.status, link)
elif response.data is None or len(response.data) < MIN_FILE_SIZE:
LOGGER.error('too small/incorrect for URL %s', link)
good = False
@ -278,7 +278,7 @@ def index_page(db,original_link:str,final_link:str,html:bytes,doc,filter_content
htdoc = get_link_doc(link,state)
htdoc["html"] = html
htdoc["html_size"] = len(html)
htdoc["html_md5"]= hashlib.md5(html).hexdigest()
htdoc["html_md5"]= hashlib.md5(html.encode("utf8")).hexdigest()
# can be revisited - upsert
del htdoc["url"]
htmlcol.update_one({"url":link},{"$set":htdoc},upsert=True)
@ -682,6 +682,16 @@ def classify(start_link):
cl.train(trainset)
cl.test(testset)
def index_pages(hostname,extracted_pages,db=None,filter_content=True):
    """Index a batch of extracted pages and save the batch summary.

    Each item of ``extracted_pages`` is passed to ``index_page``; the value
    it returns and the item's ``doc`` are collected and handed to
    ``save_batch_info`` together with ``hostname``.

    Args:
        hostname: Host the batch belongs to; forwarded to ``save_batch_info``.
        extracted_pages: Iterable of
            ``(original_link, final_link, html, doc)`` tuples. It is
            consumed exactly once, so a generator is fine.
        db: Database handle forwarded to ``index_page`` and
            ``save_batch_info``. When omitted, a connection is opened from
            ``CONNECTION`` / ``DBNAME``, the same way ``visit`` does.
        filter_content: Forwarded unchanged to ``index_page``. The default
            mirrors the default of ``visit``.
    """
    # NOTE(review): the previous body read ``db`` and ``filter_content`` as
    # free names, although they are not defined in this function; they are
    # explicit optional parameters now so the function works on its own.
    if db is None:
        db = pymongo.MongoClient(CONNECTION)[DBNAME]
    final_states = []
    docs = []
    # Single pass on purpose: index_page has side effects and
    # extracted_pages may be a one-shot iterator.
    for original_link,final_link,html,doc in extracted_pages:
        status = index_page(db,original_link,final_link,html,doc,filter_content)
        final_states.append(status)
        docs.append(doc)
    save_batch_info(db,hostname,final_states,docs)
def visit(hostname,filter_content=True):
myclient = pymongo.MongoClient(CONNECTION)
db=myclient[DBNAME]
@ -712,6 +722,7 @@ def visit(hostname,filter_content=True):
extracted_pages.append((original_link,final_link,html,doc))
extracted_links = extract_links(links,responses,hostname,rules,"frontlink")
index_links(db,extracted_links)
final_states = []
docs = []