zz
This commit is contained in:
parent
3413b1a190
commit
b8850819b9
@ -175,7 +175,7 @@ def fetch_page(link:str)->(str,str):
|
||||
good = True
|
||||
if response.status != 200:
|
||||
good = False
|
||||
LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
|
||||
LOGGER.error('not a 200 response: %s for URL %s', response.status, link)
|
||||
elif response.data is None or len(response.data) < MIN_FILE_SIZE:
|
||||
LOGGER.error('too small/incorrect for URL %s', link)
|
||||
good = False
|
||||
@ -278,7 +278,7 @@ def index_page(db,original_link:str,final_link:str,html:bytes,doc,filter_content
|
||||
htdoc = get_link_doc(link,state)
|
||||
htdoc["html"] = html
|
||||
htdoc["html_size"] = len(html)
|
||||
htdoc["html_md5"]= hashlib.md5(html).hexdigest()
|
||||
htdoc["html_md5"]= hashlib.md5(html.encode("utf8")).hexdigest()
|
||||
# can be revisited - upsert
|
||||
del htdoc["url"]
|
||||
htmlcol.update_one({"url":link},{"$set":htdoc},upsert=True)
|
||||
@ -682,6 +682,16 @@ def classify(start_link):
|
||||
cl.train(trainset)
|
||||
cl.test(testset)
|
||||
|
||||
def index_pages(hostname, extracted_pages, db=None, filter_content=True):
    """Index a batch of extracted pages and record the batch summary.

    Each entry of ``extracted_pages`` is passed to ``index_page``; the
    returned statuses and the docs are collected and handed to
    ``save_batch_info`` once the whole batch has been processed.

    Args:
        hostname: Forwarded unchanged to ``save_batch_info``.
        extracted_pages: Iterable of
            ``(original_link, final_link, html, doc)`` tuples.
        db: Database handle given to ``index_page`` and ``save_batch_info``.
            When omitted, a connection is opened the same way ``visit`` does
            (``pymongo.MongoClient(CONNECTION)[DBNAME]``).
        filter_content: Forwarded to ``index_page``; defaults to ``True``,
            matching the default of ``visit``.
    """
    # The original body read ``db`` and ``filter_content`` as free names even
    # though they are not parameters here; they are now explicit arguments
    # with backward-compatible defaults so two-argument calls keep working.
    if db is None:
        db = pymongo.MongoClient(CONNECTION)[DBNAME]

    final_states = []
    docs = []
    for original_link, final_link, html, doc in extracted_pages:
        status = index_page(db, original_link, final_link, html, doc, filter_content)
        final_states.append(status)
        docs.append(doc)
    save_batch_info(db, hostname, final_states, docs)
|
||||
|
||||
|
||||
def visit(hostname,filter_content=True):
|
||||
myclient = pymongo.MongoClient(CONNECTION)
|
||||
db=myclient[DBNAME]
|
||||
@ -712,6 +722,7 @@ def visit(hostname,filter_content=True):
|
||||
extracted_pages.append((original_link,final_link,html,doc))
|
||||
|
||||
extracted_links = extract_links(links,responses,hostname,rules,"frontlink")
|
||||
|
||||
index_links(db,extracted_links)
|
||||
final_states = []
|
||||
docs = []
|
||||
|
Loading…
Reference in New Issue
Block a user