zz
This commit is contained in:
parent
3413b1a190
commit
b8850819b9
@ -175,7 +175,7 @@ def fetch_page(link:str)->(str,str):
|
|||||||
good = True
|
good = True
|
||||||
if response.status != 200:
|
if response.status != 200:
|
||||||
good = False
|
good = False
|
||||||
LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
|
LOGGER.error('not a 200 response: %s for URL %s', response.status, link)
|
||||||
elif response.data is None or len(response.data) < MIN_FILE_SIZE:
|
elif response.data is None or len(response.data) < MIN_FILE_SIZE:
|
||||||
LOGGER.error('too small/incorrect for URL %s', link)
|
LOGGER.error('too small/incorrect for URL %s', link)
|
||||||
good = False
|
good = False
|
||||||
@ -278,7 +278,7 @@ def index_page(db,original_link:str,final_link:str,html:bytes,doc,filter_content
|
|||||||
htdoc = get_link_doc(link,state)
|
htdoc = get_link_doc(link,state)
|
||||||
htdoc["html"] = html
|
htdoc["html"] = html
|
||||||
htdoc["html_size"] = len(html)
|
htdoc["html_size"] = len(html)
|
||||||
htdoc["html_md5"]= hashlib.md5(html).hexdigest()
|
htdoc["html_md5"]= hashlib.md5(html.encode("utf8")).hexdigest()
|
||||||
# can be revisited - upsert
|
# can be revisited - upsert
|
||||||
del htdoc["url"]
|
del htdoc["url"]
|
||||||
htmlcol.update_one({"url":link},{"$set":htdoc},upsert=True)
|
htmlcol.update_one({"url":link},{"$set":htdoc},upsert=True)
|
||||||
@ -682,6 +682,16 @@ def classify(start_link):
|
|||||||
cl.train(trainset)
|
cl.train(trainset)
|
||||||
cl.test(testset)
|
cl.test(testset)
|
||||||
|
|
||||||
|
def index_pages(hostname,extracted_pages):
|
||||||
|
final_states = []
|
||||||
|
docs = []
|
||||||
|
for original_link,final_link,html,doc in extracted_pages:
|
||||||
|
status = index_page(db,original_link,final_link,html,doc,filter_content)
|
||||||
|
final_states.append(status)
|
||||||
|
docs.append(doc)
|
||||||
|
save_batch_info(db,hostname,final_states,docs)
|
||||||
|
|
||||||
|
|
||||||
def visit(hostname,filter_content=True):
|
def visit(hostname,filter_content=True):
|
||||||
myclient = pymongo.MongoClient(CONNECTION)
|
myclient = pymongo.MongoClient(CONNECTION)
|
||||||
db=myclient[DBNAME]
|
db=myclient[DBNAME]
|
||||||
@ -712,6 +722,7 @@ def visit(hostname,filter_content=True):
|
|||||||
extracted_pages.append((original_link,final_link,html,doc))
|
extracted_pages.append((original_link,final_link,html,doc))
|
||||||
|
|
||||||
extracted_links = extract_links(links,responses,hostname,rules,"frontlink")
|
extracted_links = extract_links(links,responses,hostname,rules,"frontlink")
|
||||||
|
|
||||||
index_links(db,extracted_links)
|
index_links(db,extracted_links)
|
||||||
final_states = []
|
final_states = []
|
||||||
docs = []
|
docs = []
|
||||||
|
Loading…
Reference in New Issue
Block a user