commit c06348080a
parent b8850819b9

    zz
@@ -168,7 +168,6 @@ def fetch_page(link:str)->(str,str):
     print(link)
     final_link = link
     response = trafilatura.fetch_response(link,decode=False)
-    print(response)
     time.sleep(2)
     html = None
     if response is not None :
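
Context for the fetch side: trafilatura.fetch_response returns a Response object whose .url reflects any redirects and whose .data holds the raw page bytes when decode=False. A minimal sketch, assuming a recent trafilatura release that exports fetch_response:

    import trafilatura

    # Fetch without decoding so the raw bytes can be stored verbatim.
    response = trafilatura.fetch_response("https://example.com", decode=False)
    if response is not None and response.status == 200:
        final_link = response.url   # URL after any redirects
        html = response.data        # raw page bytes
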
@@ -242,21 +241,21 @@ def set_content_checksums(doc):
         sentences += 1
     doc["sentences_count"] = sentences
 
-def index_page(db,original_link:str,final_link:str,html:bytes,doc,filter_content=True):
+def index_page(db,original_link:str,html:bytes,doc,filter_content=True):
     linkcol = db["links"]
     htmlcol = db["html"]
     contentcol = db["content"]
     checkcol = db["check"]
     state = "good"
     link = original_link
-    if original_link != final_link:
-        linkcol.update_one({"url":original_link},{"$set":{"status":"redirect"}})
-        link = final_link
     if html is None:
         state = "html_error"
     elif doc is None:
         state = "content_error"
     if doc is not None:
+        if original_link != doc["url"]:
+            linkcol.update_one({"url":original_link},{"$set":{"status":"redirect"}})
+            link = doc["url"]
         set_content_checksums(doc)
         tsz = doc["text_size"]
         psz = doc["paragraph_sizes_sum"]
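
The behavioral change in this hunk: a redirect is no longer detected by comparing original_link against a threaded-through final_link, but against doc["url"] (presumably the URL recorded at extraction time), and only when extraction succeeded. A minimal sketch of that rule in isolation; resolve_link is a hypothetical helper, not part of this file:

    from typing import Optional, Tuple

    def resolve_link(original_link: str, doc: Optional[dict]) -> Tuple[str, bool]:
        # Mirrors the new index_page logic: the document's own URL wins,
        # and a mismatch marks the original link as a redirect.
        if doc is not None and doc["url"] != original_link:
            return doc["url"], True
        return original_link, False

    assert resolve_link("http://e.com/a", {"url": "https://e.com/a"}) == ("https://e.com/a", True)
    assert resolve_link("https://e.com/a", None) == ("https://e.com/a", False)
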
@@ -318,7 +317,7 @@ def save_batch_info(db,host,states,docs):
     db["batches"].insert_one(batchdoc)
 
 
-def extract_links(link_batch:list,responses:list,hostname:str,rules,default_status="frontlink")->list:
+def extract_links(link_batch:list,responses:list,rules,default_status="frontlink")->list:
     links = {}
     badrobot = 0
     for original_link,(final_link,html) in zip(link_batch,responses):
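
Dropping the hostname parameter fits how the callers already work: courlan can derive the domain from the link itself, as fetch_front_links below does with courlan.check_url, which returns a cleaned (url, domain) pair or None for an unusable link. A quick sketch:

    import courlan

    checked = courlan.check_url("https://example.com/page?q=1")
    if checked is not None:
        url, hostname = checked
        print(url, hostname)  # normalized URL and its domain
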
@@ -483,7 +482,7 @@ def fetch_sitemap_links(start_link):
 def fetch_front_links(start_link,rules):
     start_link,hostname = courlan.check_url(start_link)
     response = fetch_page(start_link)
-    extracted_links = extract_links([start_link],[response],hostname,rules,"frontlink")
+    extracted_links = extract_links([start_link],[response],rules,"frontlink")
     print("Fetched {} frontlinks".format(len(extracted_links)))
     return extracted_links
 
@@ -682,11 +681,11 @@ def classify(start_link):
     cl.train(trainset)
     cl.test(testset)
 
-def index_pages(hostname,extracted_pages):
+def index_pages(db,hostname,extracted_pages,filter_content):
     final_states = []
     docs = []
-    for original_link,final_link,html,doc in extracted_pages:
-        status = index_page(db,original_link,final_link,html,doc,filter_content)
+    for original_link,html,doc in extracted_pages:
+        status = index_page(db,original_link,html,doc,filter_content)
         final_states.append(status)
         docs.append(doc)
     save_batch_info(db,hostname,final_states,docs)
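
Besides the smaller page tuples, this fixes a latent bug: the old body referenced db and filter_content, which the old signature never bound, so it could only have worked through globals. A sketch of the input shape the new index_pages expects; the URLs and field values are invented:

    # Each page is now a 3-tuple: (original_link, html, doc).
    extracted_pages = [
        ("https://example.com/ok", b"<html>...</html>",
         {"url": "https://example.com/ok", "text_size": 1400}),
        ("https://example.com/broken", None, None),  # fetch failed
    ]
    # With a pymongo database handle `db`:
    # index_pages(db, "example.com", extracted_pages, filter_content=True)
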
@@ -713,24 +712,16 @@ def visit(hostname,filter_content=True):
     responses = []
     for link in links:
         responses.append(fetch_page(link))
 
     extracted_pages = []
     for original_link,(final_link,html) in zip(links,responses):
         doc = None
         assert original_link is not None
         doc = extract_page(final_link,html)
-        extracted_pages.append((original_link,final_link,html,doc))
-
-    extracted_links = extract_links(links,responses,hostname,rules,"frontlink")
-
-    index_links(db,extracted_links)
-    final_states = []
-    docs = []
-    for original_link,final_link,html,doc in extracted_pages:
-        status = index_page(db,original_link,final_link,html,doc,filter_content)
-        final_states.append(status)
-        docs.append(doc)
-    save_batch_info(db,hostname,final_states,docs)
+        extracted_pages.append((original_link,html,doc))
+    index_pages(db,hostname,extracted_pages,filter_content)
+    extracted_links = extract_links(links,responses,rules,"frontlink")
+    index_links(db, extracted_links)
     link_summary(db,hostname)
 
 def crawl_summary():
@@ -803,7 +794,7 @@ def import_html():
         if doc is None:
             print("bad html" + hdoc["url"])
             continue
-        status = index_page(db,hdoc["url"],hdoc["url"], binascii.a2b_qp(hdoc["quoted_html"]),doc)
+        status = index_page(db,hdoc["url"], binascii.a2b_qp(hdoc["quoted_html"]),doc)
         counter += 1
         print( ">>> " + str(counter) + " " + str(i) + " " + hdoc["url"] + " " + status)
         del buffer[:]
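
import_html keeps passing quoted-printable-decoded bytes, just without the duplicated URL argument. For reference, the round trip that storage scheme implies (the sample markup is invented):

    import binascii

    raw = "<html><p>čučoriedka</p></html>".encode("utf-8")
    stored = binascii.b2a_qp(raw)            # as in hdoc["quoted_html"]
    assert binascii.a2b_qp(stored) == raw    # what index_page now receives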