commit 3bdac3642b
@@ -167,14 +167,14 @@ def fetch_page(link:str)->(str,str):
     print("fetching:::::")
     print(link)
     final_link = link
-    response = trafilatura.fetch_url(link,decode=False)
+    response = trafilatura.fetch_response(link,decode=False)
     time.sleep(2)
     html = None
     if response is not None :
         good = True
         if response.status != 200:
             good = False
-            LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
+            LOGGER.error('not a 200 response: %s for URL %s', response.status, link)
         elif response.data is None or len(response.data) < MIN_FILE_SIZE:
             LOGGER.error('too small/incorrect for URL %s', link)
             good = False
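
Note on the change above: fetch_url() is replaced with fetch_response(), which returns a response object exposing .status and .data (the raw payload bytes) rather than decoded text; the logging fix also swaps the undefined name url for link. A minimal sketch of the new fetch path, assuming a trafilatura version that provides fetch_response() and using an illustrative URL:

    import trafilatura

    # fetch_response() returns a response object (or None on failure);
    # decode=False keeps .data as raw bytes for later decoding.
    response = trafilatura.fetch_response("https://example.com", decode=False)
    if response is not None and response.status == 200:
        print(len(response.data))  # payload size in bytes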
@@ -183,7 +183,7 @@ def fetch_page(link:str)->(str,str):
             good = False
             LOGGER.error('too large: length %s for URL %s', len(response.data), link)
         if good:
-            html = trafilatura.utils.decode_response(response)
+            html = trafilatura.utils.decode_file(response.data)
             if html is not None:
                 html, final_link = trafilatura.spider.refresh_detection(html, final_link)
                 # is there a meta-refresh on the page?
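
Companion change: decoding moves from decode_response(), which took the whole response object, to trafilatura.utils.decode_file(), which takes the raw bytes and returns the decoded text. A hedged sketch, assuming response.data holds the fetched bytes:

    from trafilatura.utils import decode_file

    html = decode_file(response.data)  # best-effort str decoding of the payload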
@@ -241,21 +241,21 @@ def set_content_checksums(doc):
         sentences += 1
     doc["sentences_count"] = sentences

-def index_page(db,original_link:str,final_link:str,html:bytes,doc,filter_content=True):
+def index_page(db,original_link:str,html:bytes,doc,filter_content=True):
     linkcol = db["links"]
     htmlcol = db["html"]
     contentcol = db["content"]
     checkcol = db["check"]
     state = "good"
     link = original_link
-    if original_link != final_link:
-        linkcol.update_one({"url":original_link},{"$set":{"status":"redirect"}})
-        link = final_link
     if html is None:
         state = "html_error"
     elif doc is None:
         state = "content_error"
     if doc is not None:
+        if original_link != doc["url"]:
+            linkcol.update_one({"url":original_link},{"$set":{"status":"redirect"}})
+            link = doc["url"]
         set_content_checksums(doc)
         tsz = doc["text_size"]
         psz = doc["paragraph_sizes_sum"]
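
What the signature change means: index_page() no longer receives a separately threaded final_link; a redirect is instead detected by comparing the queued link against the URL the extractor recorded in the document. A small illustration with hypothetical values:

    # doc["url"] is filled in by the extractor; if it differs from the
    # queued link, the original link is marked as a redirect in MongoDB.
    doc = {"url": "https://example.com/final"}
    original_link = "https://example.com/old"
    if original_link != doc["url"]:
        link = doc["url"]  # index under the URL the fetch actually landed on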
@@ -277,7 +277,7 @@ def index_page(db,original_link:str,final_link:str,html:bytes,doc,filter_content
     htdoc = get_link_doc(link,state)
     htdoc["html"] = html
     htdoc["html_size"] = len(html)
-    htdoc["html_md5"]= hashlib.md5(html).hexdigest()
+    htdoc["html_md5"]= hashlib.md5(html.encode("utf8")).hexdigest()
     # can be revisited - upsert
     del htdoc["url"]
     htmlcol.update_one({"url":link},{"$set":htdoc},upsert=True)
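
The added .encode("utf8") follows from the decode_file() change: html now arrives as a str, and hashlib.md5() accepts only bytes-like input, so the text must be encoded before hashing. For example:

    import hashlib

    html = "<html><body>sample</body></html>"
    digest = hashlib.md5(html.encode("utf8")).hexdigest()  # stable hex checksum

(The html:bytes annotation kept in the new index_page() signature is now out of step with this, though Python does not enforce annotations.)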
@@ -317,7 +317,7 @@ def save_batch_info(db,host,states,docs):
     db["batches"].insert_one(batchdoc)


-def extract_links(link_batch:list,responses:list,hostname:str,rules,default_status="frontlink")->list:
+def extract_links(link_batch:list,responses:list,rules,default_status="frontlink")->list:
     links = {}
     badrobot = 0
     for original_link,(final_link,html) in zip(link_batch,responses):
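
Call-shape note: extract_links() drops its hostname parameter, so callers now pass only the link batch, the matching responses, the robots rules, and a default status. Illustrative values:

    # responses pairs each queued link with its (final_link, html) result
    extracted = extract_links([start_link], [response], rules, "frontlink")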
@@ -482,7 +482,7 @@ def fetch_sitemap_links(start_link):
 def fetch_front_links(start_link,rules):
     start_link,hostname = courlan.check_url(start_link)
     response = fetch_page(start_link)
-    extracted_links = extract_links([start_link],[response],hostname,rules,"frontlink")
+    extracted_links = extract_links([start_link],[response],rules,"frontlink")
     print("Fetched {} frontlinks".format(len(extracted_links)))
     return extracted_links

@@ -682,6 +682,16 @@ def classify(start_link):
     cl.test(testset)


+def index_pages(db,hostname,extracted_pages,filter_content):
+    final_states = []
+    docs = []
+    for original_link,html,doc in extracted_pages:
+        status = index_page(db,original_link,html,doc,filter_content)
+        final_states.append(status)
+        docs.append(doc)
+    save_batch_info(db,hostname,final_states,docs)
+
+
 def visit(hostname,filter_content=True):
     myclient = pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
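
The new index_pages() helper factors the per-page indexing loop out of visit(): it indexes each (original_link, html, doc) triple and then records batch statistics via save_batch_info(). A usage sketch with hypothetical values:

    # db is an open pymongo database handle; extracted_pages items are
    # (original_link, html, doc) triples after this commit.
    index_pages(db, "example.com", extracted_pages, filter_content=True)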
@@ -703,14 +713,14 @@ def visit(hostname,filter_content=True):
     responses = []
     for link in links:
         responses.append(fetch_page(link))

     extracted_pages = []
     for original_link,(final_link,html) in zip(links,responses):
         doc = None
         assert original_link is not None
         doc = extract_page(final_link,html)
-        extracted_pages.append((original_link,final_link,html,doc))
+        extracted_pages.append((original_link,html,doc))

+<<<<<<< HEAD
     extracted_links = extract_links(links,responses,hostname,rules,"frontlink")

     index_links(db,extracted_links)
@@ -721,6 +731,9 @@ def visit(hostname,filter_content=True):
         final_states.append(status)
         docs.append(doc)
     save_batch_info(db,hostname,final_states,docs)
+    index_pages(db,hostname,extracted_pages,filter_content)
+    extracted_links = extract_links(links,responses,rules,"frontlink")
+    index_links(db, extracted_links)
     link_summary(db,hostname)

 def crawl_summary():
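
Taken together, the tail of visit() now delegates to the new helpers: index the extracted pages, harvest fresh frontlinks with the slimmer extract_links() signature, store them, and print the per-host summary:

    index_pages(db, hostname, extracted_pages, filter_content)
    extracted_links = extract_links(links, responses, rules, "frontlink")
    index_links(db, extracted_links)
    link_summary(db, hostname)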
@@ -793,7 +806,7 @@ def import_html():
         if doc is None:
             print("bad html" + hdoc["url"])
             continue
-        status = index_page(db,hdoc["url"],hdoc["url"], binascii.a2b_qp(hdoc["quoted_html"]),doc)
+        status = index_page(db,hdoc["url"], binascii.a2b_qp(hdoc["quoted_html"]),doc)
         counter += 1
         print( ">>> " + str(counter) + " " + str(i) + " " + hdoc["url"] + " " + status)
         del buffer[:]
|