zz

2024-03-06 18:44:12 +01:00 · 2024-03-06 18:44:12 +01:00 · 3bdac3642b
commit 3bdac3642b
parent 87f84b8eb8 c06348080a
1 changed file with 26 additions and 13 deletions
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@@ -167,14 +167,14 @@ def fetch_page(link:str)->(str,str):
    print("fetching:::::")
    print(link)
    final_link = link
-    response = trafilatura.fetch_url(link,decode=False)
+    response = trafilatura.fetch_response(link,decode=False)
    time.sleep(2)
    html = None
    if response is not None :
        good = True
        if response.status != 200:
            good = False
-            LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
+            LOGGER.error('not a 200 response: %s for URL %s', response.status, link)
        elif response.data is None or len(response.data) < MIN_FILE_SIZE:
            LOGGER.error('too small/incorrect for URL %s', link)
            good = False
@@ -183,7 +183,7 @@ def fetch_page(link:str)->(str,str):
            good = False
            LOGGER.error('too large: length %s for URL %s', len(response.data), link)
        if good:
-            html = trafilatura.utils.decode_response(response) 
+            html = trafilatura.utils.decode_file(response.data) 
        if html is not None:
            html, final_link = trafilatura.spider.refresh_detection(html, final_link)
            # is there a meta-refresh on the page?
@@ -241,21 +241,21 @@ def set_content_checksums(doc):
            sentences += 1
    doc["sentences_count"] = sentences

-def index_page(db,original_link:str,final_link:str,html:bytes,doc,filter_content=True):
+def index_page(db,original_link:str,html:bytes,doc,filter_content=True):
    linkcol = db["links"]
    htmlcol = db["html"]
    contentcol = db["content"]
    checkcol = db["check"]
    state = "good"
    link = original_link
-    if original_link != final_link:
-        linkcol.update_one({"url":original_link},{"$set":{"status":"redirect"}})
-        link = final_link
    if html is None:
        state = "html_error"
    elif doc is None:
        state = "content_error"
    if doc is not None:
+        if original_link != doc["url"]:
+            linkcol.update_one({"url":original_link},{"$set":{"status":"redirect"}})
+            link = doc["url"]    
        set_content_checksums(doc)
        tsz = doc["text_size"]
        psz = doc["paragraph_sizes_sum"]
@@ -277,7 +277,7 @@ def index_page(db,original_link:str,final_link:str,html:bytes,doc,filter_content
        htdoc = get_link_doc(link,state)
        htdoc["html"] = html
        htdoc["html_size"] = len(html)
-        htdoc["html_md5"]= hashlib.md5(html).hexdigest()
+        htdoc["html_md5"]= hashlib.md5(html.encode("utf8")).hexdigest()
        # can be revisited - upsert
        del htdoc["url"]
        htmlcol.update_one({"url":link},{"$set":htdoc},upsert=True)
@@ -317,7 +317,7 @@ def save_batch_info(db,host,states,docs):
    db["batches"].insert_one(batchdoc)


-def extract_links(link_batch:list,responses:list,hostname:str,rules,default_status="frontlink")->list:
+def extract_links(link_batch:list,responses:list,rules,default_status="frontlink")->list:
    links = {}
    badrobot = 0
    for original_link,(final_link,html) in zip(link_batch,responses):
@@ -482,7 +482,7 @@ def fetch_sitemap_links(start_link):
 def fetch_front_links(start_link,rules):
    start_link,hostname = courlan.check_url(start_link)
    response = fetch_page(start_link)
-    extracted_links = extract_links([start_link],[response],hostname,rules,"frontlink")
+    extracted_links = extract_links([start_link],[response],rules,"frontlink")
    print("Fetched {} frontlinks".format(len(extracted_links)))
    return extracted_links

@@ -682,6 +682,16 @@ def classify(start_link):
    cl.test(testset)


+def index_pages(db,hostname,extracted_pages,filter_content):
+    final_states = []
+    docs = []
+    for original_link,html,doc in extracted_pages:
+        status = index_page(db,original_link,html,doc,filter_content)
+        final_states.append(status)
+        docs.append(doc)
+    save_batch_info(db,hostname,final_states,docs)
+ 
+
 def visit(hostname,filter_content=True):
    myclient = pymongo.MongoClient(CONNECTION)
    db=myclient[DBNAME]
@@ -703,14 +713,14 @@ def visit(hostname,filter_content=True):
    responses = []
    for link in links:
        responses.append(fetch_page(link))
-
    extracted_pages = []
    for original_link,(final_link,html) in zip(links,responses):
        doc = None
        assert original_link is not None
        doc = extract_page(final_link,html)
-        extracted_pages.append((original_link,final_link,html,doc))
+        extracted_pages.append((original_link,html,doc))

+<<<<<<< HEAD
    extracted_links = extract_links(links,responses,hostname,rules,"frontlink")

    index_links(db,extracted_links)
@@ -721,6 +731,9 @@ def visit(hostname,filter_content=True):
        final_states.append(status)
        docs.append(doc)
    save_batch_info(db,hostname,final_states,docs)
+    index_pages(db,hostname,extracted_pages,filter_content)
+    extracted_links = extract_links(links,responses,rules,"frontlink")
+    index_links(db, extracted_links)
    link_summary(db,hostname)

 def crawl_summary():
@@ -793,7 +806,7 @@ def import_html():
            if doc is None:
                print("bad html" + hdoc["url"])
                continue
-            status = index_page(db,hdoc["url"],hdoc["url"], binascii.a2b_qp(hdoc["quoted_html"]),doc)
+            status = index_page(db,hdoc["url"], binascii.a2b_qp(hdoc["quoted_html"]),doc)
            counter += 1
            print( ">>> " + str(counter) + " " + str(i) + " " + hdoc["url"] + " " + status)
        del buffer[:]