zz

2023-03-05 18:53:14 +01:00 · 2023-03-05 18:53:14 +01:00 · 1e3f8dcba6
commit 1e3f8dcba6
parent 1752d5c776
1 changed file with 58 additions and 14 deletions
--- a/mongo/mongocwarler.py
+++ b/mongo/mongocwarler.py
@@ -1,23 +1,45 @@
 import pymongo
-
-
 import trafilatura
 import trafilatura.feeds
 import trafilatura.sitemaps
 import trafilatura.spider
 import sys

-def index_page(db,url,content,extracted_page):
-    htmlldb = db["html"]
-    htmldb.insert_one({"url":ulr,"content":content})
-    contentdb = db["content"]
-    contentdb.insert_one(extracted_page)
-    pass
+
+def calculate_checksums(self, text):
+    """
+    @return fingerprints of the paragraphs in text. Paragraphs are separated by a blank line
+    """
+    checksums = []
+    sizes = []
+    hval = 0
+    hsz = 0
+    sz = 0
+    for c in text:
+        cv = ord(c)
+        sz += 1
+        if cv > 64:
+            hval += (hval << 3) + cv
+            zv = hval >> 31
+            hval &= 0x7fffffff
+            hval += zv
+            hsz += 1
+        if c == "\n" and hsz > 0:
+            if hsz > 100:
+                checksums.append(hval)
+                sizes.append(sz)
+            sz = 0
+            hsz = 0
+    if hsz > 100:
+        checksums.append(hval)
+        sizes.append(sz)
+    return checksums, sizes
+

 def fetch_pages(link_batch):
    docs  = []
    for link in link_batch:
-        link_doc = {"url":link,"status": "unvisited"}
+        link_doc = {"url":link,"status": "html_error"}
        rr = trafilatura.fetch_url(page,decode=True)
        if rr is not None:
            link_doc["status"] = "html_ok"
@@ -39,14 +61,36 @@ def extract_pages(link_docs):
    return content, extracted_links

 def index_pages(pagedb,pages_list):
-    mycol = pagedb["content"]
+    contentcol = pagedb["content"]
    for page in page_list:
        # get paragraph checksums
-        checksums = get_checksums(page["text"])
-        page["checksums"] = checksums
-    x = mycol.insert_many(pages_list)
+        checksums,sizes = get_checksums(page["text"])
+        page["paragraph_checksums"] = checksums
+        page["paragraph_sizes"] = sizes
+    x = contentcol.insert_many(pages_list)
    page_hashes = []
-        pass
+
+def process_pages(db,fetched_pages):
+    content_pages, extracted_links = extract_pages(fetched_pages)
+    contentcol = db["content"]
+    contentcol.insertMany(content_pages)
+    linkcol = db["links"]
+    extracted = []
+    for link in extracted_links:
+        extracted.append({"url":link,"status":"backlink"})
+    # should fail if link already exists
+    linkcol.insertMany(extracted)
+
+    html_pages = []
+    for page in fetched_pages:
+        linkcol.updateOne({"url":page["url"]},{"$set":{"status":"visited"}})
+        if "html" in page:
+            html_pages.append({"url":page["url"],"html":page["html"],"update_time":datetime.now()})
+            del page["html"]
+    htmlcol = db["html"]
+    htmlcol.insertMany(html_pages)
+
+

 def get_visited_links(domain):
    return []