diff --git a/mongo/mongocwarler.py b/mongo/mongocwarler.py
index 53c830a..4b49508 100644
--- a/mongo/mongocwarler.py
+++ b/mongo/mongocwarler.py
@@ -1,23 +1,46 @@
 import pymongo
-
-
 import trafilatura
 import trafilatura.feeds
 import trafilatura.sitemaps
 import trafilatura.spider
 import sys
+from datetime import datetime
 
-def index_page(db,url,content,extracted_page):
-    htmlldb = db["html"]
-    htmldb.insert_one({"url":ulr,"content":content})
-    contentdb = db["content"]
-    contentdb.insert_one(extracted_page)
-    pass
+
+def calculate_checksums(text):
+    """
+    @return fingerprints of paragraphs in the text. Paragraphs are separated by a blank line.
+    """
+    checksums = []
+    sizes = []
+    hval = 0
+    hsz = 0
+    sz = 0
+    for c in text:
+        cv = ord(c)
+        sz += 1
+        if cv > 64:
+            hval += (hval << 3) + cv
+            zv = hval >> 31
+            hval &= 0x7fffffff
+            hval += zv
+            hsz += 1
+        if c == "\n" and hsz > 0:
+            if hsz > 100:
+                checksums.append(hval)
+                sizes.append(sz)
+            sz = 0
+            hsz = 0
+    if hsz > 100:
+        checksums.append(hval)
+        sizes.append(sz)
+    return checksums, sizes
+
 
 def fetch_pages(link_batch):
     docs = []
     for link in link_batch:
-        link_doc = {"url":link,"status": "unvisited"}
-        rr = trafilatura.fetch_url(page,decode=True)
+        link_doc = {"url":link,"status": "html_error"}
+        rr = trafilatura.fetch_url(link,decode=True)
         if rr is not None:
             link_doc["status"] = "html_ok"
@@ -39,14 +62,36 @@ def extract_pages(link_docs):
     return content, extracted_links
 
 def index_pages(pagedb,pages_list):
-    mycol = pagedb["content"]
-    for page in page_list:
+    contentcol = pagedb["content"]
+    for page in pages_list:
         # get paragraph checksums
-        checksums = get_checksums(page["text"])
-        page["checksums"] = checksums
-        x = mycol.insert_many(pages_list)
+        checksums, sizes = calculate_checksums(page["text"])
+        page["paragraph_checksums"] = checksums
+        page["paragraph_sizes"] = sizes
+    x = contentcol.insert_many(pages_list)
     page_hashes = []
-    pass
+
+def process_pages(db,fetched_pages):
+    content_pages, extracted_links = extract_pages(fetched_pages)
+    contentcol = db["content"]
+    contentcol.insert_many(content_pages)
+    linkcol = db["links"]
+    extracted = []
+    for link in extracted_links:
+        extracted.append({"url":link,"status":"backlink"})
+    # should fail if the link already exists
+    linkcol.insert_many(extracted)
+
+    html_pages = []
+    for page in fetched_pages:
+        linkcol.update_one({"url":page["url"]},{"$set":{"status":"visited"}})
+        if "html" in page:
+            html_pages.append({"url":page["url"],"html":page["html"],"update_time":datetime.now()})
+            del page["html"]
+    htmlcol = db["html"]
+    htmlcol.insert_many(html_pages)
+
+
 
 def get_visited_links(domain):
     return []
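
For review, a minimal sketch of how the new `calculate_checksums` might be exercised; the import path and the sample text are assumptions for illustration, not part of the patch:

```python
# Illustrative only: assumes the patched module is importable as mongocwarler,
# and uses made-up sample text.
from mongocwarler import calculate_checksums

text = "short line\n" + ("lorem ipsum dolor sit amet " * 10 + "\n") * 2

# A fingerprint is emitted at a newline once more than 100 characters with
# code point > 64 (roughly, letters) have accumulated, so the short first
# line produces nothing and each long line produces one checksum.
checksums, sizes = calculate_checksums(text)
print(len(checksums), sizes)  # 2 fingerprints, one per long line
```

Note that, despite the docstring, the loop fingerprints at every newline rather than only at blank lines; worth confirming which behavior is intended.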
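
The `# should fail if the link already exists` comment depends on a unique index on `url`; a plain `insert_many` will not reject duplicates on its own. A sketch of the setup this assumes, with a local MongoDB instance and a placeholder database name:

```python
import pymongo
from pymongo.errors import BulkWriteError

client = pymongo.MongoClient("mongodb://localhost:27017/")  # placeholder URI
db = client["crawler"]  # database name is an assumption
linkcol = db["links"]

# Without this unique index, insert_many happily inserts duplicate urls.
linkcol.create_index("url", unique=True)

batch = [
    {"url": "https://example.com/a", "status": "backlink"},
    {"url": "https://example.com/a", "status": "backlink"},  # duplicate
]
try:
    # ordered=False keeps inserting past duplicates instead of aborting
    # at the first duplicate-key error.
    linkcol.insert_many(batch, ordered=False)
except BulkWriteError:
    pass  # duplicate urls were rejected; the rest were written
```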