diff --git a/mongo/mongocrawler.py b/mongo/mongocrawler.py
index 608b0b7..2e077b3 100644
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@@ -241,7 +241,7 @@ def set_content_checksums(doc):
             sentences += 1
     doc["sentences_count"] = sentences
 
-def index_page(db,original_link,final_link,html,doc,filter_content=True):
+def index_page(db,original_link:str,final_link:str,html:bytes,doc,filter_content=True):
     linkcol = db["links"]
     htmlcol = db["html"]
     contentcol = db["content"]
@@ -277,7 +277,7 @@ def index_page(db,original_link,final_link,html,doc,filter_content=True):
         htdoc = get_link_doc(link,state)
         htdoc["html"] = html
         htdoc["html_size"] = len(html)
-        htdoc["html_md5"]= hashlib.md5(html.encode("utf8")).hexdigest()
+        htdoc["html_md5"]= hashlib.md5(html).hexdigest()
         # can be revisited - upsert
         del htdoc["url"]
         htmlcol.update_one({"url":link},{"$set":htdoc},upsert=True)
@@ -347,7 +347,7 @@ def index_links(db,extracted_links):
     for link,status in extracted_links:
         if not is_link_good(link):
             continue
-        if status == "frontlink" or status == "backlink":
+        if status == "frontlink" :
             doc = get_link_doc(link,status)
             try:
                 linkcol.insert_one(doc)
@@ -509,7 +509,7 @@ def link_summary(db,hostname):
         print(st,count)
         if st == "good":
             goodcount += count
-        if st != "frontlink" and st != "backlink":
+        if st != "frontlink":
             crawled_count += count
         if st != "good":
             bad_crawl_count += count
@@ -517,8 +517,6 @@ def link_summary(db,hostname):
     info["crawled_count"] = crawled_count
     info["bad_crawl_count"] = bad_crawl_count
     baclink_cout = 0
-    if "backlink" in info:
-        backlink_count = info["backlink"]
     good_prob= 0
     if crawled_count > 0:
         good_prob = goodcount / crawled_count
@@ -552,7 +550,7 @@ def link_summary(db,hostname):
 def sample_links(db,hostname,status,batch_size):
     print("Sampling links")
     linkcol = db["links"]
-    res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink","backlink"]}}})
+    res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink"]}}})
     cl = LinkClassifier()
     crawled_links = list(res)
     crawled_count = len(crawled_links)
@@ -678,7 +676,7 @@ def classify(start_link):
     start_link,hostname = courlan.check_url(start_link)
     cl = LinkClassifier()
     linkcol = db["links"]
-    res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink","backlink"]}}})
+    res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink"]}}})
     trainset, testset = split_train(res)
     cl.train(trainset)
 
@@ -764,7 +762,7 @@ def crawl_summary():
         values = [str(item[x]) for x in headers]
         print("\t".join(values))
 
-def extr(hdoc):
+def _extr(hdoc):
     url = hdoc["url"]
     html = binascii.a2b_qp(hdoc["quoted_html"])
     doc = extract_page(url,html)
@@ -773,30 +771,31 @@ def import_html():
     myclient= pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
-    contentcol = db["content"]
+    linkscol = db["links"]
     buffer = []
     counter = 0
 
     for i,l in enumerate(sys.stdin):
         hdoc = json.loads(l)
         url = hdoc["url"]
-        r = contentcol.find_one({"url":url},projection=["_id"])
-        if r is not None:
+        r = linkscol.find_one({"url":url})
+        if r is not None and r["status"] != "frontlink":
             print(">>>>" + str(i) + " copy: " + url)
             continue
         buffer.append(hdoc)
         if len(buffer) < 128:
             continue
         from multiprocessing import Pool
+        outs = []
         with Pool(8) as p:
-            outs = p.map(extr,buffer)
-            for hdoc,doc in zip(buffer,outs):
-                if doc is None:
-                    print("bad html" + hdoc["url"])
-                    continue
-                status = index_page(db,hdoc["url"],hdoc["url"],hdoc["quoted_html"],doc)
-                counter += 1
-                print( ">>> " + str(counter) + " " + hdoc["url"] + " " + status)
-        del buffer[:]
+            outs = p.map(_extr,buffer)
+        for hdoc,doc in zip(buffer,outs):
+            if doc is None:
+                print("bad html" + hdoc["url"])
+                continue
+            status = index_page(db,hdoc["url"],hdoc["url"], binascii.a2b_qp(hdoc["quoted_html"]),doc)
+            counter += 1
+            print( ">>> " + str(counter) + " " + str(i) + " " + hdoc["url"] + " " + status)
+        del buffer[:]
 
 
 def sample_domains():