diff --git a/mongo/mongocwarler.py b/mongo/mongocwarler.py
index 62b0409..d10808f 100644
--- a/mongo/mongocwarler.py
+++ b/mongo/mongocwarler.py
@@ -184,7 +184,6 @@ def index_pages(db,hostname,extracted_pages):
         doc["paragraph_checksums"] = checksums
         doc["paragraph_sizes"] = sizes
         goodsz = sum(sizes)
-        doc["paragraph_sizes_sum"] = goodsz
         # Not enough larger paragraphs
         if len(text) < TEXT_TRASH_SIZE or goodsz/len(text) < TEXT_TRASH_RATIO:
             state = "trash"
@@ -263,9 +262,55 @@ def index_links(db,extracted_links):
         except pymongo.errors.DuplicateKeyError as ex:
             pass
 
+def get_link_features(link):
+    a, urlpath = courlan.get_host_and_path(link)
+    # split the path into features on the /, ?, - and _ separators
+    for sep in "?-_":
+        urlpath = urlpath.replace(sep, "/")
+    features = [f for f in urlpath.split("/") if f]
+    if len(features) < 2:
+        return None
+    # drop the last part (the document itself)
+    features = features[:-1]
+    return features
+
+
+def link_classifier(db,hostname,batch_size):
+    linkcol = db["links"]
+    # sample already crawled links of this host as training data
+    res = linkcol.aggregate([
+        { "$match": { "status": {"$nin":["frontlink","backlink"]},"host":hostname } },
+        { "$sample": { "size": 2000 } }
+    ])
+    goodcounter = collections.Counter()
+    badcounter = collections.Counter()
+    for item in res:
+        link = item["url"]
+        state = item["status"]
+        features = get_link_features(link)
+        if features is None:
+            continue
+        lf = len(features)
+        for feature in features:
+            if state == "good":
+                goodcounter[feature] += 1/lf
+            else:
+                badcounter[feature] += 1/lf
+    # probability of each feature appearing in a good link
+    tf = goodcounter.keys() | badcounter.keys()
+    allcounter = collections.Counter()
+    for key in tf:
+        gc = goodcounter[key]
+        bc = badcounter[key]
+        p = gc / (gc + bc)
+        allcounter[key] = p
+    return allcounter
+
+
 def get_links(db,hostname,status,batch_size):
     linkcol = db["links"]
     #res = linkcol.find({"status":status,"host":hostname},{"url":1},limit=batch_size)
+    # get a random sample of links
     res = linkcol.aggregate([
         { "$match": { "status": status,"host":hostname } },
         { "$sample": { "size": batch_size } }
@@ -317,13 +361,22 @@ def link_summary(db,hostname):
         #{"$project": {"textsum":{"$sum":"$text_size"}}}
         {"$group":{"_id":None,
             "text_size_sum":{"$sum":"$text_size"},
-            "paragraph_size_sum":{"$sum":"$paragraph_sizes_sum"}
         }
         },
     ])
     for item in res:
         print(item)
 
+def domain_summary(db,hostname):
+    linkcol = db["links"]
+    #res = linkcol.distinct("hostname",{"hostname":hostname})
+
+    # sum the extracted text size per host
+    res = linkcol.aggregate([
+        {"$group":{"_id":"$host","text_size_sum":{"$sum":"$text_size"}}},
+    ])
+    for item in res:
+        print(item)
 
 @click.group()
 def cli():
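
Note on usage (reviewer sketch, not part of the patch): link_classifier returns a collections.Counter mapping each URL-path feature to the empirical probability that links containing it were "good". A minimal sketch of how that output could be used to rank unvisited links; score_link and the neutral 0.5 prior for unseen features are assumptions, not something this patch defines:

def score_link(classifier, link):
    # average the per-feature good-probabilities; unseen features get a neutral 0.5 prior
    features = get_link_features(link)
    if features is None:
        return 0.5
    return sum(classifier.get(f, 0.5) for f in features) / len(features)

# e.g. visit the most promising frontier links first:
# frontier.sort(key=lambda l: score_link(classifier, l), reverse=True)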