From a26613ebb156dfaf4d8cbddb8107e895b8ddd3b3 Mon Sep 17 00:00:00 2001
From: Daniel Hladek
Date: Wed, 5 Apr 2023 17:09:42 +0200
Subject: [PATCH 1/4] zz

---
 mongo/mongocwarler.py | 288 ++++++++++++++++++++----------------------
 1 file changed, 140 insertions(+), 148 deletions(-)

diff --git a/mongo/mongocwarler.py b/mongo/mongocwarler.py
index 17e4553..f22903e 100644
--- a/mongo/mongocwarler.py
+++ b/mongo/mongocwarler.py
@@ -18,11 +18,11 @@ import re
 import time
 import collections
 import math
-import json
+import random
 
 LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
 DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
-BATCHSIZE=os.getenv("SUCKER_BATCHSIZE",10)
+BATCHSIZE=int(os.getenv("SUCKER_BATCHSIZE","10"))
 CONNECTION=os.getenv("SUCKER_CONNECTION","mongodb://root:example@localhost:27017/")
 DBNAME=os.getenv("SUCKER_DBNAME","crawler")
 MINFILESIZE=300
@@ -102,39 +102,34 @@ def get_link_doc(link,status="frontlink"):
     return {"url":link,"host":host,"domain":domain,"status":status,"created_at":datetime.utcnow()}
 
 
-def fetch_pages(link_batch):
-    htmls = []
-    #print(link_batch)
-    #print("zzzzzzzzzz")
-    for link in link_batch:
-        print("fetching:::::")
-        print(link)
-        final_link = link
-        response = trafilatura.fetch_url(link,decode=False)
-        time.sleep(2)
-        html = None
-        if response is not None :
-            good = True
-            if response.status != 200:
-                good = False
-                LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
-            elif response.data is None or len(response.data) < MINFILESIZE:
-                LOGGER.error('too small/incorrect for URL %s', link)
-                good = False
-                # raise error instead?
-            elif len(response.data) > MAXFILESIZE:
-                good = False
-                LOGGER.error('too large: length %s for URL %s', len(response.data), link)
-            if good:
-                html = trafilatura.utils.decode_response(response)
-                final_link = response.url
-                if html is not None:
-                    html, final_link = trafilatura.spider.refresh_detection(html, final_link)
-                    # is there a meta-refresh on the page?
-                    if final_link is None: # malformed or malicious content
-                        html = None
-        htmls.append((final_link,html))
-    return htmls
+def fetch_page(link):
+    print("fetching:::::")
+    print(link)
+    final_link = link
+    response = trafilatura.fetch_url(link,decode=False)
+    time.sleep(2)
+    html = None
+    if response is not None :
+        good = True
+        if response.status != 200:
+            good = False
+            LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
+        elif response.data is None or len(response.data) < MINFILESIZE:
+            LOGGER.error('too small/incorrect for URL %s', link)
+            good = False
+            # raise error instead?
+        elif len(response.data) > MAXFILESIZE:
+            good = False
+            LOGGER.error('too large: length %s for URL %s', len(response.data), link)
+        if good:
+            html = trafilatura.utils.decode_response(response)
+            final_link = response.url
+            if html is not None:
+                html, final_link = trafilatura.spider.refresh_detection(html, final_link)
+                # is there a meta-refresh on the page?
+                if final_link is None: # malformed or malicious content
+                    html = None
+    return final_link,html
 
 def fetch_robot(base_url):
     try:
@@ -227,7 +222,7 @@ def index_pages(db,hostname,extracted_pages):
                 checkcol.insert_one({"_id":chs})
             except pymongo.errors.DuplicateKeyError as err:
                 pass
-        linkcol.update_one({"url":original_link},{"$set":{"status":state}})
+        linkcol.update_one({"url":link},{"$set":{"status":state}})
 
 
 def extract_links(link_batch,responses,hostname,rules,default_status="frontlink"):
@@ -263,23 +258,28 @@ def index_links(db,extracted_links):
     for link,status in extracted_links:
         if not is_link_good(link):
            continue
-        doc = get_link_doc(link,status)
-        try:
-            linkcol.insert_one(doc)
-        except pymongo.errors.DuplicateKeyError as ex:
-            pass
+        if status == "frontlink" or status == "backlink":
+            doc = get_link_doc(link,status)
+            try:
+                linkcol.insert_one(doc)
+                # dont overwrite
+            except pymongo.errors.DuplicateKeyError as ex:
+                pass
+        else:
+            print("updating " + link,status)
+            linkcol.update_one({"url":link},{"$set":{"status":status,"updated_at":datetime.utcnow()}})
 
 def get_link_features(link):
     a, urlpath = courlan.get_host_and_path(link)
     features = re.split("[/?&]",urlpath)
     #features = re.split("[/?-_=]",urlpath)
     res = []
-    for feature in features:
+    for i,feature in enumerate(features):
         if len(feature) < 1:
             continue
         if feature.isdigit():
             feature = ""
-        res.append(feature)
+        res.append(str(i)+ "-" + feature)
     if len(res) < 2:
         return None
     res = res[:-1]
@@ -295,11 +295,9 @@ class LinkClassifier:
         self.bad_count = 0
         self.alpha = 0.001
 
-    def train(self,db,hostname):
-        linkcol = db["links"]
-        res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink","backlink"]}}})
+    def train(self,links):
         testset = []
-        for i,item in enumerate(res):
+        for i,item in enumerate(links):
             link = item["url"]
             state = item["status"]
             cl = 0
@@ -336,9 +334,13 @@ class LinkClassifier:
                 print("MISS",l,cl,pcp)
         print("Accuracy:")
         print(len(testset))
-        print(gg / len(testset))
+        acc = gg / len(testset)
+        print(acc)
+        return acc
 
     def classify(self,link):
+        if self.good_count + self.bad_count == 0:
+            return random.uniform(-0.1,0.1)
         features = get_link_features(link)
         res = 0
         gp = math.log(self.good_count) - math.log(self.good_count + self.bad_count)
@@ -363,51 +365,15 @@ class LinkClassifier:
         return pa - pb
 
-
 def get_links(db,hostname,status,batch_size):
     linkcol = db["links"]
-    # count downloaded links
-    res = linkcol.aggregate([
-        { "$match": { "status": {"$not":{"$in":["frontlink","backlink"]}},"host":hostname } },
-        {"$group":{"_id":None,
-            "count":{"$count":{}},
-            }
-        },
-    ])
-    links = set()
-    out = list(res)
-    if len(out) == 0:
-        return list()
-    if out[0]["count"] < 200:
-        #res = linkcol.find({"status":status,"host":hostname},{"url":1},limit=batch_size)
-        # get random links
-        res = linkcol.aggregate([
-            { "$match": { "status": status,"host":hostname } },
-            { "$sample": { "size": batch_size } }
-        ])
-        for i,doc in enumerate(res):
-            #print(">>>>>" + status)
-            #print(doc);
-            links.add(doc["url"])
-            if i >= batch_size:
-                break
-    else:
-        cl = LinkClassifier()
-        cl.train(db,hostname)
-        res = linkcol.aggregate([
-            { "$match": { "status": status,"host":hostname } },
-            { "$sample": { "size": batch_size * 100 } }
-        ])
-        outlinks = []
-        for i,doc in enumerate(res):
-            #print(">>>>>" + status)
-            #print(doc);
-            link = doc["url"]
-            outlinks.append((doc["url"],cl.classify(link)))
-        outlinks = sorted(outlinks, key=lambda x: x[1],reverse=True)
-        links = [l[0] for l in outlinks[0:batch_size]]
-        # todo remove very bad links from database
-    return list(links)
+    res = linkcol.find({"host":hostname,"status":status},limit=batch_size)
+    links = []
+    for item in res:
+        links.append(item["url"])
+    print("Got {} {}".format(len(links),status))
+    return links
+
 
 def fetch_sitemap_links(start_link):
@@ -415,42 +381,48 @@ def fetch_sitemap_links(start_link):
     out = []
     navigation_links = trafilatura.sitemaps.sitemap_search(start_link,target_lang=LANGUAGE)
     for link in navigation_links:
         out.append((link,"frontlink"))
+    print("Fetched {} sitemap links".format(len(out)))
     return out
 
-def process_links(db,hostname,status,links=[],rules=None,batch_size=BATCHSIZE):
-    #print(links)
-    responses = fetch_pages(links)
-    #print(responses)
-    extracted_pages = extract_pages(links,responses)
-    #print(extracted_pages)
-    extracted_links = extract_links(links,responses,hostname,rules,status)
-    #print(extracted_links)
-    index_links(db,extracted_links)
-    index_pages(db,hostname,extracted_pages)
+def fetch_front_links(start_link,rules):
+    start_link,hostname = courlan.check_url(start_link)
+    response = fetch_page(start_link)
+    extracted_links = extract_links([start_link],[response],hostname,rules,"frontlink")
+    print("Fetched {} frontlinks".format(len(extracted_links)))
+    return extracted_links
+
 
 def link_summary(db,hostname):
     linkcol = db["links"]
     #res = linkcol.distinct("hostname",{"hostname":hostname})
-
-    # count links
     res = linkcol.aggregate([
         {"$match":{"host":hostname}},
-        {"$group":{"_id":"$status","count":{"$sum":1}}},
+        {"$group":{"_id":"$status",
+            "count":{"$count":{}},
+            }
+        },
     ])
     badcount = 0
     goodcount = 0
-    out = ["good","frontlink","backlink"]
     info = {}
+    crawled_count = 0
     for item in res:
-        if item["_id"] not in out:
-            badcount += item["count"]
-        if item["_id"] == "good":
-            goodcount = item["count"]
-        info[item["_id"]] = item["count"]
-    good_prob = goodcount / (goodcount + badcount)
+        count = item["count"]
+        st = item["_id"]
+        print(st,count)
+        if st == "good":
+            goodcount += count
+        if st != "frontlink" and st != "backlink":
+            crawled_count += count
+        info[st] = count
+    baclink_cout = 0
+    if "backlink" in info:
+        backlink_count = info["backlink"]
+    good_prob= 0
+    if crawled_count > 0:
+        good_prob = goodcount / crawled_count
     info["good_prob"] = good_prob
-    info["bad_documents"] = badcount
     print(">>>Domain Content")
     contentcol = db["content"]
     res = contentcol.aggregate([
@@ -465,35 +437,53 @@ def link_summary(db,hostname):
     for item in res:
         text_size = item["text_size_sum"]
     good_document_characters = 0
+    fetch_average_characters = 0
     if goodcount > 0:
         good_document_characters = text_size / goodcount
-    fetch_average_characters = text_size / (goodcount + badcount)
+        fetch_average_characters = text_size / crawled_count
     info["total_good_characters"] = text_size
     info["average_good_characters"] = good_document_characters
     info["average_fetch_characters"] = fetch_average_characters
     domaincol = db["domain"]
-    if goodcount + badcount > 100:
-        cl = LinkClassifier()
-        cl.train(db,hostname)
-        res = linkcol.aggregate([
-            { "$match": { "status": "backlink","host":hostname } },
-            { "$sample": { "size": BATCHSIZE * 100 } }
-        ])
-        predicted_good = 0
-        predicted_bad = 0
-        for item in res:
-            cll = cl.classify(item["url"])
-            if cll > 0:
-                predicted_good += 1
-            else:
-                predicted_bad += 1
-        predicted_good_prob = 0
-        if predicted_good + predicted_bad > 0:
-            predicted_good_prob = predicted_good / (predicted_good + predicted_bad)
-        info["predicted_good_prob"] = predicted_good_prob
     print(info)
     domaincol.update_one({"host":hostname},{"$set":info},upsert=True)
 
+def sample_links(db,hostname,status,batch_size):
+    print("Getting backlinks")
+    linkcol = db["links"]
+    res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink","backlink"]}}})
+    cl = LinkClassifier()
+    crawled_links = list(res)
+    crawled_count = len(crawled_links)
+    min_train_size = 200
+    prediction_accuracy = 0
+    if crawled_count > min_train_size:
+        # train on crawled links
+        prediction_accuracy = cl.train(crawled_links)
+    sample_set_size = 10000
+    res = linkcol.find({"host":hostname,"status": status},limit = sample_set_size)
+    sample_links = []
+    predicted_good = 0
+    for item in res:
+        for item in res:
+            cll = cl.classify(item["url"])
+            sample_links.append((item["url"],cll))
+            if cll > 0:
+                predicted_good += 1
+    # TODO frontlinks are not unique!
+    sample_links.sort(key=lambda x: x[1],reverse=True)
+    predicted_good_prob = predicted_good / len(sample_links)
+    domaincol = db["domain"]
+    info = {
+        "predicted_good_prob":predicted_good_prob,
+        "prediction_accuracy": prediction_accuracy,
+        "crawled_count": crawled_count,
+    }
+    print(info)
+    domaincol.update_one({"host":hostname},{"$set":info})
+    links = [l[0] for l in sample_links[0:batch_size]]
+    return links
+
 def domain_summary(db,hostname):
     linkcol = db["links"]
     #res = linkcol.distinct("hostname",{"hostname":hostname})
@@ -572,23 +562,25 @@ def visit(start_link):
     db=myclient[DBNAME]
     start_link,hostname = courlan.check_url(start_link)
     batch_size = BATCHSIZE
-
-    print("Getting frontlinks")
-    links = get_links(db,hostname,"frontlink",batch_size)
-    print(f"Got {len(links)} frontlinks")
-    if len(links) < batch_size:
-        print("Fetching sitemap links")
-        sitemap_links = fetch_sitemap_links(start_link)
-        index_links(db,sitemap_links)
-        links = get_links(db,hostname,"frontlink",batch_size)
-    links.insert(0,start_link)
-    if len(links) < batch_size:
-        back_links = get_links(db,hostname,"backlink",batch_size - len(links))
-        links += back_links
-
-    print("Processing links")
     rules = fetch_robot(hostname)
-    responses = fetch_pages(links)
+    # renew front links
+    sitemap_links = fetch_sitemap_links(start_link)
+    index_links(db,sitemap_links)
+    front_links = fetch_front_links(start_link,rules)
+    index_links(db,front_links)
+    # start crawling
+    # frontlinks first
+    links = sample_links(db,hostname,"frontlink",batch_size)
+    links.insert(0,start_link)
+    # then backlinks
+    if len(links) < batch_size:
+        back_links = sample_links(db,hostname,"backlink",batch_size - len(links))
+        links += back_links
+    # index results
+    print("Processing links")
+    responses = []
+    for link in links:
+        responses.append(fetch_page(link))
     extracted_pages = extract_pages(links,responses)
     extracted_links = extract_links(links,responses,hostname,rules,"backlink")
     index_links(db,extracted_links)

From 9a9e8da4cfbe165d961146ba13d65c152dbbc57e Mon Sep 17 00:00:00 2001
From: Daniel Hladek
Date: Thu, 6 Apr 2023 12:15:33 +0200
Subject: [PATCH 2/4] zz

---
 mongo/mongocwarler.py | 72 +++++++++++++++++++++++++------------------
 mongo/mongoindexer.py | 15 +++++++++
 2 files changed, 57 insertions(+), 30 deletions(-)

diff --git a/mongo/mongocwarler.py b/mongo/mongocwarler.py
index f22903e..db781a0 100644
--- a/mongo/mongocwarler.py
+++ b/mongo/mongocwarler.py
@@ -19,6 +19,7 @@ import time
 import collections
 import math
 import random
+import hashlib
 
 LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
 DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
@@ -32,17 +33,15 @@ CHECK_PARAGRAPH_SIZE=150
 TEXT_TRASH_SIZE=200
 TEXT_TRASH_RATIO=0.6
 
-def put_queue(db,channel,message):
-    queuecol = db["queue"]
-    queuecol.insert_one({"channel":channel,"message":message,"created_at":datetime.utcnow(),"started_at":None})
-
-def reserve_queue(db,channel,message):
-    queuecol = db["queue"]
-    r = queuecol.find_one_and_delete({"channel":channel},sort={"created_at":-1})
-
-def delete_queue(db,channel):
-    queuecol = db["queue"]
-    pass
+def split_train(res):
+    trainset = []
+    testset = []
+    for i,item in enumerate(res):
+        if i % 10 == 0:
+            testset.append(item)
+        else:
+            trainset.append(item)
+    return trainset,testset
 
 def calculate_checksums(text):
     """
@@ -181,6 +180,7 @@ def index_pages(db,hostname,extracted_pages):
             text = doc["text"]
             checksums,sizes = calculate_checksums(text)
             doc["text_size"] = len(text)
+            doc["text_md5"] = hashlib.md5(text.encode("utf8")).hexdigest()
             doc["paragraph_checksums"] = checksums
             doc["paragraph_sizes"] = sizes
             goodsz = sum(sizes)
@@ -209,6 +209,7 @@ def index_pages(db,hostname,extracted_pages):
             htdoc = get_link_doc(link,state)
             htdoc["html"] = html
             htdoc["html_size"] = len(html)
+            htdoc["html_md5"]= hashlib.md5(html.encode("utf8")).hexdigest()
             # can be revisited - upsert
             del htdoc["url"]
             htmlcol.update_one({"url":link},{"$set":htdoc},upsert=True)
@@ -296,7 +297,6 @@ class LinkClassifier:
         self.alpha = 0.001
 
     def train(self,links):
-        testset = []
         for i,item in enumerate(links):
             link = item["url"]
             state = item["status"]
@@ -304,9 +304,6 @@ class LinkClassifier:
             if state == "good":
                 cl = 1
             print(cl,state,link)
-            if i % 10 == 1:
-                testset.append((link,cl))
-                continue
             features = get_link_features(link)
             if features is None:
                 continue
@@ -321,9 +318,15 @@ class LinkClassifier:
                 self.badcounter[feature] += 1
         self.bdictsize = len(self.badcounter)
         self.gdictsize = len(self.goodcounter)
+
+    def test(self,testset):
         # eval
         gg = 0
-        for l,cl in testset:
+        for item in testset:
+            l = item["url"]
+            cl = 0
+            if item["status"] == "good":
+                cl = 1
             pcp = self.classify(l)
             r = 0
             if pcp > 0:
@@ -339,7 +342,7 @@ class LinkClassifier:
         return acc
 
     def classify(self,link):
-        if self.good_count + self.bad_count == 0:
+        if self.good_count == 0 or self.bad_count == 0:
             return random.uniform(-0.1,0.1)
         features = get_link_features(link)
         res = 0
@@ -352,17 +355,14 @@ class LinkClassifier:
         goodprob = 0
         badprob = 0
         for feature in features:
-            g = math.log((self.goodcounter[feature] + self.alpha)) - gcc
+            g = math.log((self.goodcounter[feature] + self.alpha)) - gcc
             goodprob += g
             b = math.log(self.badcounter[feature] + self.alpha) - bcc
             badprob += b
             print(feature,g,b)
-        if (goodprob + gp) > (badprob + bp):
-            #if goodprob > badprob:
-            res = 1
         pa = math.exp(goodprob + gp)
         pb = math.exp(badprob + bp)
-        return pa - pb
+        return pa - pb #+ random.uniform(-0.001,0.001)
 
 
 def get_links(db,hostname,status,batch_size):
@@ -445,8 +445,9 @@ def link_summary(db,hostname):
     info["average_good_characters"] = good_document_characters
     info["average_fetch_characters"] = fetch_average_characters
     domaincol = db["domain"]
-    print(info)
     domaincol.update_one({"host":hostname},{"$set":info},upsert=True)
+    res = domaincol.find_one({"host":hostname})
+    print(res)
 
 def sample_links(db,hostname,status,batch_size):
     print("Getting backlinks")
@@ -455,11 +456,12 @@ def sample_links(db,hostname,status,batch_size):
     cl = LinkClassifier()
     crawled_links = list(res)
     crawled_count = len(crawled_links)
-    min_train_size = 200
     prediction_accuracy = 0
-    if crawled_count > min_train_size:
+    if crawled_count > 200:
         # train on crawled links
-        prediction_accuracy = cl.train(crawled_links)
+        trainset,testset = split_train(crawled_links)
+        cl.train(trainset)
+        prediction_accuracy = cl.test(testset)
     sample_set_size = 10000
     res = linkcol.find({"host":hostname,"status": status},limit = sample_set_size)
     sample_links = []
@@ -467,12 +469,15 @@ def sample_links(db,hostname,status,batch_size):
     for item in res:
         for item in res:
             cll = cl.classify(item["url"])
+            cll += random.uniform(-0.1,0.1)
             sample_links.append((item["url"],cll))
             if cll > 0:
                 predicted_good += 1
     # TODO frontlinks are not unique!
     sample_links.sort(key=lambda x: x[1],reverse=True)
-    predicted_good_prob = predicted_good / len(sample_links)
+    predicted_good_prob = 0
+    if len(sample_links) > 0:
+        predicted_good_prob = predicted_good / len(sample_links)
     domaincol = db["domain"]
     info = {
         "predicted_good_prob":predicted_good_prob,
@@ -507,11 +512,13 @@ def createdb():
     linkcol.create_index("url",unique=True)
     linkcol.create_index("host")
     contentcol = db["content"]
-    contentcol.create_index("url",unique=True)
+    contentcol.create_index("url")
+    contentcol.create_index("text_md5",unique=True)
     #contentcol.create_index({"paragraph_checksums":1})
     contentcol.create_index("host")
     htmlcol = db["html"]
-    htmlcol.create_index("url",unique=True)
+    htmlcol.create_index("url")
+    htmlcol.create_index("html_md5",unique=True)
     domaincol = db["domains"]
     domaincol.create_index("host",unique=True)
@@ -553,7 +560,12 @@ def classify(start_link):
     db=myclient[DBNAME]
     start_link,hostname = courlan.check_url(start_link)
     cl = LinkClassifier()
-    cl.train(db,hostname)
+    linkcol = db["links"]
+    res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink","backlink"]}}})
+    trainset, testset = split_train(res)
+
+    cl.train(trainset)
+    cl.test(testset)
 
 @cli.command()
 @click.argument("start_link")
diff --git a/mongo/mongoindexer.py b/mongo/mongoindexer.py
index 3b5f6e5..323e321 100644
--- a/mongo/mongoindexer.py
+++ b/mongo/mongoindexer.py
@@ -8,3 +8,18 @@ mycol = mydb["customers"]
 mydict = {"text":"ahoj svet"}
 
 x = mycol.insert_one(mydict)
+
+def createdb():
+    myclient = pymongo.MongoClient(CONNECTION)
+    db=myclient[DBNAME]
+    linkcol = db["links"]
+    linkcol.create_index("url",unique=True)
+    linkcol.create_index("host")
+    contentcol = db["content"]
+    contentcol.create_index("url",unique=True)
+    #contentcol.create_index({"paragraph_checksums":1})
+    contentcol.create_index("host")
+    htmlcol = db["html"]
+    htmlcol.create_index("url")
+    domaincol = db["domains"]
+    domaincol.create_index("host",unique=True)

From 2de0da85a61a920b77f06ea51fbc240ac062538f Mon Sep 17 00:00:00 2001
From: Daniel Hladek
Date: Thu, 6 Apr 2023 12:26:50 +0200
Subject: [PATCH 3/4] zz

---
 mongo/Dockerfile       | 6 ++++++
 mongo/requirements.txt | 5 +++++
 2 files changed, 11 insertions(+)
 create mode 100644 mongo/Dockerfile
 create mode 100644 mongo/requirements.txt

diff --git a/mongo/Dockerfile b/mongo/Dockerfile
new file mode 100644
index 0000000..21df1d3
--- /dev/null
+++ b/mongo/Dockerfile
@@ -0,0 +1,6 @@
+FROM python:3.9
+RUN mkdir /app
+COPY requirements.txt /app
+RUN pip install -r requirements.txt
+COPY *.py /app
+WORKDIR /app
diff --git a/mongo/requirements.txt b/mongo/requirements.txt
new file mode 100644
index 0000000..ef7e65e
--- /dev/null
+++ b/mongo/requirements.txt
@@ -0,0 +1,5 @@
+trafilatura
+courlan
+pymongo
+click
+lxml

From ddbe79848d5225a5093e36cf8c8a5faddbcb4063 Mon Sep 17 00:00:00 2001
From: Daniel Hladek
Date: Thu, 6 Apr 2023 13:21:34 +0200
Subject: [PATCH 4/4] zz

---
 mongo/Dockerfile                           |  3 ++-
 mongo/{mongocwarler.py => mongocrawler.py} | 23 ++++++++++++++++++----
 2 files changed, 21 insertions(+), 5 deletions(-)
 rename mongo/{mongocwarler.py => mongocrawler.py} (96%)

diff --git a/mongo/Dockerfile b/mongo/Dockerfile
index 21df1d3..3dfd9ce 100644
--- a/mongo/Dockerfile
+++ b/mongo/Dockerfile
@@ -1,6 +1,7 @@
 FROM python:3.9
 RUN mkdir /app
 COPY requirements.txt /app
-RUN pip install -r requirements.txt
+RUN pip install -r /app/requirements.txt
 COPY *.py /app
 WORKDIR /app
+ENTRYPOINT ["python", "./mongocrawler.py"]
diff --git a/mongo/mongocwarler.py b/mongo/mongocrawler.py
similarity index 96%
rename from mongo/mongocwarler.py
rename to mongo/mongocrawler.py
index db781a0..9eb738d 100644
--- a/mongo/mongocwarler.py
+++ b/mongo/mongocrawler.py
@@ -278,8 +278,7 @@ def get_link_features(link):
     for i,feature in enumerate(features):
         if len(feature) < 1:
             continue
-        if feature.isdigit():
-            feature = ""
+        feature = re.sub("[0-9]","*",feature)
         res.append(str(i)+ "-" + feature)
     if len(res) < 2:
         return None
@@ -322,6 +321,9 @@ class LinkClassifier:
     def test(self,testset):
         # eval
         gg = 0
+        true_positive = 0
+        positive = 0
+        false_negative = 0
         for item in testset:
             l = item["url"]
             cl = 0
@@ -331,12 +333,19 @@ class LinkClassifier:
             r = 0
             if pcp > 0:
                 r = 1
+            if cl == 1:
+                if r == 1:
+                    true_positive += 1
+                positive += 1
+            if r == 1 and cl == 0:
+                false_negative += 1
             if r == cl:
                 gg += 1
             else:
                 print("MISS",l,cl,pcp)
-        print("Accuracy:")
         print(len(testset))
+        print("Precision: {}, Recall: {}".format(true_positive/positive,true_positive/(true_positive+false_negative)))
+        print("Accuracy:")
         acc = gg / len(testset)
         print(acc)
         return acc
@@ -407,6 +416,7 @@ def link_summary(db,hostname):
     goodcount = 0
     info = {}
     crawled_count = 0
+    bad_crawl_count = 0
     for item in res:
         count = item["count"]
         st = item["_id"]
@@ -415,7 +425,11 @@ def link_summary(db,hostname):
             goodcount += count
         if st != "frontlink" and st != "backlink":
             crawled_count += count
+            if st != "good":
+                bad_crawl_count += count
         info[st] = count
+    info["crawled_count"] = crawled_count
+    info["bad_crawl_count"] = bad_crawl_count
     baclink_cout = 0
     if "backlink" in info:
         backlink_count = info["backlink"]
@@ -469,7 +483,7 @@ def sample_links(db,hostname,status,batch_size):
     for item in res:
         for item in res:
             cll = cl.classify(item["url"])
-            cll += random.uniform(-0.1,0.1)
+            #cll += random.uniform(-0.1,0.1)
             sample_links.append((item["url"],cll))
             if cll > 0:
                 predicted_good += 1
@@ -521,6 +535,7 @@ def createdb():
     htmlcol.create_index("html_md5",unique=True)
     domaincol = db["domains"]
     domaincol.create_index("host",unique=True)
+    domaincol.create_index("average_fetch_characters",unique=True)
 
 @cli.command()
 @click.argument("link")