diff --git a/mongo/mongocwarler.py b/mongo/mongocwarler.py
index 43ec1d5..1e5882e 100644
--- a/mongo/mongocwarler.py
+++ b/mongo/mongocwarler.py
@@ -15,10 +15,13 @@ import logging as LOGGER
 import os
 import pprint
 import re
+import time
+import collections
+import math
 
 LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
 DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
-BATCHSIZE=os.getenv("SUCKER_BATCHSIZE",10)
+BATCHSIZE=int(os.getenv("SUCKER_BATCHSIZE","100"))
 CONNECTION=os.getenv("SUCKER_CONNECTION","mongodb://root:example@localhost:27017/")
 DBNAME=os.getenv("SUCKER_DBNAME","crawler")
 MINFILESIZE=300
@@ -107,6 +110,7 @@ def fetch_pages(link_batch):
         print(link)
         final_link = link
         response = trafilatura.fetch_url(link,decode=False)
+        time.sleep(2)
         html = None
         if response is not None :
             good = True
@@ -256,6 +260,8 @@ def extract_links(link_batch,responses,hostname,rules,default_status="frontlink"
 def index_links(db,extracted_links):
     linkcol=db["links"]
     for link,status in extracted_links:
+        if not is_link_good(link):
+            continue
         doc = get_link_doc(link,status)
         try:
             linkcol.insert_one(doc)
@@ -264,63 +270,142 @@ def index_links(db,extracted_links):
 
 def get_link_features(link):
     a, urlpath = courlan.get_host_and_path(link)
-    features = urlpath.split("/?-_")
-    if len(features) < 2:
-        return None
-    # drop last part
-    features = features[:-1]
-    return features
-
-
-def link_classifier(db,hostname,batch_size):
-    res = linkcol.aggregate([
-        { "$match": { "status": {"$not":{"$in":["frontlink","backlink"]}},"host":hostname } },
-        { "$sample": { "size": 2000 } }
-    ])
-    goodcounter = collections.Counter()
-    badcounter = collections.Counter()
-    for item in res:
-        link = res["url"]
-        state = res["status"]
-        cl = 0
-        if state == "good":
-            cl = 1
-        features = get_link_features(link)
-        if features is None:
+    features = re.split("[/?&]",urlpath)
+    #features = re.split("[/?-_=]",urlpath)
+    res = []
+    for feature in features:
+        if len(feature) < 1:
             continue
-        lf = len(features)
-        for feature in features:
-            if state == "good":
-                goodcounter[feature] += 1/lf
-            else:
-                badcounter[feature] += 1/lf
-    tf = goodcounter.keys() + bacounter.keys()
-    allcounter = collections.Counter()
-    for key in tf:
-        gc = goodcounter[key]
-        bc = badcounter[key]
-        p = gc / (gc + bc)
-        allcounter[key] = p
-    return allcounter
+        if feature.isdigit():
+            feature = ""
+        res.append(feature)
+    if len(res) < 2:
+        return None
+    res = res[:-1]
+    print(res)
+    return res
+class LinkClassifier:
+    def __init__(self):
+
+        self.goodcounter = collections.Counter()
+        self.badcounter = collections.Counter()
+        self.good_count = 0
+        self.bad_count = 0
+        self.alpha = 0.001
+
+    def train(self,db,hostname):
+        linkcol = db["links"]
+        res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink","backlink"]}}})
+        testset = []
+        for i,item in enumerate(res):
+            link = item["url"]
+            state = item["status"]
+            cl = 0
+            if state == "good":
+                cl = 1
+            print(cl,state,link)
+            if i % 10 == 1:
+                testset.append((link,cl))
+                continue
+            features = get_link_features(link)
+            if features is None:
+                continue
+            lf = len(features)
+            if state == "good":
+                for feature in features:
+                    self.good_count += 1
+                    self.goodcounter[feature] += 1
+            else:
+                for feature in features:
+                    self.bad_count += 1
+                    self.badcounter[feature] += 1
+        self.bdictsize = len(self.badcounter)
+        self.gdictsize = len(self.goodcounter)
+        # eval
+        gg = 0
+        for l,cl in testset:
+            pcp = self.classify(l)
+            r = 0
+            if pcp > 0:
+                r = 1
+            if r == cl:
+                gg += 1
+            else:
+                print("MISS",l,cl,pcp)
+        print("Accuracy:")
+        print(len(testset))
+        print(gg / len(testset))
+
+    def classify(self,link):
+        features = get_link_features(link)
+        res = 0
+        gp = math.log(self.good_count) - math.log(self.good_count + self.bad_count)
+        bp = math.log(self.bad_count) - math.log(self.good_count + self.bad_count)
+        if features is None:
+            return math.exp(gp) - math.exp(bp)
+        gcc = math.log(self.gdictsize * self.alpha + self.good_count)
+        bcc = math.log(self.bdictsize * self.alpha + self.bad_count)
+        goodprob = 0
+        badprob = 0
+        for feature in features:
+            g = math.log((self.goodcounter[feature] + self.alpha)) - gcc
+            goodprob += g
+            b = math.log(self.badcounter[feature] + self.alpha) - bcc
+            badprob += b
+            print(feature,g,b)
+        if (goodprob + gp) > (badprob + bp):
+        #if goodprob > badprob:
+            res = 1
+        pa = math.exp(goodprob + gp)
+        pb = math.exp(badprob + bp)
+        return pa - pb
 
 def get_links(db,hostname,status,batch_size):
     linkcol = db["links"]
-    #res = linkcol.find({"status":status,"host":hostname},{"url":1},limit=batch_size)
-    # get random links
+    # count downloaded links
     res = linkcol.aggregate([
-        { "$match": { "status": status,"host":hostname } },
-        { "$sample": { "size": batch_size } }
+        { "$match": { "status": {"$not":{"$in":["frontlink","backlink"]}},"host":hostname } },
+        {"$group":{"_id":None,
+            "count":{"$count":{}},
+            }
+        },
     ])
     links = set()
-    for i,doc in enumerate(res):
-        #print(">>>>>" + status)
-        #print(doc);
-        links.add(doc["url"])
-        if i >= batch_size:
-            break
+    out = list(res)
+    if len(out) == 0:
+        return list()
+    if out[0]["count"] < 200:
+        #res = linkcol.find({"status":status,"host":hostname},{"url":1},limit=batch_size)
+        # get random links
+        res = linkcol.aggregate([
+            { "$match": { "status": status,"host":hostname } },
+            { "$sample": { "size": batch_size } }
+        ])
+        for i,doc in enumerate(res):
+            #print(">>>>>" + status)
+            #print(doc);
+            links.add(doc["url"])
+            if i >= batch_size:
+                break
+    else:
+        cl = LinkClassifier()
+        cl.train(db,hostname)
+        res = linkcol.aggregate([
+            { "$match": { "status": status,"host":hostname } },
+            { "$sample": { "size": batch_size * 100 } }
+        ])
+        outlinks = []
+        for i,doc in enumerate(res):
+            #print(">>>>>" + status)
+            #print(doc);
+            link = doc["url"]
+            outlinks.append((doc["url"],cl.classify(link)))
+        outlinks = sorted(outlinks, key=lambda x: x[1],reverse=True)
+        links = [l[0] for l in outlinks[0:batch_size]]
+    # todo remove very bad links
     return list(links)
@@ -352,8 +437,19 @@ def link_summary(db,hostname):
         {"$match":{"host":hostname}},
         {"$group":{"_id":"$status","count":{"$sum":1}}},
     ])
+    badcount = 0
+    goodcount = 0
+    out = ["good","frontlink","backlink"]
+    info = {}
     for item in res:
-        print(item)
+        if item["_id"] not in out:
+            badcount += item["count"]
+        if item["_id"] == "good":
+            goodcount = item["count"]
+        info[item["_id"]] = item["count"]
+    good_prob = goodcount / (goodcount + badcount) if (goodcount + badcount) > 0 else 0
+    info["good_prob"] = good_prob
+    info["bad_documents"] = badcount
     print(">>>Domain Content")
     contentcol = db["content"]
    res = contentcol.aggregate([
@@ -364,8 +460,17 @@ def link_summary(db,hostname):
         }
         },
     ])
+    text_size = 0
     for item in res:
-        print(item)
+        text_size = item["text_size_sum"]
+    good_document_characters = text_size / goodcount if goodcount > 0 else 0
+    fetch_average_characters = text_size / (goodcount + badcount) if (goodcount + badcount) > 0 else 0
+    info["total_good_characters"] = text_size
+    info["average_good_characters"] = good_document_characters
+    info["average_fetch_characters"] = fetch_average_characters
+    domaincol = db["domains"]
+    print(json.dumps(info))
+    domaincol.update_one({"host":hostname},{"$set":info},upsert=True)
 
 def domain_summary(db,hostname):
     linkcol = db["links"]
@@ -395,6 +500,8 @@ def createdb():
     contentcol.create_index("host")
     htmlcol = db["html"]
     htmlcol.create_index("url",unique=True)
+    domaincol = db["domains"]
+    domaincol.create_index("host",unique=True)
 
 @cli.command()
 @click.argument("link")
@@ -427,6 +534,14 @@ def externaldomains(link):
     for d in domains:
         print(d)
 
+@cli.command()
+@click.argument("start_link")
+def classify(start_link):
+    myclient = pymongo.MongoClient(CONNECTION)
+    db=myclient[DBNAME]
+    start_link,hostname = courlan.check_url(start_link)
+    cl = LinkClassifier()
+    cl.train(db,hostname)
 
 @cli.command()
 @click.argument("start_link")
@@ -443,15 +558,19 @@ def visit(start_link):
     print("Fetching sitemap links")
     sitemap_links = fetch_sitemap_links(start_link)
     index_links(db,sitemap_links)
-    links.append(start_link)
+    links = get_links(db,hostname,"frontlink",batch_size)
+    links.insert(0,start_link)
+    if len(links) < batch_size:
+        back_links = get_links(db,hostname,"backlink",batch_size - len(links))
+        links += back_links
 
-    print("Processing frontlinks")
+    print("Processing links")
     rules = fetch_robot(hostname)
-    process_links(db,hostname,"frontlink",links,rules)
-    print("Getting backlinks")
-    back_links = get_links(db,hostname,"backlink",batch_size)
-    print("Processing backlinks")
-    process_links(db,hostname,"backlink",back_links,rules=rules)
+    responses = fetch_pages(links)
+    extracted_pages = extract_pages(links,responses)
+    extracted_links = extract_links(links,responses,hostname,rules,"backlink")
+    index_links(db,extracted_links)
+    index_pages(db,hostname,extracted_pages)
     link_summary(db,hostname)
 
 if __name__ == "__main__":
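For review, a minimal sketch of how the new LinkClassifier introduced in this diff is meant to be exercised on its own (the same flow the new `classify` CLI command wraps, plus scoring one candidate link). It assumes a reachable MongoDB at CONNECTION, a host whose links have already been crawled and labelled, and that the module is importable as `mongocwarler`; the URLs below are hypothetical.

```python
# Sketch only: exercise the Naive Bayes style link classifier from this diff.
import pymongo
import courlan

# Import path is an assumption; adjust to however the module is packaged.
from mongocwarler import LinkClassifier, CONNECTION, DBNAME

db = pymongo.MongoClient(CONNECTION)[DBNAME]

# courlan.check_url returns (normalized_url, domain) or None for an unusable link.
checked = courlan.check_url("https://example.sk/")  # hypothetical start link
if checked is not None:
    start_link, hostname = checked
    cl = LinkClassifier()
    # Counts URL-path features of good vs. bad links already stored for the host
    # and prints held-out accuracy on every tenth link.
    cl.train(db, hostname)
    # classify() returns roughly P(good) - P(bad); positive means worth fetching.
    print(cl.classify("https://example.sk/clanok/nazov-clanku"))  # hypothetical candidate
```

In `get_links` this scoring only kicks in once a host has at least 200 downloaded links; before that, links are sampled at random.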