From 69236bb58d101b0103a26059c236d46398caca83 Mon Sep 17 00:00:00 2001
From: Daniel Hladek
Date: Mon, 3 Apr 2023 16:37:01 +0200
Subject: [PATCH] zz

---
 mongo/mongocwarler.py | 186 +++++++++++++++++++++++++++++++-----------
 1 file changed, 138 insertions(+), 48 deletions(-)

diff --git a/mongo/mongocwarler.py b/mongo/mongocwarler.py
index d10808f..833ec86 100644
--- a/mongo/mongocwarler.py
+++ b/mongo/mongocwarler.py
@@ -15,10 +15,13 @@ import logging as LOGGER
 import os
 import pprint
 import re
+import time
+import collections
+import math
 
 LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
 DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
-BATCHSIZE=os.getenv("SUCKER_BATCHSIZE",10)
+BATCHSIZE=os.getenv("SUCKER_BATCHSIZE",100)
 CONNECTION=os.getenv("SUCKER_CONNECTION","mongodb://root:example@localhost:27017/")
 DBNAME=os.getenv("SUCKER_DBNAME","crawler")
 MINFILESIZE=300
@@ -107,6 +110,7 @@ def fetch_pages(link_batch):
         print(link)
         final_link = link
         response = trafilatura.fetch_url(link,decode=False)
+        time.sleep(2)
         html = None
         if response is not None :
             good = True
@@ -256,6 +260,8 @@ def extract_links(link_batch,responses,hostname,rules,default_status="frontlink"
 def index_links(db,extracted_links):
     linkcol=db["links"]
     for link,status in extracted_links:
+        if not is_link_good(link):
+            continue
         doc = get_link_doc(link,status)
         try:
             linkcol.insert_one(doc)
@@ -264,63 +270,139 @@ def index_links(db,extracted_links):
 
 def get_link_features(link):
     a, urlpath = courlan.get_host_and_path(link)
-    features = urlpath.split("/?-_")
-    if len(features) < 2:
-        return None
-    # drop last part
-    features = features[:-1]
-    return features
-
-
-def link_classifier(db,hostname,batch_size):
-    res = linkcol.aggregate([
-        { "$match": { "status": {"$not":{"$in":["frontlink","backlink"]}},"host":hostname } },
-        { "$sample": { "size": 2000 } }
-    ])
-    goodcounter = collections.Counter()
-    badcounter = collections.Counter()
-    for item in res:
-        link = res["url"]
-        state = res["status"]
-        cl = 0
-        if state == "good":
-            cl = 1
-        features = get_link_features(link)
-        if features is None:
+    features = re.split("[/?&]",urlpath)
+    #features = re.split("[/?-_=]",urlpath)
+    res = []
+    for feature in features:
+        if len(feature) < 1:
             continue
-        lf = len(features)
-        for feature in features:
-            if state == "good":
-                goodcounter[feature] += 1/lf
-            else:
-                badcounter[feature] += 1/lf
-    tf = goodcounter.keys() + bacounter.keys()
-    allcounter = collections.Counter()
-    for key in tf:
-        gc = goodcounter[key]
-        bc = badcounter[key]
-        p = gc / (gc + bc)
-        allcounter[key] = p
-    return allcounter
+        if feature.isdigit():
+            feature = ""
+        res.append(feature)
+    if len(res) < 2:
+        return None
+    res = res[:-1]
+    print(res)
+    return res
 
+class LinkClassifier:
+    def __init__(self):
+
+        self.goodcounter = collections.Counter()
+        self.badcounter = collections.Counter()
+        self.good_count = 0
+        self.bad_count = 0
+        self.alpha = 0.001
+
+    def train(self,db,hostname):
+        linkcol = db["links"]
+        res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink","backlink"]}}})
+        testset = []
+        for i,item in enumerate(res):
+            link = item["url"]
+            state = item["status"]
+            cl = 0
+            if state == "good":
+                cl = 1
+            print(cl,state,link)
+            if i % 10 == 1:
+                testset.append((link,cl))
+                continue
+            features = get_link_features(link)
+            if features is None:
+                continue
+            lf = len(features)
+            if state == "good":
+                for feature in features:
+                    self.good_count += 1
+                    self.goodcounter[feature] += 1
+            else:
+                for feature in features:
+                    self.bad_count += 1
+                    self.badcounter[feature] += 1
+
+        self.bdictsize = len(self.badcounter)
+        self.gdictsize = len(self.goodcounter)
+        # eval
+        gg = 0
+        for l,cl in testset:
+            pcp = self.classify(l)
+            r = 0
+            if pcp > 0:
+                r = 1
+            if r == cl:
+                gg += 1
+            else:
+                print("MISS",l,cl,pcp)
+        print("Accuracy:")
+        print(len(testset))
+        print(gg / len(testset))
+
+    def classify(self,link):
+        features = get_link_features(link)
+        res = 0
+        gp = math.log(self.good_count) - math.log(self.good_count + self.bad_count)
+        bp = math.log(self.bad_count) - math.log(self.good_count + self.bad_count)
+        if features is None:
+            return math.exp(gp) - math.exp(bp)
+        gcc = math.log(self.gdictsize * self.alpha + self.good_count)
+        bcc = math.log(self.bdictsize * self.alpha + self.bad_count)
+        goodprob = 0
+        badprob = 0
+        for feature in features:
+            g = math.log((self.goodcounter[feature] + self.alpha)) - gcc
+            goodprob += g
+            b = math.log(self.badcounter[feature] + self.alpha) - bcc
+            badprob += b
+            print(feature,g,b)
+        if (goodprob + gp) > (badprob + bp):
+        #if goodprob > badprob:
+            res = 1
+        pa = math.exp(goodprob + gp)
+        pb = math.exp(badprob + bp)
+        return pa - pb
 
 def get_links(db,hostname,status,batch_size):
     linkcol = db["links"]
-    #res = linkcol.find({"status":status,"host":hostname},{"url":1},limit=batch_size)
-    # get random links
+    # count downloaded links
     res = linkcol.aggregate([
-        { "$match": { "status": status,"host":hostname } },
-        { "$sample": { "size": batch_size } }
+        { "$match": { "status": {"$not":{"$in":["frontlink","backlink"]}},"host":hostname } },
+        {"$group":{"_id":None,
+            "count":{"$count":{}},
+            }
+        },
     ])
     links = set()
-    for i,doc in enumerate(res):
-        #print(">>>>>" + status)
-        #print(doc);
-        links.add(doc["url"])
-        if i >= batch_size:
-            break
+    if list(res)[0]["count"] < 200:
+        #res = linkcol.find({"status":status,"host":hostname},{"url":1},limit=batch_size)
+        # get random links
+        res = linkcol.aggregate([
+            { "$match": { "status": status,"host":hostname } },
+            { "$sample": { "size": batch_size } }
+        ])
+        for i,doc in enumerate(res):
+            #print(">>>>>" + status)
+            #print(doc);
+            links.add(doc["url"])
+            if i >= batch_size:
+                break
+    else:
+        cl = LinkClassifier()
+        cl.train(db,hostname)
+        res = linkcol.aggregate([
+            { "$match": { "status": status,"host":hostname } },
+            { "$sample": { "size": 2000 } }
+        ])
+        outlinks = []
+        for i,doc in enumerate(res):
+            #print(">>>>>" + status)
+            #print(doc);
+            link = doc["url"]
+            outlinks.append((doc["url"],cl.classify(link)))
+        outlinks = sorted(outlinks, key=lambda x: x[1],reverse=True)
+        links = [l[0] for l in outlinks[0:batch_size]]
+        # todo remove very bad links
     return list(links)
 
@@ -427,6 +509,14 @@ def externaldomains(link):
     for d in domains:
         print(d)
 
+@cli.command()
+@click.argument("start_link")
+def classify(start_link):
+    myclient = pymongo.MongoClient(CONNECTION)
+    db=myclient[DBNAME]
+    start_link,hostname = courlan.check_url(start_link)
+    cl = LinkClassifier()
+    cl.train(db,hostname)
 
 @cli.command()
 @click.argument("start_link")
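
Note: the following is a minimal, self-contained sketch of the scoring idea in the LinkClassifier added by this patch (per-class feature counters, additive alpha smoothing, and a comparison of class log-scores). The example paths, their labels, and the features_of helper are invented for illustration only and are not part of the patch.

import collections
import math
import re

ALPHA = 0.001  # additive smoothing constant, same value as LinkClassifier.alpha

def features_of(path):
    # Split the URL path on "/", "?" and "&", drop empty and purely numeric parts,
    # and drop the last component, mirroring get_link_features() in the patch.
    parts = [p for p in re.split("[/?&]", path) if p and not p.isdigit()]
    return parts[:-1] if len(parts) >= 2 else None

# hypothetical labelled examples: (url path, 1 = good article page, 0 = bad page)
training = [
    ("/clanok/sport/futbal", 1),
    ("/clanok/kultura/divadlo", 1),
    ("/tag/reklama?utm=1", 0),
    ("/login?redirect=home", 0),
]

good, bad = collections.Counter(), collections.Counter()
good_count = bad_count = 0
for path, label in training:
    for f in features_of(path) or []:
        if label == 1:
            good[f] += 1
            good_count += 1
        else:
            bad[f] += 1
            bad_count += 1

def score(path):
    # Estimate of P(good|path) - P(bad|path); positive means "looks good".
    feats = features_of(path)
    gp = math.log(good_count) - math.log(good_count + bad_count)  # class prior, log scale
    bp = math.log(bad_count) - math.log(good_count + bad_count)
    if not feats:
        return math.exp(gp) - math.exp(bp)
    gcc = math.log(len(good) * ALPHA + good_count)  # smoothed per-class normalizers
    bcc = math.log(len(bad) * ALPHA + bad_count)
    for f in feats:
        gp += math.log(good[f] + ALPHA) - gcc
        bp += math.log(bad[f] + ALPHA) - bcc
    return math.exp(gp) - math.exp(bp)

print(score("/clanok/sport/hokej"))   # shares "clanok" with good pages, comes out positive
print(score("/tag/reklama?utm=2"))    # shares "tag"/"reklama" with bad pages, comes out negative

The alpha smoothing keeps a feature seen in only one class from driving the other class's score to zero, which is why the patch's classify() can still rank URLs whose path segments never appeared during training.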