diff --git a/mongo/mongocwarler.py b/mongo/mongocwarler.py
index 62b0409..d10808f 100644
--- a/mongo/mongocwarler.py
+++ b/mongo/mongocwarler.py
@@ -184,7 +184,6 @@ def index_pages(db,hostname,extracted_pages):
         doc["paragraph_checksums"] = checksums
         doc["paragraph_sizes"] = sizes
         goodsz = sum(sizes)
-        doc["paragraph_sizes_sum"] = goodsz
         # Not enough larger paragraphs
         if len(text) < TEXT_TRASH_SIZE or goodsz/len(text) < TEXT_TRASH_RATIO:
             state = "trash"
@@ -263,9 +262,55 @@ def index_links(db,extracted_links):
         except pymongo.errors.DuplicateKeyError as ex:
             pass
 
+def get_link_features(link):
+    a, urlpath = courlan.get_host_and_path(link)
+    # split the path into features on the /, ?, - and _ separators
+    for sep in "?-_":
+        urlpath = urlpath.replace(sep, "/")
+    features = [f for f in urlpath.split("/") if f]
+    if len(features) < 2:
+        return None
+    # drop the last part (the document itself)
+    features = features[:-1]
+    return features
+
+
+def link_classifier(db,hostname,batch_size):
+    linkcol = db["links"]
+    # sample already crawled links of this host as training data
+    res = linkcol.aggregate([
+        { "$match": { "status": {"$nin":["frontlink","backlink"]},"host":hostname } },
+        { "$sample": { "size": 2000 } }
+    ])
+    goodcounter = collections.Counter()
+    badcounter = collections.Counter()
+    for item in res:
+        link = item["url"]
+        state = item["status"]
+        features = get_link_features(link)
+        if features is None:
+            continue
+        lf = len(features)
+        for feature in features:
+            if state == "good":
+                goodcounter[feature] += 1/lf
+            else:
+                badcounter[feature] += 1/lf
+    # probability of each feature appearing in a good link
+    tf = goodcounter.keys() | badcounter.keys()
+    allcounter = collections.Counter()
+    for key in tf:
+        gc = goodcounter[key]
+        bc = badcounter[key]
+        p = gc / (gc + bc)
+        allcounter[key] = p
+    return allcounter
+
+
 def get_links(db,hostname,status,batch_size):
     linkcol = db["links"]
     #res = linkcol.find({"status":status,"host":hostname},{"url":1},limit=batch_size)
+    # get a random sample of links
     res = linkcol.aggregate([
         { "$match": { "status": status,"host":hostname } },
         { "$sample": { "size": batch_size } }
@@ -317,13 +361,22 @@ def link_summary(db,hostname):
         #{"$project": {"textsum":{"$sum":"$text_size"}}}
         {"$group":{"_id":None,
             "text_size_sum":{"$sum":"$text_size"},
-            "paragraph_size_sum":{"$sum":"$paragraph_sizes_sum"}
         }
         },
     ])
     for item in res:
         print(item)
 
+def domain_summary(db,hostname):
+    linkcol = db["links"]
+    #res = linkcol.distinct("hostname",{"hostname":hostname})
+
+    # sum the extracted text size per host
+    res = linkcol.aggregate([
+        {"$group":{"_id":"$host","text_size_sum":{"$sum":"$text_size"}}},
+    ])
+    for item in res:
+        print(item)
 
 @click.group()
 def cli():
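
Note on usage (reviewer sketch, not part of the patch): link_classifier returns a collections.Counter mapping each URL-path feature to the empirical probability that links containing it were "good". A minimal sketch of how that output could be used to rank unvisited links; score_link and the neutral 0.5 prior for unseen features are assumptions, not something this patch defines:

def score_link(classifier, link):
    # average the per-feature good-probabilities; unseen features get a neutral 0.5 prior
    features = get_link_features(link)
    if features is None:
        return 0.5
    return sum(classifier.get(f, 0.5) for f in features) / len(features)

# e.g. visit the most promising frontier links first:
# frontier.sort(key=lambda l: score_link(classifier, l), reverse=True)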