diff --git a/mongo/mongocwarler.py b/mongo/mongocwarler.py
index 833ec86..008fb0d 100644
--- a/mongo/mongocwarler.py
+++ b/mongo/mongocwarler.py
@@ -374,7 +374,10 @@ def get_links(db,hostname,status,batch_size):
         },
     ])
     links = set()
-    if list(res)[0]["count"] < 200:
+    out = list(res)
+    if len(out) == 0:
+        return list()
+    if out[0]["count"] < 200:
         #res = linkcol.find({"status":status,"host":hostname},{"url":1},limit=batch_size)
         # get random links
         res = linkcol.aggregate([
@@ -392,7 +395,7 @@ def get_links(db,hostname,status,batch_size):
         cl.train(db,hostname)
         res = linkcol.aggregate([
             { "$match": { "status": status,"host":hostname } },
-            { "$sample": { "size": 2000 } }
+            { "$sample": { "size": batch_size * 100 } }
         ])
         outlinks = []
         for i,doc in enumerate(res):
@@ -434,8 +437,19 @@ def link_summary(db,hostname):
         {"$match":{"host":hostname}},
         {"$group":{"_id":"$status","count":{"$sum":1}}},
     ])
+    badcount = 0
+    goodcount = 0
+    out = ["good","frontlink","backlink"]
+    info = {}
     for item in res:
-        print(item)
+        if item["_id"] not in out:
+            badcount += item["count"]
+        if item["_id"] == "good":
+            goodcount = item["count"]
+        info[item["_id"]] = item["count"]
+    good_prob = goodcount / max(1, goodcount + badcount)
+    info["good_prob"] = good_prob
+    info["bad_documents"] = badcount
     print(">>>Domain Content")
     contentcol = db["content"]
     res = contentcol.aggregate([
@@ -446,8 +460,17 @@ def link_summary(db,hostname):
             }
         },
     ])
+    text_size = 0
     for item in res:
-        print(item)
+        text_size = item["text_size_sum"]
+    good_document_characters = text_size / max(1, goodcount)
+    fetch_average_characters = text_size / max(1, goodcount + badcount)
+    info["total_good_characters"] = text_size
+    info["average_good_characters"] = good_document_characters
+    info["average_fetch_characters"] = fetch_average_characters
+    domaincol = db["domains"]
+    print(json.dumps(info))
+    domaincol.update_one({"host":hostname},{"$set":info},upsert=True)

 def domain_summary(db,hostname):
     linkcol = db["links"]
@@ -477,6 +500,8 @@ def createdb():
     contentcol.create_index("host")
     htmlcol = db["html"]
     htmlcol.create_index("url",unique=True)
+    domaincol = db["domains"]
+    domaincol.create_index("host",unique=True)

 @cli.command()
 @click.argument("link")
@@ -533,15 +558,19 @@ def visit(start_link):
     print("Fetching sitemap links")
     sitemap_links = fetch_sitemap_links(start_link)
     index_links(db,sitemap_links)
-    links.append(start_link)
+    links = get_links(db,hostname,"frontlink",batch_size)
+    links.insert(0,start_link)
+    if len(links) < batch_size:
+        back_links = get_links(db,hostname,"backlink",batch_size - len(links))
+        links += back_links

-    print("Processing frontlinks")
+    print("Processing links")
     rules = fetch_robot(hostname)
-    process_links(db,hostname,"frontlink",links,rules)
-    print("Getting backlinks")
-    back_links = get_links(db,hostname,"backlink",batch_size)
-    print("Processing backlinks")
-    process_links(db,hostname,"backlink",back_links,rules=rules)
+    responses = fetch_pages(links)
+    extracted_pages = extract_pages(links,responses)
+    extracted_links = extract_links(links,responses,hostname,rules,"backlink")
+    index_links(db,extracted_links)
+    index_pages(db,hostname,extracted_pages)
     link_summary(db,hostname)

 if __name__ == "__main__":
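
For reference, the empty-result guard and the random-sampling fallback in get_links can be exercised in isolation. A minimal sketch, assuming a local MongoDB and the "links" collection with the status/host/url fields used in the patch; the $count stage, connection URI, and database name are assumptions, since the hunk only shows the tail of the counting pipeline:

import pymongo

def sample_links(linkcol, hostname, status, batch_size):
    # Count the matching links first, so an empty result set returns
    # early instead of raising IndexError on list(res)[0]["count"].
    out = list(linkcol.aggregate([
        {"$match": {"status": status, "host": hostname}},
        {"$count": "count"},
    ]))
    if len(out) == 0:
        return []
    # $sample draws documents pseudo-randomly from the matched set;
    # sizing the pool from batch_size (the patch uses batch_size * 100
    # in the classifier branch) scales it with the requested batch
    # instead of a fixed 2000.
    res = linkcol.aggregate([
        {"$match": {"status": status, "host": hostname}},
        {"$sample": {"size": batch_size}},
    ])
    return [doc["url"] for doc in res]

# Hypothetical usage:
# client = pymongo.MongoClient("mongodb://localhost:27017")
# links = sample_links(client["crawler"]["links"], "example.com", "frontlink", 100)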
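
The rewritten link_summary turns raw per-status counts into a quality signal: any status outside good, frontlink, and backlink counts as a failed fetch, and good_prob is the share of good documents among everything actually fetched. A worked example with hypothetical counts:

# Hypothetical per-status counts for one host; key names mirror the patch.
counts = {"good": 40, "bad_type": 6, "bad_http": 4, "frontlink": 100, "backlink": 300}
kept = ["good", "frontlink", "backlink"]

badcount = sum(c for status, c in counts.items() if status not in kept)  # 10
goodcount = counts.get("good", 0)                                        # 40
info = dict(counts)
info["good_prob"] = goodcount / max(1, goodcount + badcount)             # 0.8
info["bad_documents"] = badcount

With upsert=True, update_one creates the per-host document on the first visit and overwrites the same document on every later one, which is why createdb() gives the domains collection a unique index on host.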
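
In the reworked visit(), frontlinks fill the batch first and backlinks only top up whatever is left after the start link is prepended. The top-up arithmetic in isolation, with hypothetical URLs:

# Hypothetical inputs: 3 frontlinks for a batch of 5; after prepending
# the start link, one slot remains for backlinks.
batch_size = 5
links = ["https://example.com/a", "https://example.com/b", "https://example.com/c"]
links.insert(0, "https://example.com/")  # the start link is always fetched first
if len(links) < batch_size:
    # stands in for get_links(db, hostname, "backlink", batch_size - len(links))
    back_links = ["https://example.com/x", "https://example.com/y"][:batch_size - len(links)]
    links += back_links
assert len(links) == batch_size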
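
Once a few visits have run, the statistics persisted by link_summary can be read back directly. A sketch, assuming a local MongoDB; the URI and database name are placeholders, while the domains collection and field names come from the patch:

import json
import pymongo

client = pymongo.MongoClient("mongodb://localhost:27017")
db = client["crawler"]
# The ten hosts with the best ratio of good documents to fetches.
for dom in db["domains"].find({}, {"_id": 0}).sort("good_prob", -1).limit(10):
    print(json.dumps(dom))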