commit 22ae282674 (parent 69236bb58d)
@@ -374,7 +374,10 @@ def get_links(db,hostname,status,batch_size):
        },
    ])
    links = set()
    if list(res)[0]["count"] < 200:
    out = list(res)
    if len(out) == 0:
        return list()
    if out[0]["count"] < 200:
        #res = linkcol.find({"status":status,"host":hostname},{"url":1},limit=batch_size)
        # get random links
        res = linkcol.aggregate([
@@ -392,7 +395,7 @@ def get_links(db,hostname,status,batch_size):
    cl.train(db,hostname)
    res = linkcol.aggregate([
        { "$match": { "status": status,"host":hostname } },
        { "$sample": { "size": 2000 } }
        { "$sample": { "size": batch_size * 100 } }
    ])
    outlinks = []
    for i,doc in enumerate(res):
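Both hunks above lean on MongoDB's $sample stage, which returns a random subset of the documents that passed $match; oversampling by batch_size * 100 presumably gives the classifier trained just above a larger pool to choose from. A minimal standalone sketch of that pattern with pymongo, using placeholder connection, database and collection names rather than anything from this repository:

# Sketch: draw a random pool of pending links with $match + $sample (pymongo).
# Host, status value and sizes are illustrative assumptions.
from pymongo import MongoClient

linkcol = MongoClient("mongodb://localhost:27017")["crawler"]["links"]
batch_size = 10
res = linkcol.aggregate([
    {"$match": {"status": "frontlink", "host": "example.com"}},
    {"$sample": {"size": batch_size * 100}},  # oversample, filter/rank later
])
urls = [doc["url"] for doc in res]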
@@ -434,8 +437,19 @@ def link_summary(db,hostname):
        {"$match":{"host":hostname}},
        {"$group":{"_id":"$status","count":{"$sum":1}}},
    ])
    badcount = 0
    goodcount = 0
    out = ["good","frontlink","backlink"]
    info = {}
    for item in res:
        print(item)
        if item["_id"] not in out:
            badcount += item["count"]
        if item["_id"] == "good":
            goodcount = item["count"]
        info[item["_id"]] = item["count"]
    good_prob = goodcount / (goodcount + badcount)
    info["good_prob"] = good_prob
    info["bad_documents"] = badcount
    print(">>>Domain Content")
    contentcol = db["content"]
    res = contentcol.aggregate([
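The hunk above counts links per status with a $group stage and derives a good_prob ratio from the counts. A small self-contained sketch of the same counting pattern; the connection details and the status whitelist mirror the code above but are otherwise assumptions:

# Sketch: count link documents per status and derive a "good" ratio.
from collections import defaultdict
from pymongo import MongoClient

linkcol = MongoClient("mongodb://localhost:27017")["crawler"]["links"]
counts = defaultdict(int)
for item in linkcol.aggregate([
    {"$match": {"host": "example.com"}},
    {"$group": {"_id": "$status", "count": {"$sum": 1}}},
]):
    counts[item["_id"]] = item["count"]

not_bad = ("good", "frontlink", "backlink")
badcount = sum(c for status, c in counts.items() if status not in not_bad)
goodcount = counts["good"]
good_prob = goodcount / (goodcount + badcount) if goodcount + badcount else 0.0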
@@ -446,8 +460,17 @@ def link_summary(db,hostname):
            }
        },
    ])
    text_size = 0
    for item in res:
        print(item)
        text_size = item["text_size_sum"]
    good_document_characters = text_size / goodcount
    fetch_average_characters = text_size / (goodcount + badcount)
    info["total_good_characters"] = text_size
    info["average_good_characters"] = good_document_characters
    info["average_fetch_characters"] = fetch_average_characters
    domaincol = db["domain"]
    print(json.dumps(info))
    domaincol.update_one({"host":hostname},{"$set":info},upsert=True)

def domain_summary(db,hostname):
    linkcol = db["links"]
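The computed summary is written back to the domain collection with a single update_one call; upsert=True creates the per-host document if it does not exist yet. A minimal sketch of that call pattern, with placeholder database, collection and field values:

# Sketch: upsert a per-host summary document (pymongo).
from pymongo import MongoClient

domaincol = MongoClient("mongodb://localhost:27017")["crawler"]["domain"]
info = {"good_prob": 0.5, "bad_documents": 3, "total_good_characters": 120000}
domaincol.update_one({"host": "example.com"}, {"$set": info}, upsert=True)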
@@ -477,6 +500,8 @@ def createdb():
    contentcol.create_index("host")
    htmlcol = db["html"]
    htmlcol.create_index("url",unique=True)
    domaincol = db["domains"]
    domaincol.create_index("host",unique=True)

@cli.command()
@click.argument("link")
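The unique indexes created above mean a second insert of the same url (or host) is rejected by MongoDB rather than silently duplicated, which is presumably what the indexing helpers rely on when a link is seen again. An illustrative sketch, with placeholder names:

# Sketch: a unique index on "url" makes duplicate inserts raise DuplicateKeyError.
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError

linkcol = MongoClient("mongodb://localhost:27017")["crawler"]["links"]
linkcol.create_index("url", unique=True)
try:
    linkcol.insert_one({"url": "https://example.com/", "status": "frontlink"})
    linkcol.insert_one({"url": "https://example.com/", "status": "frontlink"})
except DuplicateKeyError:
    print("link already indexed")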
@@ -533,15 +558,19 @@ def visit(start_link):
    print("Fetching sitemap links")
    sitemap_links = fetch_sitemap_links(start_link)
    index_links(db,sitemap_links)
    links.append(start_link)
    links = get_links(db,hostname,"frontlink",batch_size)
    links.insert(0,start_link)
    if len(links) < batch_size:
        back_links = get_links(db,hostname,"backlink",batch_size - len(links))
        links += back_links

    print("Processing frontlinks")
    print("Processing links")
    rules = fetch_robot(hostname)
    process_links(db,hostname,"frontlink",links,rules)
    print("Getting backlinks")
    back_links = get_links(db,hostname,"backlink",batch_size)
    print("Processing backlinks")
    process_links(db,hostname,"backlink",back_links,rules=rules)
    responses = fetch_pages(links)
    extracted_pages = extract_pages(links,responses)
    extracted_links = extract_links(links,responses,hostname,rules,"backlink")
    index_links(db,extracted_links)
    index_pages(db,hostname,extracted_pages)
    link_summary(db,hostname)

if __name__ == "__main__":
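The reworked visit flow builds one batch per run: the start link first, then frontlinks, topped up with backlinks only if the batch is still short, and the whole batch is fetched, extracted and indexed together. A sketch of just the batching step, with get_links passed in to stand for the repository function of the same name:

# Sketch of the batch assembly in visit(): frontlinks first, backlinks as filler.
def assemble_batch(db, hostname, start_link, batch_size, get_links):
    links = get_links(db, hostname, "frontlink", batch_size)
    links.insert(0, start_link)           # always revisit the entry point first
    if len(links) < batch_size:           # top up only when the batch is short
        links += get_links(db, hostname, "backlink", batch_size - len(links))
    return links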