z
This commit is contained in:
parent
69236bb58d
commit
22ae282674
@ -374,7 +374,10 @@ def get_links(db,hostname,status,batch_size):
|
|||||||
},
|
},
|
||||||
])
|
])
|
||||||
links = set()
|
links = set()
|
||||||
if list(res)[0]["count"] < 200:
|
out = list(res)
|
||||||
|
if len(out) == 0:
|
||||||
|
return list()
|
||||||
|
if out[0]["count"] < 200:
|
||||||
#res = linkcol.find({"status":status,"host":hostname},{"url":1},limit=batch_size)
|
#res = linkcol.find({"status":status,"host":hostname},{"url":1},limit=batch_size)
|
||||||
# get random links
|
# get random links
|
||||||
res = linkcol.aggregate([
|
res = linkcol.aggregate([
|
||||||
@ -392,7 +395,7 @@ def get_links(db,hostname,status,batch_size):
|
|||||||
cl.train(db,hostname)
|
cl.train(db,hostname)
|
||||||
res = linkcol.aggregate([
|
res = linkcol.aggregate([
|
||||||
{ "$match": { "status": status,"host":hostname } },
|
{ "$match": { "status": status,"host":hostname } },
|
||||||
{ "$sample": { "size": 2000 } }
|
{ "$sample": { "size": batch_size * 100 } }
|
||||||
])
|
])
|
||||||
outlinks = []
|
outlinks = []
|
||||||
for i,doc in enumerate(res):
|
for i,doc in enumerate(res):
|
||||||
@ -434,8 +437,19 @@ def link_summary(db,hostname):
|
|||||||
{"$match":{"host":hostname}},
|
{"$match":{"host":hostname}},
|
||||||
{"$group":{"_id":"$status","count":{"$sum":1}}},
|
{"$group":{"_id":"$status","count":{"$sum":1}}},
|
||||||
])
|
])
|
||||||
|
badcount = 0
|
||||||
|
goodcount = 0
|
||||||
|
out = ["good","frontlink","backlink"]
|
||||||
|
info = {}
|
||||||
for item in res:
|
for item in res:
|
||||||
print(item)
|
if item["_id"] not in out:
|
||||||
|
badcount += item["count"]
|
||||||
|
if item["_id"] == "good":
|
||||||
|
goodcount = item["count"]
|
||||||
|
info[item["_id"]] = item["count"]
|
||||||
|
good_prob = goodcount / (goodcount + badcount)
|
||||||
|
info["good_prob"] = good_prob
|
||||||
|
info["bad_documents"] = badcount
|
||||||
print(">>>Domain Content")
|
print(">>>Domain Content")
|
||||||
contentcol = db["content"]
|
contentcol = db["content"]
|
||||||
res = contentcol.aggregate([
|
res = contentcol.aggregate([
|
||||||
@ -446,8 +460,17 @@ def link_summary(db,hostname):
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
])
|
])
|
||||||
|
text_size = 0
|
||||||
for item in res:
|
for item in res:
|
||||||
print(item)
|
text_size = item["text_size_sum"]
|
||||||
|
good_document_characters = text_size / goodcount
|
||||||
|
fetch_average_characters = text_size / (goodcount + badcount)
|
||||||
|
info["total_good_characters"] = text_size
|
||||||
|
info["average_good_characters"] = good_document_characters
|
||||||
|
info["average_fetch_characters"] = fetch_average_characters
|
||||||
|
domaincol = db["domain"]
|
||||||
|
print(json.dumps(info))
|
||||||
|
domaincol.update_one({"host":domain},{"$set":info},usert=True)
|
||||||
|
|
||||||
def domain_summary(db,hostname):
|
def domain_summary(db,hostname):
|
||||||
linkcol = db["links"]
|
linkcol = db["links"]
|
||||||
@ -477,6 +500,8 @@ def createdb():
|
|||||||
contentcol.create_index("host")
|
contentcol.create_index("host")
|
||||||
htmlcol = db["html"]
|
htmlcol = db["html"]
|
||||||
htmlcol.create_index("url",unique=True)
|
htmlcol.create_index("url",unique=True)
|
||||||
|
domaincol = db["domains"]
|
||||||
|
domaincol.create_index("host",unique=True)
|
||||||
|
|
||||||
@cli.command()
|
@cli.command()
|
||||||
@click.argument("link")
|
@click.argument("link")
|
||||||
@ -533,15 +558,19 @@ def visit(start_link):
|
|||||||
print("Fetching sitemap links")
|
print("Fetching sitemap links")
|
||||||
sitemap_links = fetch_sitemap_links(start_link)
|
sitemap_links = fetch_sitemap_links(start_link)
|
||||||
index_links(db,sitemap_links)
|
index_links(db,sitemap_links)
|
||||||
links.append(start_link)
|
links = get_links(db,hostname,"frontlink",batch_size)
|
||||||
|
links.insert(0,start_link)
|
||||||
|
if len(links) < batch_size:
|
||||||
|
back_links = get_links(db,hostname,"backlink",batch_size - len(links))
|
||||||
|
links += back_links
|
||||||
|
|
||||||
print("Processing frontlinks")
|
print("Processing links")
|
||||||
rules = fetch_robot(hostname)
|
rules = fetch_robot(hostname)
|
||||||
process_links(db,hostname,"frontlink",links,rules)
|
responses = fetch_pages(links)
|
||||||
print("Getting backlinks")
|
extracted_pages = extract_pages(links,responses)
|
||||||
back_links = get_links(db,hostname,"backlink",batch_size)
|
extracted_links = extract_links(links,responses,hostname,rules,"backlink")
|
||||||
print("Processing backlinks")
|
index_links(db,extracted_links)
|
||||||
process_links(db,hostname,"backlink",back_links,rules=rules)
|
index_pages(db,hostname,extracted_pages)
|
||||||
link_summary(db,hostname)
|
link_summary(db,hostname)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
Loading…
Reference in New Issue
Block a user