zz
parent 6567de421c
commit ab7ca1476f
@@ -184,7 +184,6 @@ def index_pages(db,hostname,extracted_pages):
         doc["paragraph_checksums"] = checksums
         doc["paragraph_sizes"] = sizes
         goodsz = sum(sizes)
-        doc["paragraph_sizes_sum"] = goodsz
         # Not enough larger paragraphs
         if len(text) < TEXT_TRASH_SIZE or goodsz/len(text) < TEXT_TRASH_RATIO:
             state = "trash"
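The hunk above removes the paragraph_sizes_sum field from the stored document but keeps the trash heuristic: a page is discarded when its text is too short or too little of it sits in sizeable paragraphs. A minimal sketch of that check follows; TEXT_TRASH_SIZE and TEXT_TRASH_RATIO are module constants not shown in this diff, so the values below are assumed placeholders, not the project's real thresholds.

# Illustrative sketch only; the threshold values are assumptions.
TEXT_TRASH_SIZE = 200
TEXT_TRASH_RATIO = 0.6

def is_trash(text, paragraph_sizes):
    goodsz = sum(paragraph_sizes)
    # too little text overall, or too small a share of it in large paragraphs
    return len(text) < TEXT_TRASH_SIZE or goodsz / len(text) < TEXT_TRASH_RATIO

print(is_trash("x" * 1000, [300, 200]))   # 500/1000 < 0.6 -> True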
@@ -263,9 +262,54 @@ def index_links(db,extracted_links):
         except pymongo.errors.DuplicateKeyError as ex:
             pass

+def get_link_features(link):
+    a, urlpath = courlan.get_host_and_path(link)
+    features = urlpath.split("/?-_")
+    if len(features) < 2:
+        return None
+    # drop last part
+    features = features[:-1]
+    return features
+
+
+def link_classifier(db,hostname,batch_size):
+    linkcol = db["links"]
+    res = linkcol.aggregate([
+        { "$match": { "status": {"$not":{"$in":["frontlink","backlink"]}},"host":hostname } },
+        { "$sample": { "size": 2000 } }
+    ])
+    goodcounter = collections.Counter()
+    badcounter = collections.Counter()
+    for item in res:
+        link = item["url"]
+        state = item["status"]
+        cl = 0
+        if state == "good":
+            cl = 1
+        features = get_link_features(link)
+        if features is None:
+            continue
+        lf = len(features)
+        for feature in features:
+            if state == "good":
+                goodcounter[feature] += 1/lf
+            else:
+                badcounter[feature] += 1/lf
+    # per-feature probability of belonging to a "good" link
+    tf = set(goodcounter.keys()) | set(badcounter.keys())
+    allcounter = collections.Counter()
+    for key in tf:
+        gc = goodcounter[key]
+        bc = badcounter[key]
+        p = gc / (gc + bc)
+        allcounter[key] = p
+    return allcounter
+
+
 def get_links(db,hostname,status,batch_size):
     linkcol = db["links"]
     #res = linkcol.find({"status":status,"host":hostname},{"url":1},limit=batch_size)
+    # get random links
     res = linkcol.aggregate([
         { "$match": { "status": status,"host":hostname } },
         { "$sample": { "size": batch_size } }
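link_classifier above only turns the sampled good/bad counts into per-feature probabilities; the commit does not show how they are consumed. As a hedged illustration, one possible way to score an unvisited link with the returned Counter could look like the sketch below; score_link is a hypothetical helper and the hostname and URL in the commented call are placeholders, none of this is part of the repository.

# Hypothetical usage sketch: average the per-feature probabilities
# produced by link_classifier() to rank a new link.
def score_link(link, allcounter):
    features = get_link_features(link)
    if not features:
        return 0.5                     # no path features -> neutral score
    # unseen features contribute 0 because Counter returns 0 for missing keys
    return sum(allcounter[f] for f in features) / len(features)

# allcounter = link_classifier(db, "example.com", 200)
# print(score_link("https://example.com/blog/post-1", allcounter))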
@@ -317,13 +361,22 @@ def link_summary(db,hostname):
         #{"$project": {"textsum":{"$sum":"$text_size"}}}
         {"$group":{"_id":None,
             "text_size_sum":{"$sum":"$text_size"},
-            "paragraph_size_sum":{"$sum":"$paragraph_sizes_sum"}
             }
         },
     ])
     for item in res:
         print(item)

+def domain_summary(db,hostname):
+    linkcol = db["links"]
+    #res = linkcol.distinct("hostname",{"hostname":hostname})
+
+    # count links
+    res = linkcol.aggregate([
+        {"$group":{"_id":"$hostname","text_size_sum":{"$sum":"$text_size"}}},
+    ])
+    for item in res:
+        print(item)

 @click.group()
 def cli():
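Both summary helpers simply print the raw aggregation documents. A minimal way to invoke them is sketched below, assuming a local MongoDB instance; the connection URI, the database name "crawler", and the hostname "example.com" are placeholders, since none of them appear in this diff.

import pymongo

# Placeholder connection details; the real URI and database name are
# configured elsewhere in the project.
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["crawler"]

link_summary(db, "example.com")    # prints the summed text sizes for the host
domain_summary(db, "example.com")  # prints text_size totals grouped by hostname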