zz
This commit is contained in:
parent
6567de421c
commit
ab7ca1476f
@ -184,7 +184,6 @@ def index_pages(db,hostname,extracted_pages):
|
||||
doc["paragraph_checksums"] = checksums
|
||||
doc["paragraph_sizes"] = sizes
|
||||
goodsz = sum(sizes)
|
||||
doc["paragraph_sizes_sum"] = goodsz
|
||||
# Not enough larger paragraphs
|
||||
if len(text) < TEXT_TRASH_SIZE or goodsz/len(text) < TEXT_TRASH_RATIO:
|
||||
state = "trash"
|
||||
@ -263,9 +262,54 @@ def index_links(db,extracted_links):
|
||||
except pymongo.errors.DuplicateKeyError as ex:
|
||||
pass
|
||||
|
||||
def get_link_features(link):
    """Split the path of a URL into tokens used as link-classifier features.

    The path returned by ``courlan.get_host_and_path`` is cut at every
    ``/``, ``?``, ``-`` and ``_`` character. Empty tokens produced by
    leading or adjacent delimiters are kept. The final token is dropped.

    Returns the list of remaining tokens, or ``None`` when the path yields
    fewer than two tokens.
    """
    _host, urlpath = courlan.get_host_and_path(link)
    # str.split("/?-_") only splits on that literal four-character sequence.
    # Map every delimiter to "/" first so that each of them separates tokens.
    features = urlpath.translate(str.maketrans("?-_", "///")).split("/")
    if len(features) < 2:
        return None
    # drop last part
    return features[:-1]
|
||||
|
||||
|
||||
def link_classifier(db,hostname,batch_size):
    """Estimate, per URL feature, the weighted share of sampled links that are good.

    Samples up to 2000 documents from ``db["links"]`` whose ``host`` equals
    ``hostname`` and whose ``status`` is neither "frontlink" nor "backlink".
    Every sampled link contributes a total weight of 1, split evenly across
    the features returned by ``get_link_features``; the weight goes to the
    "good" tally when the link status is "good" and to the "bad" tally
    otherwise.

    ``batch_size`` is accepted but not used; the sample size is fixed.

    Returns a ``collections.Counter`` mapping each feature to
    good / (good + bad).
    """
    linkcol = db["links"]
    res = linkcol.aggregate([
        { "$match": { "status": {"$not":{"$in":["frontlink","backlink"]}},"host":hostname } },
        { "$sample": { "size": 2000 } }
    ])
    goodcounter = collections.Counter()
    badcounter = collections.Counter()
    for item in res:
        features = get_link_features(item["url"])
        # Skip links without usable features (None or empty); this also
        # keeps the per-feature weight below from dividing by zero.
        if not features:
            continue
        weight = 1 / len(features)
        target = goodcounter if item["status"] == "good" else badcounter
        for feature in features:
            target[feature] += weight
    # Every stored weight is positive, so the Counter sum holds each feature
    # seen in either tally together with its non-zero total weight.
    totals = goodcounter + badcounter
    allcounter = collections.Counter()
    for key, total in totals.items():
        # A Counter returns 0 for a feature that was never seen as good.
        allcounter[key] = goodcounter[key] / total
    return allcounter
|
||||
|
||||
|
||||
|
||||
|
||||
def get_links(db,hostname,status,batch_size):
|
||||
linkcol = db["links"]
|
||||
#res = linkcol.find({"status":status,"host":hostname},{"url":1},limit=batch_size)
|
||||
# get random links
|
||||
res = linkcol.aggregate([
|
||||
{ "$match": { "status": status,"host":hostname } },
|
||||
{ "$sample": { "size": batch_size } }
|
||||
@ -317,13 +361,22 @@ def link_summary(db,hostname):
|
||||
#{"$project": {"textsum":{"$sum":"$text_size"}}}
|
||||
{"$group":{"_id":None,
|
||||
"text_size_sum":{"$sum":"$text_size"},
|
||||
"paragraph_size_sum":{"$sum":"$paragraph_sizes_sum"}
|
||||
}
|
||||
},
|
||||
])
|
||||
for item in res:
|
||||
print(item)
|
||||
|
||||
def domain_summary(db,hostname):
    """Print the summed ``text_size`` per ``hostname`` value of the links collection.

    Runs a single ``$group`` aggregation over ``db["links"]`` and prints each
    resulting document. The ``hostname`` argument is not used.
    """
    # NOTE(review): other queries on this collection filter on the "host"
    # field; confirm that "$hostname" is the intended grouping key.
    pipeline = [
        {"$group":{"_id":"$hostname","text_size_sum":{"$sum":"$text_size"}}},
    ]
    for row in db["links"].aggregate(pipeline):
        print(row)
|
||||
|
||||
@click.group()
|
||||
def cli():
|
||||
|
Loading…
Reference in New Issue
Block a user