This commit is contained in:
Daniel Hládek 2023-04-03 09:39:10 +02:00
parent 6567de421c
commit ab7ca1476f

View File

@ -184,7 +184,6 @@ def index_pages(db,hostname,extracted_pages):
doc["paragraph_checksums"] = checksums
doc["paragraph_sizes"] = sizes
goodsz = sum(sizes)
doc["paragraph_sizes_sum"] = goodsz
# Not enough larger paragraphs
if len(text) < TEXT_TRASH_SIZE or goodsz/len(text) < TEXT_TRASH_RATIO:
state = "trash"
@ -263,9 +262,54 @@ def index_links(db,extracted_links):
except pymongo.errors.DuplicateKeyError as ex:
pass
def get_link_features(link):
    """Split the path of *link* into tokens used as link features.

    The path part returned by ``courlan.get_host_and_path`` is cut at every
    ``/``, ``?``, ``-`` and ``_``. Empty tokens (produced by leading,
    trailing or doubled separators) are discarded, and the last remaining
    token is dropped.

    Args:
        link: URL string to analyse.

    Returns:
        A list of string tokens, or ``None`` when the path yields fewer
        than two tokens.
    """
    # Function-scope import so the block does not rely on a module-level one.
    import re

    _, urlpath = courlan.get_host_and_path(link)
    # str.split("/?-_") only splits on that exact four-character sequence;
    # a character class splits on each separator character individually.
    tokens = [token for token in re.split(r"[/?_-]", urlpath) if token]
    if len(tokens) < 2:
        return None
    # drop last part
    return tokens[:-1]
def link_classifier(db,hostname,batch_size):
    """Estimate, per URL-path token, the share of its weight seen in good links.

    Samples up to 2000 documents from ``db["links"]`` whose ``host`` equals
    *hostname* and whose ``status`` is neither "frontlink" nor "backlink".
    Each sampled link is turned into tokens with ``get_link_features`` and
    distributes a total weight of 1 evenly over those tokens. The weight
    goes to the "good" tally when the link status is "good", otherwise to
    the "bad" tally.

    Args:
        db: database handle; only ``db["links"].aggregate`` is used.
        hostname: value matched against the ``host`` field of the links.
        batch_size: currently unused (the sample size is fixed at 2000);
            kept so existing callers keep working.

    Returns:
        ``collections.Counter`` mapping each token to
        ``good_weight / (good_weight + bad_weight)``.
    """
    linkcol = db["links"]
    res = linkcol.aggregate([
        { "$match": { "status": {"$not":{"$in":["frontlink","backlink"]}},"host":hostname } },
        { "$sample": { "size": 2000 } }
    ])
    goodcounter = collections.Counter()
    badcounter = collections.Counter()
    for item in res:
        features = get_link_features(item["url"])
        # None means "no usable features"; an empty list is skipped too so
        # the per-token weight below can never divide by zero.
        if not features:
            continue
        weight = 1 / len(features)
        target = goodcounter if item["status"] == "good" else badcounter
        for feature in features:
            target[feature] += weight
    allcounter = collections.Counter()
    # Key views support set union; every key has a positive weight in at
    # least one tally, so the denominator is never zero.
    for key in goodcounter.keys() | badcounter.keys():
        gc = goodcounter[key]
        bc = badcounter[key]
        allcounter[key] = gc / (gc + bc)
    return allcounter
def get_links(db,hostname,status,batch_size):
linkcol = db["links"]
#res = linkcol.find({"status":status,"host":hostname},{"url":1},limit=batch_size)
# get random links
res = linkcol.aggregate([
{ "$match": { "status": status,"host":hostname } },
{ "$sample": { "size": batch_size } }
@ -317,13 +361,22 @@ def link_summary(db,hostname):
#{"$project": {"textsum":{"$sum":"$text_size"}}}
{"$group":{"_id":None,
"text_size_sum":{"$sum":"$text_size"},
"paragraph_size_sum":{"$sum":"$paragraph_sizes_sum"}
}
},
])
for item in res:
print(item)
def domain_summary(db,hostname):
    """Print one aggregated row per ``hostname`` value of the links collection.

    Runs a single ``$group`` stage on ``db["links"]`` that sums ``text_size``
    for each distinct ``hostname`` field value and prints every result row.

    NOTE(review): the *hostname* argument is not used here, and other
    queries in this file filter links on ``host`` rather than ``hostname``
    - confirm which field the grouping key should be.
    """
    group_stage = {
        "$group": {
            "_id": "$hostname",
            "text_size_sum": {"$sum": "$text_size"},
        },
    }
    for row in db["links"].aggregate([group_stage]):
        print(row)
@click.group()
def cli():