Compare commits

...

2 Commits

Author SHA1 Message Date
ddbe79848d zz 2023-04-06 13:21:34 +02:00
2de0da85a6 zz 2023-04-06 12:26:50 +02:00
3 changed files with 31 additions and 4 deletions

mongo/Dockerfile (new file, 7 additions)

@@ -0,0 +1,7 @@
+FROM python:3.9
+RUN mkdir /app
+COPY requirements.txt /app
+RUN pip install -r /app/requirements.txt
+COPY *.py /app
+WORKDIR /app
+ENTRYPOINT ["python", "./mongocrawler.py"]

mongo/mongocrawler.py (19 additions, 4 deletions)

@@ -278,8 +278,7 @@ def get_link_features(link):
     for i,feature in enumerate(features):
         if len(feature) < 1:
             continue
-        if feature.isdigit():
-            feature = "<NUM>"
+        feature = re.sub("[0-9]","*",feature)
         res.append(str(i)+ "-" + feature)
     if len(res) < 2:
         return None
@@ -322,6 +321,9 @@ class LinkClassifier:
     def test(self,testset):
         # eval
         gg = 0
+        true_positive = 0
+        positive = 0
+        false_negative = 0
         for item in testset:
             l = item["url"]
             cl = 0
@@ -331,12 +333,19 @@ class LinkClassifier:
             r = 0
             if pcp > 0:
                 r = 1
+            if cl == 1:
+                if r == 1:
+                    true_positive += 1
+                positive += 1
+            if r == 1 and cl == 0:
+                false_negative += 1
             if r == cl:
                 gg += 1
             else:
                 print("MISS",l,cl,pcp)
-        print("Accuracy:")
         print(len(testset))
+        print("Precision: {}, Recall: {}".format(true_positive/positive,true_positive/(true_positive+false_negative)))
+        print("Accuracy:")
         acc = gg / len(testset)
         print(acc)
         return acc
@@ -407,6 +416,7 @@ def link_summary(db,hostname):
     goodcount = 0
     info = {}
     crawled_count = 0
+    bad_crawl_count = 0
     for item in res:
         count = item["count"]
         st = item["_id"]
@@ -415,7 +425,11 @@ def link_summary(db,hostname):
             goodcount += count
         if st != "frontlink" and st != "backlink":
             crawled_count += count
+            if st != "good":
+                bad_crawl_count += count
         info[st] = count
+    info["crawled_count"] = crawled_count
+    info["bad_crawl_count"] = bad_crawl_count
     baclink_cout = 0
     if "backlink" in info:
         backlink_count = info["backlink"]
@@ -469,7 +483,7 @@ def sample_links(db,hostname,status,batch_size):
     for item in res:
         for item in res:
             cll = cl.classify(item["url"])
-            cll += random.uniform(-0.1,0.1)
+            #cll += random.uniform(-0.1,0.1)
             sample_links.append((item["url"],cll))
             if cll > 0:
                 predicted_good += 1
@@ -521,6 +535,7 @@ def createdb():
     htmlcol.create_index("html_md5",unique=True)
     domaincol = db["domains"]
     domaincol.create_index("host",unique=True)
+    domaincol.create_index("average_fetch_characters",unique=True)
 
 @cli.command()
 @click.argument("link")
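
For reference, the feature-normalization change in get_link_features above replaces the old whole-token "<NUM>" substitution with per-character digit masking. A minimal standalone sketch of the new behaviour (the helper name and sample tokens are illustrative, not part of the commit):

import re

def mask_digits(feature):
    # Every digit becomes "*", as the updated get_link_features now does
    return re.sub("[0-9]", "*", feature)

print(mask_digits("2023"))         # "****"        (the old code turned this token into "<NUM>")
print(mask_digits("page15.html"))  # "page**.html" (the old code left mixed tokens unchanged)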

mongo/requirements.txt (new file, 5 additions)

@@ -0,0 +1,5 @@
+trafilatura
+courlan
+pymongo
+click
+lxml