This commit is contained in:
Daniel Hládek 2023-04-06 13:21:34 +02:00
parent 2de0da85a6
commit ddbe79848d
2 changed files with 21 additions and 5 deletions

View File

@@ -1,6 +1,7 @@
FROM python:3.9 FROM python:3.9
RUN mkdir /app RUN mkdir /app
COPY requirements.txt /app COPY requirements.txt /app
RUN pip install -r requirements.txt RUN pip install -r /app/requirements.txt
COPY *.py /app COPY *.py /app
WORKDIR /app WORKDIR /app
ENTRYPOINT ["python", "./mongocrawler.py"]

View File

@@ -278,8 +278,7 @@ def get_link_features(link):
for i,feature in enumerate(features): for i,feature in enumerate(features):
if len(feature) < 1: if len(feature) < 1:
continue continue
if feature.isdigit(): feature = re.sub("[0-9]","*",feature)
feature = "<NUM>"
res.append(str(i)+ "-" + feature) res.append(str(i)+ "-" + feature)
if len(res) < 2: if len(res) < 2:
return None return None
@@ -322,6 +321,9 @@ class LinkClassifier:
def test(self,testset): def test(self,testset):
# eval # eval
gg = 0 gg = 0
true_positive = 0
positive = 0
false_negative = 0
for item in testset: for item in testset:
l = item["url"] l = item["url"]
cl = 0 cl = 0
@@ -331,12 +333,19 @@ class LinkClassifier:
r = 0 r = 0
if pcp > 0: if pcp > 0:
r = 1 r = 1
if cl == 1:
if r == 1:
true_positive += 1
positive += 1
if r == 1 and cl == 0:
false_negative += 1
if r == cl: if r == cl:
gg += 1 gg += 1
else: else:
print("MISS",l,cl,pcp) print("MISS",l,cl,pcp)
print("Accuracy:")
print(len(testset)) print(len(testset))
print("Precision: {}, Recall: {}".format(true_positive/positive,true_positive/(true_positive+false_negative)))
print("Accuracy:")
acc = gg / len(testset) acc = gg / len(testset)
print(acc) print(acc)
return acc return acc
@@ -407,6 +416,7 @@ def link_summary(db,hostname):
goodcount = 0 goodcount = 0
info = {} info = {}
crawled_count = 0 crawled_count = 0
bad_crawl_count = 0
for item in res: for item in res:
count = item["count"] count = item["count"]
st = item["_id"] st = item["_id"]
@@ -415,7 +425,11 @@ def link_summary(db,hostname):
goodcount += count goodcount += count
if st != "frontlink" and st != "backlink": if st != "frontlink" and st != "backlink":
crawled_count += count crawled_count += count
if st != "good":
bad_crawl_count += count
info[st] = count info[st] = count
info["crawled_count"] = crawled_count
info["bad_crawl_count"] = bad_crawl_count
baclink_cout = 0 baclink_cout = 0
if "backlink" in info: if "backlink" in info:
backlink_count = info["backlink"] backlink_count = info["backlink"]
@@ -469,7 +483,7 @@ def sample_links(db,hostname,status,batch_size):
for item in res: for item in res:
for item in res: for item in res:
cll = cl.classify(item["url"]) cll = cl.classify(item["url"])
cll += random.uniform(-0.1,0.1) #cll += random.uniform(-0.1,0.1)
sample_links.append((item["url"],cll)) sample_links.append((item["url"],cll))
if cll > 0: if cll > 0:
predicted_good += 1 predicted_good += 1
@@ -521,6 +535,7 @@ def createdb():
htmlcol.create_index("html_md5",unique=True) htmlcol.create_index("html_md5",unique=True)
domaincol = db["domains"] domaincol = db["domains"]
domaincol.create_index("host",unique=True) domaincol.create_index("host",unique=True)
domaincol.create_index("average_fetch_characters",unique=True)
@cli.command() @cli.command()
@click.argument("link") @click.argument("link")