Compare commits
2 Commits
9a9e8da4cf
...
ddbe79848d
Author | SHA1 | Date | |
---|---|---|---|
ddbe79848d | |||
2de0da85a6 |
7
mongo/Dockerfile
Normal file
7
mongo/Dockerfile
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
FROM python:3.9
|
||||||
|
RUN mkdir /app
|
||||||
|
COPY requirements.txt /app
|
||||||
|
RUN pip install -r /app/requirements.txt
|
||||||
|
COPY *.py /app
|
||||||
|
WORKDIR /app
|
||||||
|
ENTRYPOINT ["python", "./mongocrawler.py"]
|
@@ -278,8 +278,7 @@ def get_link_features(link):
|
|||||||
for i,feature in enumerate(features):
|
for i,feature in enumerate(features):
|
||||||
if len(feature) < 1:
|
if len(feature) < 1:
|
||||||
continue
|
continue
|
||||||
if feature.isdigit():
|
feature = re.sub("[0-9]","*",feature)
|
||||||
feature = "<NUM>"
|
|
||||||
res.append(str(i)+ "-" + feature)
|
res.append(str(i)+ "-" + feature)
|
||||||
if len(res) < 2:
|
if len(res) < 2:
|
||||||
return None
|
return None
|
||||||
@@ -322,6 +321,9 @@ class LinkClassifier:
|
|||||||
def test(self,testset):
|
def test(self,testset):
|
||||||
# eval
|
# eval
|
||||||
gg = 0
|
gg = 0
|
||||||
|
true_positive = 0
|
||||||
|
positive = 0
|
||||||
|
false_negative = 0
|
||||||
for item in testset:
|
for item in testset:
|
||||||
l = item["url"]
|
l = item["url"]
|
||||||
cl = 0
|
cl = 0
|
||||||
@@ -331,12 +333,19 @@ class LinkClassifier:
|
|||||||
r = 0
|
r = 0
|
||||||
if pcp > 0:
|
if pcp > 0:
|
||||||
r = 1
|
r = 1
|
||||||
|
if cl == 1:
|
||||||
|
if r == 1:
|
||||||
|
true_positive += 1
|
||||||
|
positive += 1
|
||||||
|
if r == 1 and cl == 0:
|
||||||
|
false_negative += 1
|
||||||
if r == cl:
|
if r == cl:
|
||||||
gg += 1
|
gg += 1
|
||||||
else:
|
else:
|
||||||
print("MISS",l,cl,pcp)
|
print("MISS",l,cl,pcp)
|
||||||
print("Accuracy:")
|
|
||||||
print(len(testset))
|
print(len(testset))
|
||||||
|
print("Precision: {}, Recall: {}".format(true_positive/positive,true_positive/(true_positive+false_negative)))
|
||||||
|
print("Accuracy:")
|
||||||
acc = gg / len(testset)
|
acc = gg / len(testset)
|
||||||
print(acc)
|
print(acc)
|
||||||
return acc
|
return acc
|
||||||
@@ -407,6 +416,7 @@ def link_summary(db,hostname):
|
|||||||
goodcount = 0
|
goodcount = 0
|
||||||
info = {}
|
info = {}
|
||||||
crawled_count = 0
|
crawled_count = 0
|
||||||
|
bad_crawl_count = 0
|
||||||
for item in res:
|
for item in res:
|
||||||
count = item["count"]
|
count = item["count"]
|
||||||
st = item["_id"]
|
st = item["_id"]
|
||||||
@@ -415,7 +425,11 @@ def link_summary(db,hostname):
|
|||||||
goodcount += count
|
goodcount += count
|
||||||
if st != "frontlink" and st != "backlink":
|
if st != "frontlink" and st != "backlink":
|
||||||
crawled_count += count
|
crawled_count += count
|
||||||
|
if st != "good":
|
||||||
|
bad_crawl_count += count
|
||||||
info[st] = count
|
info[st] = count
|
||||||
|
info["crawled_count"] = crawled_count
|
||||||
|
info["bad_crawl_count"] = bad_crawl_count
|
||||||
baclink_cout = 0
|
baclink_cout = 0
|
||||||
if "backlink" in info:
|
if "backlink" in info:
|
||||||
backlink_count = info["backlink"]
|
backlink_count = info["backlink"]
|
||||||
@@ -469,7 +483,7 @@ def sample_links(db,hostname,status,batch_size):
|
|||||||
for item in res:
|
for item in res:
|
||||||
for item in res:
|
for item in res:
|
||||||
cll = cl.classify(item["url"])
|
cll = cl.classify(item["url"])
|
||||||
cll += random.uniform(-0.1,0.1)
|
#cll += random.uniform(-0.1,0.1)
|
||||||
sample_links.append((item["url"],cll))
|
sample_links.append((item["url"],cll))
|
||||||
if cll > 0:
|
if cll > 0:
|
||||||
predicted_good += 1
|
predicted_good += 1
|
||||||
@@ -521,6 +535,7 @@ def createdb():
|
|||||||
htmlcol.create_index("html_md5",unique=True)
|
htmlcol.create_index("html_md5",unique=True)
|
||||||
domaincol = db["domains"]
|
domaincol = db["domains"]
|
||||||
domaincol.create_index("host",unique=True)
|
domaincol.create_index("host",unique=True)
|
||||||
|
domaincol.create_index("average_fetch_characters",unique=True)
|
||||||
|
|
||||||
@cli.command()
|
@cli.command()
|
||||||
@click.argument("link")
|
@click.argument("link")
|
5
mongo/requirements.txt
Normal file
5
mongo/requirements.txt
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
trafilatura
|
||||||
|
courlan
|
||||||
|
pymongo
|
||||||
|
click
|
||||||
|
lxml
|
Loading…
Reference in New Issue
Block a user