Compare commits

2 Commits: 9a9e8da4cf ... ddbe79848d

| Author | SHA1 | Date |
| --- | --- | --- |
|  | ddbe79848d |  |
|  | 2de0da85a6 |  |

mongo/Dockerfile (7 lines, Normal file)

@@ -0,0 +1,7 @@
FROM python:3.9
RUN mkdir /app
COPY requirements.txt /app
RUN pip install -r /app/requirements.txt
COPY *.py /app
WORKDIR /app
ENTRYPOINT ["python", "./mongocrawler.py"]

@@ -278,8 +278,7 @@ def get_link_features(link):
    for i,feature in enumerate(features):
        if len(feature) < 1:
            continue
        if feature.isdigit():
            feature = "<NUM>"
        feature = re.sub("[0-9]","*",feature)
        res.append(str(i)+ "-" + feature)
    if len(res) < 2:
        return None
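
This hunk masks digits in URL tokens before they are used as classifier features. As a minimal sketch, assuming `features` already holds the tokens of a link (the tokenizing code sits outside this hunk), the loop behaves like this:

```python
import re

def normalize_features(features):
    """Sketch of the normalization in get_link_features; the code that
    splits a URL into `features` is assumed, not shown in this diff."""
    res = []
    for i, feature in enumerate(features):
        if len(feature) < 1:
            continue                                 # skip empty tokens
        if feature.isdigit():
            feature = "<NUM>"                        # collapse pure-digit tokens
        feature = re.sub("[0-9]", "*", feature)      # mask remaining digits
        res.append(str(i) + "-" + feature)           # tag each token with its position
    if len(res) < 2:
        return None                                  # too few tokens to be informative
    return res

print(normalize_features(["blog", "2023", "post-15"]))
# -> ['0-blog', '1-<NUM>', '2-post-**']
```

Prefixing each token with its position (`str(i) + "-"`) keeps the same word distinct across path depths.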

@@ -322,6 +321,9 @@ class LinkClassifier:
    def test(self,testset):
        # eval
        gg = 0
        true_positive = 0
        positive = 0
        false_positive = 0  # predictions of 1 on items whose true label cl == 0
        for item in testset:
            l = item["url"]
            cl = 0

@@ -331,12 +333,19 @@ class LinkClassifier:
            r = 0
            if pcp > 0:
                r = 1
            if cl == 1:
                if r == 1:
                    true_positive += 1
                positive += 1
            if r == 1 and cl == 0:
                false_positive += 1
            if r == cl:
                gg += 1
            else:
                print("MISS",l,cl,pcp)
        print("Test set size:")
        print(len(testset))
        print("Precision: {}, Recall: {}".format(true_positive/(true_positive+false_positive),true_positive/positive))
        print("Accuracy:")
        acc = gg / len(testset)
        print(acc)
        return acc
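
For reference, the metrics printed above follow the standard definitions: precision is the fraction of links predicted good that are labeled good, and recall is the fraction of labeled-good links the classifier recovers. A worked example with hypothetical counts:

```python
# Hypothetical counts, matching the counters in LinkClassifier.test above.
true_positive = 8        # cl == 1 and r == 1
positive = 10            # items with true label cl == 1
false_positive = 4       # r == 1 but cl == 0
gg, total = 15, 20       # correct predictions / test set size

precision = true_positive / (true_positive + false_positive)  # 8/12 ≈ 0.667
recall = true_positive / positive                             # 8/10 = 0.8
accuracy = gg / total                                         # 15/20 = 0.75
print(precision, recall, accuracy)
```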

@@ -407,6 +416,7 @@ def link_summary(db,hostname):
    goodcount = 0
    info = {}
    crawled_count = 0
    bad_crawl_count = 0
    for item in res:
        count = item["count"]
        st = item["_id"]

@@ -415,7 +425,11 @@ def link_summary(db,hostname):
            goodcount += count
        if st != "frontlink" and st != "backlink":
            crawled_count += count
            if st != "good":
                bad_crawl_count += count
        info[st] = count
    info["crawled_count"] = crawled_count
    info["bad_crawl_count"] = bad_crawl_count
    backlink_count = 0
    if "backlink" in info:
        backlink_count = info["backlink"]
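
The `res` iterated above evidently yields per-status counts, documents shaped like `{"_id": <status>, "count": <n>}`. A hypothetical pymongo aggregation that would produce such a result (the collection name `links` and the field names `host` and `status` are assumptions, not confirmed by this diff):

```python
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")  # assumed connection string
db = client["crawler"]                             # assumed database name
res = db["links"].aggregate([
    {"$match": {"host": "example.com"}},                    # restrict to one host
    {"$group": {"_id": "$status", "count": {"$sum": 1}}},   # count links per status
])
for item in res:
    print(item["_id"], item["count"])
```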

@@ -469,7 +483,7 @@ def sample_links(db,hostname,status,batch_size):
    for item in res:
        for item in res:
            cll = cl.classify(item["url"])
            cll += random.uniform(-0.1,0.1)
            #cll += random.uniform(-0.1,0.1)
            sample_links.append((item["url"],cll))
            if cll > 0:
                predicted_good += 1
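
The functional change in this hunk appears to be the commented-out jitter line: each classifier score previously received uniform noise in [-0.1, 0.1], so links near the `cll > 0` threshold were occasionally sampled anyway. A sketch of that now-disabled behavior, with a hypothetical helper name:

```python
import random

def jitter(score, eps=0.1):
    # Hypothetical helper illustrating the disabled line: perturb the
    # classifier score so near-threshold links occasionally flip sides.
    return score + random.uniform(-eps, eps)

print(jitter(0.05))  # may land above or below the cll > 0 threshold
```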

@@ -521,6 +535,7 @@ def createdb():
    htmlcol.create_index("html_md5",unique=True)
    domaincol = db["domains"]
    domaincol.create_index("host",unique=True)
    domaincol.create_index("average_fetch_characters",unique=True)

@cli.command()
@click.argument("link")
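
The index added here makes `host` a unique key on the `domains` collection, so inserting a second document for the same host raises `DuplicateKeyError`. A minimal sketch, with an assumed connection string and database name:

```python
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError

client = MongoClient("mongodb://localhost:27017")  # assumed connection string
db = client["crawler"]                             # assumed database name
domaincol = db["domains"]
domaincol.create_index("host", unique=True)

domaincol.insert_one({"host": "example.com"})
try:
    domaincol.insert_one({"host": "example.com"})  # same key again
except DuplicateKeyError:
    print("duplicate host rejected by the unique index")
```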

mongo/requirements.txt (5 lines, Normal file)

@@ -0,0 +1,5 @@
trafilatura
courlan
pymongo
click
lxml