zz
parent 2de0da85a6
commit ddbe79848d
@@ -1,6 +1,7 @@
FROM python:3.9
RUN mkdir /app
COPY requirements.txt /app
RUN pip install -r requirements.txt
RUN pip install -r /app/requirements.txt
COPY *.py /app
WORKDIR /app
ENTRYPOINT ["python", "./mongocrawler.py"]
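The Dockerfile change points pip at /app/requirements.txt (the location requirements.txt is copied to) and sets WORKDIR /app before the ENTRYPOINT starts mongocrawler.py. Judging by the @cli.command() / @click.argument("link") decorators further down, that script exposes a click command-line interface; a minimal sketch of such an entry point follows, with the command name and body purely illustrative.

import click

@click.group()
def cli():
    pass

@cli.command()
@click.argument("link")
def visit(link):
    # hypothetical command body; the real commands live in mongocrawler.py
    print("would crawl", link)

if __name__ == "__main__":
    cli()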
@@ -278,8 +278,7 @@ def get_link_features(link):
    for i,feature in enumerate(features):
        if len(feature) < 1:
            continue
        if feature.isdigit():
            feature = "<NUM>"
        feature = re.sub("[0-9]","*",feature)
        res.append(str(i)+ "-" + feature)
    if len(res) < 2:
        return None
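This hunk appears to drop the whole-token "<NUM>" substitution in favour of masking every digit with "*" via re.sub. A self-contained sketch of the resulting feature extraction, where the path-segment split and the rest of the function body are assumptions not shown in the diff:

import re

def get_link_features(link):
    # assumption: features are the URL path segments after scheme and host
    features = link.split("/")[3:]
    res = []
    for i, feature in enumerate(features):
        if len(feature) < 1:
            continue
        feature = re.sub("[0-9]", "*", feature)  # mask digits in place
        res.append(str(i) + "-" + feature)       # position-tagged token
    if len(res) < 2:
        return None
    return res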
@@ -322,6 +321,9 @@ class LinkClassifier:
    def test(self,testset):
        # eval
        gg = 0
        true_positive = 0
        positive = 0
        false_negative = 0
        for item in testset:
            l = item["url"]
            cl = 0
@@ -331,12 +333,19 @@ class LinkClassifier:
            r = 0
            if pcp > 0:
                r = 1
            if cl == 1:
                if r == 1:
                    true_positive += 1
                positive += 1
            if r == 1 and cl == 0:
                false_negative += 1
            if r == cl:
                gg += 1
            else:
                print("MISS",l,cl,pcp)
        print("Accuracy:")
        print(len(testset))
        print("Precision: {}, Recall: {}".format(true_positive/positive,true_positive/(true_positive+false_negative)))
        print("Accuracy:")
        acc = gg / len(testset)
        print(acc)
        return acc
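The test() changes add true_positive/positive/false_negative counters and print Precision and Recall alongside the existing accuracy. Note that the counter named false_negative is incremented when r == 1 and cl == 0, which is the false-positive case. For reference, the standard definitions over parallel prediction/label lists (helper name illustrative):

def precision_recall_accuracy(preds, labels):
    tp = sum(1 for p, y in zip(preds, labels) if p == 1 and y == 1)
    fp = sum(1 for p, y in zip(preds, labels) if p == 1 and y == 0)
    fn = sum(1 for p, y in zip(preds, labels) if p == 0 and y == 1)
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    accuracy = (sum(1 for p, y in zip(preds, labels) if p == y) / len(labels)) if labels else 0.0
    return precision, recall, accuracy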
@@ -407,6 +416,7 @@ def link_summary(db,hostname):
    goodcount = 0
    info = {}
    crawled_count = 0
    bad_crawl_count = 0
    for item in res:
        count = item["count"]
        st = item["_id"]
@@ -415,7 +425,11 @@ def link_summary(db,hostname):
            goodcount += count
        if st != "frontlink" and st != "backlink":
            crawled_count += count
            if st != "good":
                bad_crawl_count += count
        info[st] = count
    info["crawled_count"] = crawled_count
    info["bad_crawl_count"] = bad_crawl_count
    baclink_cout = 0
    if "backlink" in info:
        backlink_count = info["backlink"]
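link_summary now also derives crawled_count (every status other than frontlink/backlink) and bad_crawl_count (crawled but not good) from the per-status counts. A sketch of that bookkeeping, where the links collection name, the host/status fields and the $group pipeline are assumptions inferred from item["_id"] and item["count"]:

def status_counts(db, hostname):
    # db is assumed to be a pymongo database handle
    linkcol = db["links"]
    res = linkcol.aggregate([
        {"$match": {"host": hostname}},
        {"$group": {"_id": "$status", "count": {"$sum": 1}}},
    ])
    info = {}
    crawled_count = 0
    bad_crawl_count = 0
    for item in res:
        st, count = item["_id"], item["count"]
        if st != "frontlink" and st != "backlink":
            crawled_count += count        # links that were actually fetched
            if st != "good":
                bad_crawl_count += count  # fetched but not usable
        info[st] = count
    info["crawled_count"] = crawled_count
    info["bad_crawl_count"] = bad_crawl_count
    return info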
@@ -469,7 +483,7 @@ def sample_links(db,hostname,status,batch_size):
    for item in res:
        for item in res:
            cll = cl.classify(item["url"])
            cll += random.uniform(-0.1,0.1)
            #cll += random.uniform(-0.1,0.1)
            sample_links.append((item["url"],cll))
            if cll > 0:
                predicted_good += 1
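In sample_links the random jitter added to the classifier score is commented out, so candidate links are now ranked by the raw classifier output alone. A sketch of the same loop with the jitter kept as an explicit, optional parameter (function and parameter names are illustrative):

import random

def score_links(cl, items, jitter=0.0):
    scored = []
    predicted_good = 0
    for item in items:
        cll = cl.classify(item["url"])
        if jitter:
            cll += random.uniform(-jitter, jitter)  # optional exploration noise
        scored.append((item["url"], cll))
        if cll > 0:
            predicted_good += 1
    return scored, predicted_good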
@@ -521,6 +535,7 @@ def createdb():
    htmlcol.create_index("html_md5",unique=True)
    domaincol = db["domains"]
    domaincol.create_index("host",unique=True)
    domaincol.create_index("average_fetch_characters",unique=True)

@cli.command()
@click.argument("link")