Compare commits
No commits in common. "ddbe79848d5225a5093e36cf8c8a5faddbcb4063" and "9a9e8da4cfbe165d961146ba13d65c152dbbc57e" have entirely different histories.
ddbe79848d
...
9a9e8da4cf
@ -1,7 +0,0 @@
|
|||||||
FROM python:3.9
|
|
||||||
RUN mkdir /app
|
|
||||||
COPY requirements.txt /app
|
|
||||||
RUN pip install -r /app/requirements.txt
|
|
||||||
COPY *.py /app
|
|
||||||
WORKDIR /app
|
|
||||||
ENTRYPOINT ["python", "./mongocrawler.py"]
|
|
@ -278,7 +278,8 @@ def get_link_features(link):
|
|||||||
for i,feature in enumerate(features):
|
for i,feature in enumerate(features):
|
||||||
if len(feature) < 1:
|
if len(feature) < 1:
|
||||||
continue
|
continue
|
||||||
feature = re.sub("[0-9]","*",feature)
|
if feature.isdigit():
|
||||||
|
feature = "<NUM>"
|
||||||
res.append(str(i)+ "-" + feature)
|
res.append(str(i)+ "-" + feature)
|
||||||
if len(res) < 2:
|
if len(res) < 2:
|
||||||
return None
|
return None
|
||||||
@ -321,9 +322,6 @@ class LinkClassifier:
|
|||||||
def test(self,testset):
|
def test(self,testset):
|
||||||
# eval
|
# eval
|
||||||
gg = 0
|
gg = 0
|
||||||
true_positive = 0
|
|
||||||
positive = 0
|
|
||||||
false_negative = 0
|
|
||||||
for item in testset:
|
for item in testset:
|
||||||
l = item["url"]
|
l = item["url"]
|
||||||
cl = 0
|
cl = 0
|
||||||
@ -333,19 +331,12 @@ class LinkClassifier:
|
|||||||
r = 0
|
r = 0
|
||||||
if pcp > 0:
|
if pcp > 0:
|
||||||
r = 1
|
r = 1
|
||||||
if cl == 1:
|
|
||||||
if r == 1:
|
|
||||||
true_positive += 1
|
|
||||||
positive += 1
|
|
||||||
if r == 1 and cl == 0:
|
|
||||||
false_negative += 1
|
|
||||||
if r == cl:
|
if r == cl:
|
||||||
gg += 1
|
gg += 1
|
||||||
else:
|
else:
|
||||||
print("MISS",l,cl,pcp)
|
print("MISS",l,cl,pcp)
|
||||||
print(len(testset))
|
|
||||||
print("Precision: {}, Recall: {}".format(true_positive/positive,true_positive/(true_positive+false_negative)))
|
|
||||||
print("Accuracy:")
|
print("Accuracy:")
|
||||||
|
print(len(testset))
|
||||||
acc = gg / len(testset)
|
acc = gg / len(testset)
|
||||||
print(acc)
|
print(acc)
|
||||||
return acc
|
return acc
|
||||||
@ -416,7 +407,6 @@ def link_summary(db,hostname):
|
|||||||
goodcount = 0
|
goodcount = 0
|
||||||
info = {}
|
info = {}
|
||||||
crawled_count = 0
|
crawled_count = 0
|
||||||
bad_crawl_count = 0
|
|
||||||
for item in res:
|
for item in res:
|
||||||
count = item["count"]
|
count = item["count"]
|
||||||
st = item["_id"]
|
st = item["_id"]
|
||||||
@ -425,11 +415,7 @@ def link_summary(db,hostname):
|
|||||||
goodcount += count
|
goodcount += count
|
||||||
if st != "frontlink" and st != "backlink":
|
if st != "frontlink" and st != "backlink":
|
||||||
crawled_count += count
|
crawled_count += count
|
||||||
if st != "good":
|
|
||||||
bad_crawl_count += count
|
|
||||||
info[st] = count
|
info[st] = count
|
||||||
info["crawled_count"] = crawled_count
|
|
||||||
info["bad_crawl_count"] = bad_crawl_count
|
|
||||||
baclink_cout = 0
|
baclink_cout = 0
|
||||||
if "backlink" in info:
|
if "backlink" in info:
|
||||||
backlink_count = info["backlink"]
|
backlink_count = info["backlink"]
|
||||||
@ -483,7 +469,7 @@ def sample_links(db,hostname,status,batch_size):
|
|||||||
for item in res:
|
for item in res:
|
||||||
for item in res:
|
for item in res:
|
||||||
cll = cl.classify(item["url"])
|
cll = cl.classify(item["url"])
|
||||||
#cll += random.uniform(-0.1,0.1)
|
cll += random.uniform(-0.1,0.1)
|
||||||
sample_links.append((item["url"],cll))
|
sample_links.append((item["url"],cll))
|
||||||
if cll > 0:
|
if cll > 0:
|
||||||
predicted_good += 1
|
predicted_good += 1
|
||||||
@ -535,7 +521,6 @@ def createdb():
|
|||||||
htmlcol.create_index("html_md5",unique=True)
|
htmlcol.create_index("html_md5",unique=True)
|
||||||
domaincol = db["domains"]
|
domaincol = db["domains"]
|
||||||
domaincol.create_index("host",unique=True)
|
domaincol.create_index("host",unique=True)
|
||||||
domaincol.create_index("average_fetch_characters",unique=True)
|
|
||||||
|
|
||||||
@cli.command()
|
@cli.command()
|
||||||
@click.argument("link")
|
@click.argument("link")
|
@ -1,5 +0,0 @@
|
|||||||
trafilatura
|
|
||||||
courlan
|
|
||||||
pymongo
|
|
||||||
click
|
|
||||||
lxml
|
|
Loading…
Reference in New Issue
Block a user