diff --git a/mongo/Dockerfile b/mongo/Dockerfile
index 21df1d3..3dfd9ce 100644
--- a/mongo/Dockerfile
+++ b/mongo/Dockerfile
@@ -1,6 +1,7 @@
 FROM python:3.9
 RUN mkdir /app
 COPY requirements.txt /app
-RUN pip install -r requirements.txt
+RUN pip install -r /app/requirements.txt
 COPY *.py /app
 WORKDIR /app
+ENTRYPOINT ["python", "./mongocrawler.py"]
diff --git a/mongo/mongocwarler.py b/mongo/mongocrawler.py
similarity index 96%
rename from mongo/mongocwarler.py
rename to mongo/mongocrawler.py
index db781a0..9eb738d 100644
--- a/mongo/mongocwarler.py
+++ b/mongo/mongocrawler.py
@@ -278,8 +278,7 @@ def get_link_features(link):
     for i,feature in enumerate(features):
         if len(feature) < 1:
             continue
-        if feature.isdigit():
-            feature = ""
+        feature = re.sub("[0-9]","*",feature)
         res.append(str(i)+ "-" + feature)
     if len(res) < 2:
         return None
@@ -322,6 +321,9 @@ class LinkClassifier:
     def test(self,testset):
         # eval
         gg = 0
+        true_positive = 0
+        positive = 0
+        false_negative = 0
         for item in testset:
             l = item["url"]
             cl = 0
@@ -331,12 +333,19 @@ class LinkClassifier:
             r = 0
             if pcp > 0:
                 r = 1
+            if cl == 1:
+                if r == 1:
+                    true_positive += 1
+                positive += 1
+            if r == 1 and cl == 0:
+                false_negative += 1
             if r == cl:
                 gg += 1
             else:
                 print("MISS",l,cl,pcp)
-        print("Accuracy:")
         print(len(testset))
+        print("Precision: {}, Recall: {}".format(true_positive/positive,true_positive/(true_positive+false_negative)))
+        print("Accuracy:")
         acc = gg / len(testset)
         print(acc)
         return acc
@@ -407,6 +416,7 @@ def link_summary(db,hostname):
     goodcount = 0
     info = {}
     crawled_count = 0
+    bad_crawl_count = 0
     for item in res:
         count = item["count"]
         st = item["_id"]
@@ -415,7 +425,11 @@ def link_summary(db,hostname):
             goodcount += count
         if st != "frontlink" and st != "backlink":
             crawled_count += count
+            if st != "good":
+                bad_crawl_count += count
         info[st] = count
+    info["crawled_count"] = crawled_count
+    info["bad_crawl_count"] = bad_crawl_count
     baclink_cout = 0
     if "backlink" in info:
         backlink_count = info["backlink"]
@@ -469,7 +483,7 @@ def sample_links(db,hostname,status,batch_size):
     for item in res:
         for item in res:
             cll = cl.classify(item["url"])
-            cll += random.uniform(-0.1,0.1)
+            #cll += random.uniform(-0.1,0.1)
             sample_links.append((item["url"],cll))
             if cll > 0:
                 predicted_good += 1
@@ -521,6 +535,7 @@ def createdb():
     htmlcol.create_index("html_md5",unique=True)
     domaincol = db["domains"]
     domaincol.create_index("host",unique=True)
+    domaincol.create_index("average_fetch_characters",unique=True)
 
 @cli.command()
 @click.argument("link")
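
Note on the get_link_features change: purely numeric path segments are no longer blanked out; instead every digit is masked with "*", so numeric segments (years, article ids) still contribute a position-tagged token. A minimal sketch of the idea, simplified from the patched function (the URL parsing shown here is an illustrative assumption, not the project's code):

    import re
    from urllib.parse import urlparse

    def link_features(link):
        # Split the URL path into segments and build position-tagged tokens.
        # Digits are masked with "*" instead of dropping the whole segment,
        # so "/news/2023/some-article" still yields a token for "2023".
        segments = [s for s in urlparse(link).path.split("/") if s]
        res = []
        for i, feature in enumerate(segments):
            feature = re.sub("[0-9]", "*", feature)
            res.append(str(i) + "-" + feature)
        return res if len(res) >= 2 else None

    # link_features("https://example.com/news/2023/some-article")
    # -> ['0-news', '1-****', '2-some-article']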
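
Note on the new evaluation counters in LinkClassifier.test: the patch prints precision and recall alongside accuracy. Its false_negative counter is incremented when the prediction is positive and the label is negative, which is conventionally a false positive, so the printed "Precision" and "Recall" values are effectively swapped relative to the standard definitions. A minimal sketch of the conventional bookkeeping, assuming a generic classify callable and a label field (both illustrative, not part of the patch):

    def evaluate(classify, testset):
        # Conventional binary-classification bookkeeping.
        true_positive = false_positive = false_negative = correct = 0
        for item in testset:
            actual = item["label"]                  # 1 = good link, 0 = bad link
            predicted = 1 if classify(item["url"]) > 0 else 0
            if predicted == 1 and actual == 1:
                true_positive += 1
            elif predicted == 1 and actual == 0:
                false_positive += 1                 # predicted good, actually bad
            elif predicted == 0 and actual == 1:
                false_negative += 1                 # good link the classifier missed
            if predicted == actual:
                correct += 1
        # max(..., 1) guards the all-negative / nothing-predicted edge cases.
        precision = true_positive / max(true_positive + false_positive, 1)
        recall = true_positive / max(true_positive + false_negative, 1)
        accuracy = correct / len(testset)
        return precision, recall, accuracy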