From ad52af705b169a1ca2e67bfb72c40a40de3be50b Mon Sep 17 00:00:00 2001
From: Daniel Hladek
Date: Wed, 12 Apr 2023 14:35:35 +0200
Subject: [PATCH] zz

---
 mongo/Dockerfile      |  2 +-
 mongo/cli.py          | 10 +++++---
 mongo/mongocrawler.py | 53 +++++++++++++++++++++++++------------------
 3 files changed, 39 insertions(+), 26 deletions(-)

diff --git a/mongo/Dockerfile b/mongo/Dockerfile
index 8997a47..133ac54 100644
--- a/mongo/Dockerfile
+++ b/mongo/Dockerfile
@@ -2,7 +2,7 @@ FROM python:3.9
 RUN mkdir /app
 COPY requirements.txt /app
 RUN pip install -r /app/requirements.txt
-COPY *.py /app
+COPY *.py /app/
 WORKDIR /app
 # redis config
 ENV REDIS_URL="redis://localhost:6379/"
diff --git a/mongo/cli.py b/mongo/cli.py
index bb508d4..6f27215 100644
--- a/mongo/cli.py
+++ b/mongo/cli.py
@@ -25,9 +25,13 @@ def classify(start_link):
     mongocrawler.classify(start_link)
 
 @cli.command()
-@click.argument("start_link")
-def visit(start_link):
-    mongocrawler.visit(start_link)
+@click.argument("hostname")
+def visit(hostname):
+    mongocrawler.visit(hostname)
+
+@cli.command()
+def summary():
+    mongocrawler.crawl_summary()
 
 if __name__ == "__main__":
     cli()
diff --git a/mongo/mongocrawler.py b/mongo/mongocrawler.py
index 6bcd802..7be8809 100644
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@@ -9,7 +9,8 @@ import trafilatura.external
 import sys
 import courlan
 import urllib
-from datetime import datetime
+from datetime import datetime as dat
+import datetime
 import click
 import logging as LOGGER
 import os
@@ -107,7 +108,7 @@ def get_link_doc(link:str,status="frontlink")->dict:
     assert r is not None
     link,host = r
     domain = courlan.extract_domain(link)
-    return {"url":link,"host":host,"domain":domain,"status":status,"created_at":datetime.utcnow()}
+    return {"url":link,"host":host,"domain":domain,"status":status,"created_at":dat.utcnow()}
 
 
 def fetch_page(link:str)->(str,str):
@@ -256,7 +257,7 @@ def index_pages(db,hostname,extracted_pages):
         batchdoc = {
             "host": linkdoc["host"],
             "domain": linkdoc["domain"],
-            "created_at": datetime.utcnow(),
+            "created_at": dat.utcnow(),
             "good_document_count":good_document_count,
             "document_count":document_count,
             "text_size":text_size,
@@ -280,8 +281,7 @@ def get_bs_links(link,html):
         base = bs.base["href"]
     base = urllib.parse.urlparse(courlan.normalize_url(base))
 
-    external_links = set()
-    internal_links = set()
+    links = set()
     # Link normalization
     for l in bs.find_all("a", href=True):
         if "rel" in l.attrs and l.attrs["rel"] == "nofollow" or "nofollow" in l.attrs:
@@ -305,19 +305,13 @@ def get_bs_links(link,html):
             if path.endswith(")"):
                 # javascript
                 continue
-            external = True
-            if parsed.netloc == base.netloc:
-                external = False
             href = urllib.parse.urlunparse((scheme,netloc,path,"","",""))
             href = courlan.normalize_url(href)
-            if external:
-                external_links.add(href)
-            else:
-                internal_links.add(href)
+            links.add(href)
         except ValueError as err:
             print(err)
             pass
-    return internal_links,external_links
+    return links
 
 def extract_links(link_batch:list,responses:list,hostname:str,rules,default_status="frontlink")->list:
     links = {}
@@ -326,12 +320,12 @@ def extract_links(link_batch:list,responses:list,hostname:str,rules,default_stat
         status = default_status
         if html is None or len(html) < 256:
             continue
-        internal_links, external_links = get_bs_links(final_link,html)
+        page_links = get_bs_links(final_link,html)
         #external_links = courlan.extract_links(html,final_link,external_bool=True,language=LANGUAGE)
         #internal_links = courlan.extract_links(html,final_link,external_bool=False,language=LANGUAGE)
         #print(extracted_links)
-        for link in internal_links:
-            if not is_robot_good(link,rules):
+        for link in page_links:
+            if not courlan.is_external(link,final_link) and not is_robot_good(link,rules):
                 badrobot += 1
                 continue
             status = str(default_status)
@@ -362,7 +356,7 @@ def index_links(db,extracted_links):
             pass
         else:
             print("updating " + link,status)
-            linkcol.update_one({"url":link},{"$set":{"status":status,"updated_at":datetime.utcnow()}})
+            linkcol.update_one({"url":link},{"$set":{"status":status,"updated_at":dat.utcnow()}})
 
 def get_link_features(link):
     a, urlpath = courlan.get_host_and_path(link)
@@ -501,7 +495,7 @@ def link_summary(db,hostname):
     res = linkcol.aggregate([
         {"$match":{"host":hostname}},
         {"$group":{"_id":"$status",
-            "count":{"$count":{}},
+            "count":{"$sum":1},
            }
         },
     ])
@@ -551,7 +545,7 @@ def link_summary(db,hostname):
     info["total_good_characters"] = text_size
     info["average_good_characters"] = good_document_characters
     info["average_fetch_characters"] = fetch_average_characters
-    domaincol = db["domain"]
+    domaincol = db["domains"]
     domaincol.update_one({"host":hostname},{"$set":info},upsert=True)
     res = domaincol.find_one({"host":hostname})
     print(res)
@@ -634,7 +628,7 @@ def createdb():
     htmlcol.create_index("html_md5",unique=True)
     domaincol = db["domains"]
     domaincol.create_index("host",unique=True)
-    domaincol.create_index("average_fetch_characters",unique=True)
+    domaincol.create_index("average_fetch_characters")
     batchcol = db["batches"]
     batchcol.create_index("host")
     batchcol.create_index("created_at")
@@ -684,12 +678,12 @@ def classify(start_link):
     cl.train(trainset)
     cl.test(testset)
 
-def visit(start_link):
+def visit(hostname):
     myclient = pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
-    start_link,hostname = courlan.check_url(start_link)
     batch_size = BATCHSIZE
     rules = fetch_robot(hostname)
+    start_link = "https://" + hostname
     # renew front links
     front_links = fetch_front_links(start_link,rules)
     index_links(db,front_links)
@@ -711,3 +705,18 @@ def visit(start_link):
     index_pages(db,hostname,extracted_pages)
     link_summary(db,hostname)
 
+def crawl_summary():
+    myclient = pymongo.MongoClient(CONNECTION)
+    db=myclient[DBNAME]
+    batchcol = db["batches"]
+    yesterday = dat.utcnow() - datetime.timedelta(days=1)
+    print(yesterday)
+    res = batchcol.find({"created_at":{"$gt": yesterday}},limit=20).sort("average_fetch_characters")
+    print(">>>> Batches")
+    for item in res:
+        print(item["url"],item["average_fetch_characters"])
+    domaincol = db["domains"]
+    print(">>>> Best domains")
+    res = domaincol.find({},limit=100).sort("average_fetch_characters")
+    for item in res:
+        print(item)
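
With this patch, get_bs_links() returns a single set of normalized hrefs and extract_links() decides internal vs. external per link with courlan.is_external(). A minimal standalone sketch of that filter follows; only courlan.is_external and the page_links/final_link names come from the patch, the URLs and values are made-up examples:

    import courlan

    final_link = "https://example.com/index.html"   # hypothetical fetched page
    page_links = {
        "https://example.com/about",                 # same domain as final_link
        "https://other.org/news",                    # external domain
    }

    # keep only links that courlan does not consider external to the fetched page
    internal = [l for l in page_links if not courlan.is_external(l, final_link)]
    print(internal)   # -> ['https://example.com/about']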