From 8a91f88d739cbd78b5ea222e35bb79d307e00ace Mon Sep 17 00:00:00 2001
From: Daniel Hladek
Date: Sun, 23 Apr 2023 10:02:52 +0200
Subject: [PATCH 1/3] zz

---
 mongo/cli.py          |  3 ++-
 mongo/mongocrawler.py | 15 +++++++++------
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/mongo/cli.py b/mongo/cli.py
index d3ebcaa..66cc1e7 100644
--- a/mongo/cli.py
+++ b/mongo/cli.py
@@ -32,9 +32,10 @@ def classify(start_link):
     mongocrawler.classify(start_link)
 
 @cli.command()
-@click.argument("hostname",help="Hostname to crawl")
+@click.argument("hostname")
 @click.option("--filter_content",default=True,help="Filter content")
 def visit(hostname,filter_content=True):
+    """ Visit and crawl the given hostname """
     mongocrawler.visit(hostname,filter_content=filter_content)
 
 @cli.command()
diff --git a/mongo/mongocrawler.py b/mongo/mongocrawler.py
index ba14123..2180ad2 100644
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@@ -99,7 +99,9 @@ def split_train(res):
 
 def calculate_checksums(text):
     """
-    @return fingerprints of a paragraphs in text. Paragraphs are separated by a blank line
+    Paragraph separation must be compatible with the text extraction. Are paragraphs separated by a blank line or a newline?
+
+    @return fingerprints of the paragraphs in the text. Paragraphs are separated by a newline.
     """
     checksums = []
     sizes = []
@@ -153,11 +155,11 @@ def is_link_good(link):
     return llink
 
 def get_link_doc(link:str,status="frontlink")->dict:
-    r = courlan.check_url(link)
-    assert r is not None
-    link,host = r
-    domain = courlan.extract_domain(link)
-    return {"url":link,"host":host,"domain":domain,"status":status,"created_at":dat.utcnow()}
+    parsed = urllib.parse.urlparse(courlan.normalize_url(link))
+    url = urllib.parse.urlunparse(parsed)
+    tokens = parsed.netloc.split(".")
+    domain = tokens[-2] + "." + tokens[-1]
+    return {"url":link,"host":parsed.netloc,"domain":domain,"status":status,"created_at":dat.utcnow()}
 
 def fetch_page(link:str)->(str,str):
 
@@ -738,6 +740,7 @@ def import_html():
     for l in sys.stdin:
         hdoc = json.loads(l)
         url = hdoc["url"]
+        # BeautifulSoup is used to unify the encoding
         html = BeautifulSoup(binascii.a2b_qp(hdoc["quoted_html"])).prettify()
         doc = extract_page(url,html)
         if doc is not None:
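A note on the get_link_doc rewrite in the hunk above: taking the last two dot-separated labels of the netloc only approximates the registrable domain, whereas the replaced courlan.extract_domain call was presumably TLD-aware. A minimal standalone sketch of the new logic; the helper name naive_domain is hypothetical:

    import urllib.parse

    def naive_domain(link: str) -> str:
        # Mirrors the patch's logic: approximate the registrable domain
        # with the last two dot-separated host labels.
        netloc = urllib.parse.urlparse(link).netloc
        tokens = netloc.split(".")
        return tokens[-2] + "." + tokens[-1]

    print(naive_domain("https://www.example.com/page"))  # -> example.com
    # Caveat: "https://www.bbc.co.uk/" yields "co.uk", and a single-label
    # host such as "http://localhost/" raises IndexError.
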
From f78a64d4e80b68ec78acc8609129cb28e164fde3 Mon Sep 17 00:00:00 2001
From: Daniel Hladek
Date: Thu, 27 Apr 2023 07:29:15 +0200
Subject: [PATCH 2/3] zz

---
 mongo/mongocrawler.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mongo/mongocrawler.py b/mongo/mongocrawler.py
index 2180ad2..3ab4478 100644
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@@ -209,7 +209,7 @@ def fetch_robot(base_url:str)->urllib.robotparser.RobotFileParser:
 def extract_page(final_link,html):
     doc = None
     if html is not None:
-        doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=False,target_language=LANGUAGE,favor_precision=True)
+        doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=True,target_language=LANGUAGE,favor_precision=True)
     if doc is not None:
         if not "text" in doc or len(doc["text"]) < MIN_TEXT_SIZE:
             # text too small
@@ -619,7 +619,7 @@ def createdb():
     htmlcol.create_index("html_md5",unique=True)
     domaincol = db["domains"]
     domaincol.create_index("host",unique=True)
-    domaincol.create_index(("average_fetch_characters",pymongo.DESCENDING))
+    domaincol.create_index([("average_fetch_characters",pymongo.DESCENDING)])
     batchcol = db["batches"]
     batchcol.create_index("host")
     batchcol.create_index("created_at")
@@ -744,6 +744,7 @@ def import_html():
         html = BeautifulSoup(binascii.a2b_qp(hdoc["quoted_html"])).prettify()
         doc = extract_page(url,html)
         if doc is not None:
+            print("------=====-")
             print(doc)
             status = index_page(db,url,url,html,doc)
             print(status)
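Context for the create_index change above: pymongo's Collection.create_index takes either a single key name or a list of (key, direction) pairs, so the old bare tuple was misread as a sequence of separate keys and failed. A minimal sketch of the corrected call; the connection string and the database name "crawler" are illustrative assumptions:

    import pymongo

    client = pymongo.MongoClient("mongodb://localhost:27017/")  # assumed local instance
    domaincol = client["crawler"]["domains"]

    # A directional index must be a list of (key, direction) pairs;
    # create_index(("average_fetch_characters", pymongo.DESCENDING)) is rejected.
    domaincol.create_index([("average_fetch_characters", pymongo.DESCENDING)])
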
From f5a3b03874c472494230cfd97b69af47dc57dac9 Mon Sep 17 00:00:00 2001
From: Daniel Hladek
Date: Sun, 30 Apr 2023 13:54:36 +0200
Subject: [PATCH 3/3] zz

---
 mongo/cli.py          | 11 ++++++++++-
 mongo/mongocrawler.py |  9 ++++++++-
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/mongo/cli.py b/mongo/cli.py
index 66cc1e7..8dc2d7f 100644
--- a/mongo/cli.py
+++ b/mongo/cli.py
@@ -9,26 +9,35 @@ REDIS_URL= os.getenv("REDIS_URL","redis://localhost:6379/")
 QUEUES=os.getenv("QUEUES","high,default,low")
 
 @click.group()
-def cli():
+@click.option("--dbname",default=mongocrawler.DBNAME,help="database to use")
+def cli(dbname):
+    mongocrawler.DBNAME = dbname
     pass
 
 @cli.command()
 def createdb():
     mongocrawler.createdb()
 
+@cli.command()
+def dropdb():
+    mongocrawler.dropdb()
+
 @cli.command()
 @click.argument("link")
 def parseurl(link):
+    """ Parse the document at the given link, for debugging """
     mongocrawler.parseurl(link)
 
 @cli.command()
 @click.argument("link")
 def externaldomains(link):
+    """ Extract external domains from the given link """
     mongocrawler.externaldomains(link)
 
 @cli.command()
 @click.argument("start_link")
 def classify(start_link):
+    """ Classify links found at the given start link, for debugging """
     mongocrawler.classify(start_link)
 
 @cli.command()
diff --git a/mongo/mongocrawler.py b/mongo/mongocrawler.py
index 3ab4478..24e9ef1 100644
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@@ -87,6 +87,7 @@ def get_bs_links(link,html):
             print(err)
             pass
     return links
+
 def split_train(res):
     trainset = []
     testset = []
@@ -209,7 +210,7 @@ def fetch_robot(base_url:str)->urllib.robotparser.RobotFileParser:
 def extract_page(final_link,html):
     doc = None
     if html is not None:
-        doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=True,target_language=LANGUAGE,favor_precision=True)
+        doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=False,target_language=LANGUAGE,favor_precision=True)
     if doc is not None:
         if not "text" in doc or len(doc["text"]) < MIN_TEXT_SIZE:
             # text too small
@@ -602,6 +603,12 @@ def domain_summary(db,hostname):
     for item in res:
         print(item)
 
+def dropdb():
+    myclient = pymongo.MongoClient(CONNECTION)
+    print("Write the name of the database to drop:")
+    dbname = sys.stdin.readline().strip()
+    myclient.drop_database(dbname)
+
 def createdb():
     myclient = pymongo.MongoClient(CONNECTION)
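Patch 2 turned include_formatting on and this patch turns it back off, so the final extract_page call is worth seeing in isolation. A minimal sketch, assuming a trafilatura version where bare_extraction returns a dict (as the surrounding code expects); the LANGUAGE and MIN_TEXT_SIZE values below are stand-ins for the module constants:

    import trafilatura

    LANGUAGE = "sk"        # stand-in for the module constant
    MIN_TEXT_SIZE = 200    # stand-in for the module constant

    html = "<html><body><p>" + "Dnes je pekny den. " * 30 + "</p></body></html>"
    doc = trafilatura.bare_extraction(html, url="https://example.com/",
                                      with_metadata=True, include_formatting=False,
                                      target_language=LANGUAGE, favor_precision=True)
    if doc is None or len(doc.get("text", "")) < MIN_TEXT_SIZE:
        print("text too small")   # mirrors the size check in extract_page
    else:
        print(doc["text"][:80])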