From f5a3b03874c472494230cfd97b69af47dc57dac9 Mon Sep 17 00:00:00 2001 From: Daniel Hladek Date: Sun, 30 Apr 2023 13:54:36 +0200 Subject: [PATCH] zz --- mongo/cli.py | 11 ++++++++++- mongo/mongocrawler.py | 9 ++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/mongo/cli.py b/mongo/cli.py index 66cc1e7..8dc2d7f 100644 --- a/mongo/cli.py +++ b/mongo/cli.py @@ -9,26 +9,35 @@ REDIS_URL= os.getenv("REDIS_URL","redis://localhost:6379/") QUEUES=os.getenv("QUEUES","high,default,low") @click.group() -def cli(): +@click.option("--dbname",default=mongocrawler.DBNAME,help="database to use") +def cli(dbname): + DBNAME=dbname pass @cli.command() def createdb(): mongocrawler.createdb() +@cli.command() +def dropdb(): + mongocrawler.dropdb() + @cli.command() @click.argument("link") def parseurl(link): + """ Parse document on link for debug """ mongocrawler.parseurl(link) @cli.command() @click.argument("link") def externaldomains(link): + """ Extract external domains from link """ mongocrawler.externaldomains(link) @cli.command() @click.argument("start_link") def classify(start_link): + """ domain to to classify links for debug """ mongocrawler.classify(start_link) @cli.command() diff --git a/mongo/mongocrawler.py b/mongo/mongocrawler.py index 3ab4478..24e9ef1 100644 --- a/mongo/mongocrawler.py +++ b/mongo/mongocrawler.py @@ -87,6 +87,7 @@ def get_bs_links(link,html): print(err) pass return links + def split_train(res): trainset = [] testset = [] @@ -209,7 +210,7 @@ def fetch_robot(base_url:str)->urllib.robotparser.RobotFileParser: def extract_page(final_link,html): doc = None if html is not None: - doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=True,target_language=LANGUAGE,favor_precision=True) + doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=False,target_language=LANGUAGE,favor_precision=True) if doc is not None: if not "text" in doc or len(doc["text"]) < MIN_TEXT_SIZE: # text too small @@ -602,6 +603,12 @@ def domain_summary(db,hostname): for item in res: print(item) +def dropdb(): + myclient = pymongo.MongoClient(CONNECTION) + print("write name of database to drop") + dbname = sys.stdin.readline().strip() + myclient.drop_database(dbname) + def createdb(): myclient = pymongo.MongoClient(CONNECTION)