This commit is contained in:
Daniel Hládek 2023-04-30 13:54:36 +02:00
parent f78a64d4e8
commit f5a3b03874
2 changed files with 18 additions and 2 deletions

View File

@ -9,26 +9,35 @@ REDIS_URL= os.getenv("REDIS_URL","redis://localhost:6379/")
QUEUES=os.getenv("QUEUES","high,default,low") QUEUES=os.getenv("QUEUES","high,default,low")
@click.group()
@click.option("--dbname", default=mongocrawler.DBNAME, help="database to use")
def cli(dbname):
    """Command group entry point; selects the database the sub-commands use.

    Args:
        dbname: Name of the database; defaults to ``mongocrawler.DBNAME``.
    """
    # A bare ``DBNAME = dbname`` only binds a function-local name that is
    # thrown away on return, so the option would have no effect. Store the
    # selection on the crawler module instead.
    # NOTE(review): assumes mongocrawler reads its module-level DBNAME at
    # call time rather than capturing it at import -- confirm.
    mongocrawler.DBNAME = dbname
@cli.command() @cli.command()
def createdb(): def createdb():
mongocrawler.createdb() mongocrawler.createdb()
@cli.command()
def dropdb():
    """ Drop a database; delegates to mongocrawler.dropdb() """
    mongocrawler.dropdb()
@cli.command() @cli.command()
@click.argument("link") @click.argument("link")
def parseurl(link): def parseurl(link):
""" Parse document on link for debug """
mongocrawler.parseurl(link) mongocrawler.parseurl(link)
@cli.command() @cli.command()
@click.argument("link") @click.argument("link")
def externaldomains(link): def externaldomains(link):
""" Extract external domains from link """
mongocrawler.externaldomains(link) mongocrawler.externaldomains(link)
@cli.command() @cli.command()
@click.argument("start_link") @click.argument("start_link")
def classify(start_link): def classify(start_link):
""" domain to classify links for debug """
mongocrawler.classify(start_link) mongocrawler.classify(start_link)
@cli.command() @cli.command()

View File

@ -87,6 +87,7 @@ def get_bs_links(link,html):
print(err) print(err)
pass pass
return links return links
def split_train(res): def split_train(res):
trainset = [] trainset = []
testset = [] testset = []
@ -209,7 +210,7 @@ def fetch_robot(base_url:str)->urllib.robotparser.RobotFileParser:
def extract_page(final_link,html): def extract_page(final_link,html):
doc = None doc = None
if html is not None: if html is not None:
doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=True,target_language=LANGUAGE,favor_precision=True) doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=False,target_language=LANGUAGE,favor_precision=True)
if doc is not None: if doc is not None:
if not "text" in doc or len(doc["text"]) < MIN_TEXT_SIZE: if not "text" in doc or len(doc["text"]) < MIN_TEXT_SIZE:
# text too small # text too small
@ -602,6 +603,12 @@ def domain_summary(db,hostname):
for item in res: for item in res:
print(item) print(item)
def dropdb():
    """Interactively drop a MongoDB database.

    Prints a prompt, reads the database name from standard input and drops
    that database on the server addressed by ``CONNECTION``. An empty answer
    (including end of input) aborts without contacting the server.
    """
    print("write name of database to drop")
    dbname = sys.stdin.readline().strip()
    if not dbname:
        # Nothing was typed (or stdin is closed): there is nothing to drop,
        # and an empty name is not a usable database name anyway.
        print("no database name given, nothing dropped")
        return
    myclient = pymongo.MongoClient(CONNECTION)
    try:
        myclient.drop_database(dbname)
    finally:
        # Release the client's connections even when the drop fails.
        myclient.close()
def createdb(): def createdb():
myclient = pymongo.MongoClient(CONNECTION) myclient = pymongo.MongoClient(CONNECTION)