This commit is contained in:
Daniel Hládek 2023-04-30 13:54:36 +02:00
parent f78a64d4e8
commit f5a3b03874
2 changed files with 18 additions and 2 deletions

View File

@@ -9,26 +9,35 @@ REDIS_URL= os.getenv("REDIS_URL","redis://localhost:6379/")
# Queue names as one comma-separated string; the QUEUES environment
# variable overrides the default "high,default,low".
QUEUES=os.getenv("QUEUES","high,default,low")
@click.group()
@click.option("--dbname",default=mongocrawler.DBNAME,help="database to use")
def cli(dbname):
    # Root command group; every subcommand registered with @cli.command()
    # runs after this callback, so the database choice is applied first.
    #
    # A plain `DBNAME = dbname` here would only bind a function-local name
    # and be discarded on return, leaving --dbname without any effect.
    # Rebind the attribute on the mongocrawler module instead.
    # NOTE(review): assumes mongocrawler looks up its module-level DBNAME at
    # call time (not captured earlier, e.g. in default arguments) -- confirm.
    mongocrawler.DBNAME = dbname
@cli.command()
def createdb():
    # CLI entry point that simply forwards to mongocrawler.createdb().
    mongocrawler.createdb()
@cli.command()
def dropdb():
    """ Drop a database, its name is read from standard input """
    # The docstring above is what click shows as this command's help text;
    # the sibling commands already have one, this destructive one did not.
    mongocrawler.dropdb()
@cli.command()
@click.argument("link")
def parseurl(link):
    """ Parse document on link for debug """
    # Takes one positional LINK argument and hands it to
    # mongocrawler.parseurl() unchanged.
    mongocrawler.parseurl(link)
@cli.command()
@click.argument("link")
def externaldomains(link):
    """ Extract external domains from link """
    # Takes one positional LINK argument and hands it to
    # mongocrawler.externaldomains() unchanged.
    mongocrawler.externaldomains(link)
@cli.command()
@click.argument("start_link")
def classify(start_link):
    """ domain to classify links for debug """
    # Takes one positional START_LINK argument and hands it to
    # mongocrawler.classify() unchanged. The docstring doubles as the
    # command's help text, so the duplicated word was removed from it.
    mongocrawler.classify(start_link)
@cli.command()

View File

@@ -87,6 +87,7 @@ def get_bs_links(link,html):
print(err)
pass
return links
def split_train(res):
trainset = []
testset = []
@@ -209,7 +210,7 @@ def fetch_robot(base_url:str)->urllib.robotparser.RobotFileParser:
def extract_page(final_link,html):
doc = None
if html is not None:
doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=True,target_language=LANGUAGE,favor_precision=True)
doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=False,target_language=LANGUAGE,favor_precision=True)
if doc is not None:
if not "text" in doc or len(doc["text"]) < MIN_TEXT_SIZE:
# text too small
@@ -602,6 +603,12 @@ def domain_summary(db,hostname):
for item in res:
print(item)
def dropdb():
    """Interactively drop a MongoDB database.

    Prints a prompt, reads one line from standard input, strips it and
    drops the database of that name on the server addressed by CONNECTION.
    If the line is empty (or stdin is at EOF) nothing is dropped.
    """
    print("write name of database to drop")
    dbname = sys.stdin.readline().strip()
    if not dbname:
        # readline() returns "" at EOF and strip() empties a blank line;
        # there is no database to drop in either case, so stop here
        # instead of handing an empty name to the driver.
        print("no database name given, nothing dropped")
        return
    # Use the client as a context manager so it is closed once the drop
    # has been issued, rather than being left open.
    with pymongo.MongoClient(CONNECTION) as myclient:
        myclient.drop_database(dbname)
def createdb():
myclient = pymongo.MongoClient(CONNECTION)