zz
This commit is contained in:
parent
f78a64d4e8
commit
f5a3b03874
11
mongo/cli.py
11
mongo/cli.py
@ -9,26 +9,35 @@ REDIS_URL= os.getenv("REDIS_URL","redis://localhost:6379/")
|
|||||||
QUEUES=os.getenv("QUEUES","high,default,low")
|
QUEUES=os.getenv("QUEUES","high,default,low")
|
||||||
|
|
||||||
@click.group()
|
@click.group()
|
||||||
def cli():
|
@click.option("--dbname",default=mongocrawler.DBNAME,help="database to use")
|
||||||
|
def cli(dbname):
|
||||||
|
DBNAME=dbname
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@cli.command()
|
@cli.command()
|
||||||
def createdb():
|
def createdb():
|
||||||
mongocrawler.createdb()
|
mongocrawler.createdb()
|
||||||
|
|
||||||
|
@cli.command()
|
||||||
|
def dropdb():
|
||||||
|
mongocrawler.dropdb()
|
||||||
|
|
||||||
@cli.command()
|
@cli.command()
|
||||||
@click.argument("link")
|
@click.argument("link")
|
||||||
def parseurl(link):
|
def parseurl(link):
|
||||||
|
""" Parse document on link for debug """
|
||||||
mongocrawler.parseurl(link)
|
mongocrawler.parseurl(link)
|
||||||
|
|
||||||
@cli.command()
|
@cli.command()
|
||||||
@click.argument("link")
|
@click.argument("link")
|
||||||
def externaldomains(link):
|
def externaldomains(link):
|
||||||
|
""" Extract external domains from link """
|
||||||
mongocrawler.externaldomains(link)
|
mongocrawler.externaldomains(link)
|
||||||
|
|
||||||
@cli.command()
|
@cli.command()
|
||||||
@click.argument("start_link")
|
@click.argument("start_link")
|
||||||
def classify(start_link):
|
def classify(start_link):
|
||||||
|
""" domain to to classify links for debug """
|
||||||
mongocrawler.classify(start_link)
|
mongocrawler.classify(start_link)
|
||||||
|
|
||||||
@cli.command()
|
@cli.command()
|
||||||
|
@ -87,6 +87,7 @@ def get_bs_links(link,html):
|
|||||||
print(err)
|
print(err)
|
||||||
pass
|
pass
|
||||||
return links
|
return links
|
||||||
|
|
||||||
def split_train(res):
|
def split_train(res):
|
||||||
trainset = []
|
trainset = []
|
||||||
testset = []
|
testset = []
|
||||||
@ -209,7 +210,7 @@ def fetch_robot(base_url:str)->urllib.robotparser.RobotFileParser:
|
|||||||
def extract_page(final_link,html):
|
def extract_page(final_link,html):
|
||||||
doc = None
|
doc = None
|
||||||
if html is not None:
|
if html is not None:
|
||||||
doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=True,target_language=LANGUAGE,favor_precision=True)
|
doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=False,target_language=LANGUAGE,favor_precision=True)
|
||||||
if doc is not None:
|
if doc is not None:
|
||||||
if not "text" in doc or len(doc["text"]) < MIN_TEXT_SIZE:
|
if not "text" in doc or len(doc["text"]) < MIN_TEXT_SIZE:
|
||||||
# text too small
|
# text too small
|
||||||
@ -602,6 +603,12 @@ def domain_summary(db,hostname):
|
|||||||
for item in res:
|
for item in res:
|
||||||
print(item)
|
print(item)
|
||||||
|
|
||||||
|
def dropdb():
|
||||||
|
myclient = pymongo.MongoClient(CONNECTION)
|
||||||
|
print("write name of database to drop")
|
||||||
|
dbname = sys.stdin.readline().strip()
|
||||||
|
myclient.drop_database(dbname)
|
||||||
|
|
||||||
|
|
||||||
def createdb():
|
def createdb():
|
||||||
myclient = pymongo.MongoClient(CONNECTION)
|
myclient = pymongo.MongoClient(CONNECTION)
|
||||||
|
Loading…
Reference in New Issue
Block a user