zz
commit a7d048c952

mongo/cli.py: 14 changes
@@ -9,32 +9,42 @@ REDIS_URL= os.getenv("REDIS_URL","redis://localhost:6379/")
 QUEUES=os.getenv("QUEUES","high,default,low")
 
 @click.group()
-def cli():
+@click.option("--dbname",default=mongocrawler.DBNAME,help="database to use")
+def cli(dbname):
+    DBNAME=dbname
     pass
 
 @cli.command()
 def createdb():
     mongocrawler.createdb()
 
+@cli.command()
+def dropdb():
+    mongocrawler.dropdb()
+
 @cli.command()
 @click.argument("link")
 def parseurl(link):
+    """ Parse document on link for debug """
     mongocrawler.parseurl(link)
 
 @cli.command()
 @click.argument("link")
 def externaldomains(link):
+    """ Extract external domains from link """
     mongocrawler.externaldomains(link)
 
 @cli.command()
 @click.argument("start_link")
 def classify(start_link):
+    """ Domain to classify links for debug """
     mongocrawler.classify(start_link)
 
 @cli.command()
-@click.argument("hostname",help="Hostname to crawl")
+@click.argument("hostname")
 @click.option("--filter_content",default=True,help="Filter content")
 def visit(hostname,filter_content=True):
+    """ Hostname to crawl """
     mongocrawler.visit(hostname,filter_content=filter_content)
 
 @cli.command()
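Note on the cli() change above: as written, DBNAME=dbname only binds a local name inside cli(), so the selected database is not visibly propagated anywhere. Below is a minimal sketch of one common way a group-level --dbname option is wired through to the crawler module; assigning to mongocrawler.DBNAME is an assumption for illustration, not what this commit does.

    # Sketch only: consuming a group-level --dbname option.
    # Assumption: mongocrawler exposes a module-level DBNAME that its functions read.
    import click
    import mongocrawler

    @click.group()
    @click.option("--dbname", default=mongocrawler.DBNAME, help="database to use")
    def cli(dbname):
        # overwrite the module-level default so later subcommands pick it up
        mongocrawler.DBNAME = dbname

    @cli.command()
    def createdb():
        mongocrawler.createdb()

    if __name__ == "__main__":
        cli()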
@@ -87,6 +87,7 @@ def get_bs_links(link,html):
         print(err)
         pass
     return links
+
 def split_train(res):
     trainset = []
     testset = []
@@ -99,7 +100,9 @@ def split_train(res):
 
 def calculate_checksums(text):
     """
-    @return fingerprints of a paragraphs in text. Paragraphs are separated by a blank line
+    Paragraph separation must be compatible with text extraction. Are paragraphs separated with a blank line or a white line?
+
+    @return fingerprints of paragraphs in text. Paragraphs are separated by a new line.
     """
     checksums = []
     sizes = []
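The updated docstring only hints at the implementation. A hypothetical sketch of what it describes, assuming newline-separated paragraphs, an md5-style fingerprint, and a parallel list of sizes; the project's real calculate_checksums may differ.

    # Hypothetical sketch, not the project's code: fingerprint each non-empty
    # newline-separated paragraph and record its length.
    import hashlib

    def paragraph_checksums(text):
        checksums, sizes = [], []
        for par in text.split("\n"):
            par = par.strip()
            if not par:
                continue
            checksums.append(hashlib.md5(par.encode("utf8")).hexdigest())
            sizes.append(len(par))
        return checksums, sizes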
@@ -153,11 +156,11 @@ def is_link_good(link):
     return llink
 
 def get_link_doc(link:str,status="frontlink")->dict:
-    r = courlan.check_url(link)
-    assert r is not None
-    link,host = r
-    domain = courlan.extract_domain(link)
-    return {"url":link,"host":host,"domain":domain,"status":status,"created_at":dat.utcnow()}
+    parsed = urllib.parse.urlparse(courlan.normalize_url(link))
+    url = urllib.parse.urlunparse(parsed)
+    tokens = parsed.netloc.split(".")
+    domain = tokens[-2] + "." + tokens[-1]
+    return {"url":link,"host":parsed.netloc,"domain":domain,"status":status,"created_at":dat.utcnow()}
 
 
 def fetch_page(link:str)->(str,str):
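The rewritten get_link_doc derives host and domain with urllib instead of courlan.check_url. Worth noting: joining the last two netloc tokens is a naive registered-domain heuristic (for a host like example.co.uk it yields "co.uk"), whereas courlan.extract_domain is suffix-aware. A small sketch of the new parsing path; the length guard is an added assumption to avoid an IndexError on bare hostnames.

    # Sketch of the urllib-based parsing now used in get_link_doc.
    import urllib.parse
    import courlan

    def host_and_domain(link):
        parsed = urllib.parse.urlparse(courlan.normalize_url(link))
        tokens = parsed.netloc.split(".")
        # naive heuristic, mirrors the commit; guard added as an assumption
        domain = tokens[-2] + "." + tokens[-1] if len(tokens) >= 2 else parsed.netloc
        return parsed.netloc, domain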
@@ -600,6 +603,12 @@ def domain_summary(db,hostname):
     for item in res:
         print(item)
 
+def dropdb():
+    myclient = pymongo.MongoClient(CONNECTION)
+    print("write name of database to drop")
+    dbname = sys.stdin.readline().strip()
+    myclient.drop_database(dbname)
+
 
 def createdb():
     myclient = pymongo.MongoClient(CONNECTION)
@@ -617,7 +626,7 @@ def createdb():
     htmlcol.create_index("html_md5",unique=True)
     domaincol = db["domains"]
     domaincol.create_index("host",unique=True)
-    domaincol.create_index(("average_fetch_characters",pymongo.DESCENDING))
+    domaincol.create_index([("average_fetch_characters",pymongo.DESCENDING)])
     batchcol = db["batches"]
     batchcol.create_index("host")
     batchcol.create_index("created_at")
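The one-line change above matters because PyMongo's create_index expects either a single key name or a list of (key, direction) pairs; a bare (key, direction) tuple is not accepted as an index specification, so the list form is the documented way to create a single-field descending index. A minimal sketch; the connection string and database name are assumptions.

    # Sketch: single-field descending index using the list-of-pairs form.
    # "mongodb://localhost:27017/" and the "crawler" db name are assumptions.
    import pymongo

    client = pymongo.MongoClient("mongodb://localhost:27017/")
    domaincol = client["crawler"]["domains"]
    domaincol.create_index([("average_fetch_characters", pymongo.DESCENDING)])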
@@ -745,9 +754,11 @@ def import_html():
     for l in sys.stdin:
         hdoc = json.loads(l)
         url = hdoc["url"]
+        # beautifulsoup is to unify encoding
         html = BeautifulSoup(binascii.a2b_qp(hdoc["quoted_html"])).prettify()
         doc = extract_page(url,html)
         if doc is not None:
+            print("------=====-")
             print(doc)
             status = index_page(db,url,url,html,doc)
             print(status)
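The added comment documents the decode step: quoted-printable HTML is turned back into bytes with binascii.a2b_qp and re-serialized through BeautifulSoup so the stored HTML shares one consistent encoding. A small sketch of that step in isolation; the explicit "html.parser" argument and the final print are assumptions added here.

    # Sketch of the decode/normalize step from import_html.
    import binascii
    import json
    import sys

    from bs4 import BeautifulSoup

    for line in sys.stdin:
        hdoc = json.loads(line)
        raw = binascii.a2b_qp(hdoc["quoted_html"])
        # an explicit parser avoids bs4's "no parser specified" warning
        html = BeautifulSoup(raw, "html.parser").prettify()
        print(hdoc["url"], len(html))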