commit a7d048c952
Daniel Hládek, 2023-05-12 08:11:33 +02:00
2 changed files with 30 additions and 9 deletions

File 1:

@@ -9,32 +9,42 @@ REDIS_URL= os.getenv("REDIS_URL","redis://localhost:6379/")
 QUEUES=os.getenv("QUEUES","high,default,low")
 
 @click.group()
-def cli():
+@click.option("--dbname",default=mongocrawler.DBNAME,help="database to use")
+def cli(dbname):
+    DBNAME=dbname
     pass
 
 @cli.command()
 def createdb():
     mongocrawler.createdb()
 
+@cli.command()
+def dropdb():
+    mongocrawler.dropdb()
+
 @cli.command()
 @click.argument("link")
 def parseurl(link):
+    """ Parse the document at link, for debugging """
     mongocrawler.parseurl(link)
 
 @cli.command()
 @click.argument("link")
 def externaldomains(link):
+    """ Extract external domains from link """
     mongocrawler.externaldomains(link)
 
 @cli.command()
 @click.argument("start_link")
 def classify(start_link):
+    """ Classify links from start_link, for debugging """
     mongocrawler.classify(start_link)
 
 @cli.command()
-@click.argument("hostname",help="Hostname to crawl")
+@click.argument("hostname")
 @click.option("--filter_content",default=True,help="Filter content")
 def visit(hostname,filter_content=True):
+    """ Crawl the given hostname """
     mongocrawler.visit(hostname,filter_content=filter_content)
 
 @cli.command()
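One note on the new --dbname group option: inside cli(), the bare assignment DBNAME=dbname only binds a name local to the callback, so the subcommands implemented in mongocrawler never see the chosen value. A minimal sketch of a pattern that does propagate the option (the mongocrawler import and its DBNAME attribute mirror the diff; the rest is illustrative, not the commit's code):

    import click

    import mongocrawler  # assumed to expose DBNAME and the command implementations

    @click.group()
    @click.option("--dbname", default=mongocrawler.DBNAME, help="database to use")
    def cli(dbname):
        # Setting the attribute on the consuming module makes the choice
        # visible to every subcommand; a plain DBNAME = dbname would not.
        mongocrawler.DBNAME = dbname

    @cli.command()
    def createdb():
        mongocrawler.createdb()

    if __name__ == "__main__":
        cli()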

File 2:

@@ -87,6 +87,7 @@ def get_bs_links(link,html):
             print(err)
             pass
     return links
+
 def split_train(res):
     trainset = []
     testset = []
@@ -99,7 +100,9 @@ def split_train(res):
 
 def calculate_checksums(text):
     """
-    @return fingerprints of a paragraphs in text. Paragraphs are separated by a blank line
+    Paragraph separation must be compatible with text extraction. Are paragraphs separated by a blank line or a whitespace-only line?
+
+    @return fingerprints of the paragraphs in text. Paragraphs are separated by a new line.
     """
     checksums = []
     sizes = []
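The diff changes the documented paragraph separator from a blank line to a single new line and leaves the extraction-compatibility question open. The body of calculate_checksums is not part of this hunk; a minimal sketch of per-paragraph fingerprinting consistent with the new docstring (the MD5 choice is an assumption, not taken from the commit):

    import hashlib

    def calculate_checksums(text):
        """Return (checksums, sizes) for the newline-separated paragraphs of text."""
        checksums = []
        sizes = []
        for paragraph in text.split("\n"):
            paragraph = paragraph.strip()
            if not paragraph:
                continue  # skip empty and whitespace-only lines
            checksums.append(hashlib.md5(paragraph.encode("utf8")).hexdigest())
            sizes.append(len(paragraph))
        return checksums, sizes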
@@ -153,11 +156,11 @@ def is_link_good(link):
     return llink
 
 def get_link_doc(link:str,status="frontlink")->dict:
-    r = courlan.check_url(link)
-    assert r is not None
-    link,host = r
-    domain = courlan.extract_domain(link)
-    return {"url":link,"host":host,"domain":domain,"status":status,"created_at":dat.utcnow()}
+    parsed = urllib.parse.urlparse(courlan.normalize_url(link))
+    url = urllib.parse.urlunparse(parsed)
+    tokens = parsed.netloc.split(".")
+    domain = tokens[-2] + "." + tokens[-1]
+    return {"url":url,"host":parsed.netloc,"domain":domain,"status":status,"created_at":dat.utcnow()}
 
 def fetch_page(link:str)->(str,str):
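The rewrite replaces courlan.check_url with plain urllib.parse and derives the domain from the last two labels of the netloc. That heuristic is worth flagging: it works for www.example.com, but it returns the public suffix for hosts like bbc.co.uk and raises IndexError for single-label hosts such as localhost. A quick demonstration:

    import urllib.parse

    for link in ("https://www.example.com/a?x=1", "https://bbc.co.uk/news"):
        parsed = urllib.parse.urlparse(link)
        tokens = parsed.netloc.split(".")
        print(parsed.netloc, "->", tokens[-2] + "." + tokens[-1])
    # www.example.com -> example.com
    # bbc.co.uk -> co.uk   (a public-suffix list is needed for true registrable domains)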
@@ -600,6 +603,12 @@ def domain_summary(db,hostname):
     for item in res:
         print(item)
 
+def dropdb():
+    myclient = pymongo.MongoClient(CONNECTION)
+    print("Write the name of the database to drop:")
+    dbname = sys.stdin.readline().strip()
+    myclient.drop_database(dbname)
+
 def createdb():
     myclient = pymongo.MongoClient(CONNECTION)
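Reading the database name from stdin instead of taking it as an argument is a simple guard: the operator has to type the name before drop_database destroys it. A slightly stricter variant (hypothetical, not in the commit) refuses to proceed unless the typed name matches the configured one:

    import sys

    import pymongo

    CONNECTION = "mongodb://localhost:27017/"  # assumed value of the module's CONNECTION
    DBNAME = "crawler"  # hypothetical configured database name

    def dropdb():
        myclient = pymongo.MongoClient(CONNECTION)
        print("Write the name of the database to drop:")
        dbname = sys.stdin.readline().strip()
        if dbname != DBNAME:
            print("Name does not match the configured database, aborting.")
            return
        myclient.drop_database(dbname)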
@@ -617,7 +626,7 @@ def createdb():
     htmlcol.create_index("html_md5",unique=True)
     domaincol = db["domains"]
     domaincol.create_index("host",unique=True)
-    domaincol.create_index(("average_fetch_characters",pymongo.DESCENDING))
+    domaincol.create_index([("average_fetch_characters",pymongo.DESCENDING)])
     batchcol = db["batches"]
     batchcol.create_index("host")
     batchcol.create_index("created_at")
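The bracket change is a real fix: PyMongo's create_index accepts either a single key name as a string or a list of (key, direction) pairs, and a bare tuple raises a TypeError. For example:

    import pymongo

    client = pymongo.MongoClient("mongodb://localhost:27017/")  # assumed connection string
    domaincol = client["crawler"]["domains"]  # collection name mirrors the diff; database name is assumed

    domaincol.create_index("host", unique=True)  # a plain string works for a single ascending key
    domaincol.create_index([("average_fetch_characters", pymongo.DESCENDING)])  # a direction requires a list of pairs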
@@ -745,9 +754,11 @@ def import_html():
     for l in sys.stdin:
         hdoc = json.loads(l)
         url = hdoc["url"]
+        # BeautifulSoup is used to unify the encoding
         html = BeautifulSoup(binascii.a2b_qp(hdoc["quoted_html"])).prettify()
         doc = extract_page(url,html)
         if doc is not None:
+            print("------=====-")
             print(doc)
             status = index_page(db,url,url,html,doc)
             print(status)
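The new comment documents why the HTML takes a round trip through BeautifulSoup: the stored page is quoted-printable encoded, binascii.a2b_qp recovers the raw bytes, and parsing plus prettify() re-serializes everything as Unicode text regardless of the page's original encoding. A self-contained check of that round trip (values are illustrative):

    import binascii

    from bs4 import BeautifulSoup

    raw = "<html><body><p>caf\u00e9</p></body></html>".encode("utf8")
    quoted = binascii.b2a_qp(raw)  # stands in for hdoc["quoted_html"]
    html = BeautifulSoup(binascii.a2b_qp(quoted), "html.parser").prettify()
    print(type(html), html)  # prettify() returns str, so downstream code sees uniform Unicode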