zz
This commit is contained in:
parent
01645b8862
commit
8a91f88d73
@ -32,9 +32,10 @@ def classify(start_link):
|
||||
mongocrawler.classify(start_link)
|
||||
|
||||
@cli.command()
|
||||
@click.argument("hostname",help="Hostname to crawl")
|
||||
@click.argument("hostname")
|
||||
@click.option("--filter_content",default=True,help="Filter content")
|
||||
def visit(hostname,filter_content=True):
|
||||
""" Hostname to crawl """
|
||||
mongocrawler.visit(hostname,filter_content=filter_content)
|
||||
|
||||
@cli.command()
|
||||
|
@ -99,7 +99,9 @@ def split_train(res):
|
||||
|
||||
def calculate_checksums(text):
|
||||
"""
|
||||
@return fingerprints of a paragraphs in text. Paragraphs are separated by a blank line
|
||||
Paragraph separation must be compatible with text extraction. Are paragraphs separated with a blank line or a white line?
|
||||
|
||||
@return fingerprints of the paragraphs in text. Paragraphs are separated by a new line.
|
||||
"""
|
||||
checksums = []
|
||||
sizes = []
|
||||
@ -153,11 +155,11 @@ def is_link_good(link):
|
||||
return llink
|
||||
|
||||
def get_link_doc(link:str,status="frontlink")->dict:
|
||||
r = courlan.check_url(link)
|
||||
assert r is not None
|
||||
link,host = r
|
||||
domain = courlan.extract_domain(link)
|
||||
return {"url":link,"host":host,"domain":domain,"status":status,"created_at":dat.utcnow()}
|
||||
parsed = urllib.parse.urlparse(courlan.normalize_url(link))
|
||||
url = urllib.parse.urlunparse(parsed)
|
||||
tokens = parsed.netloc.split(".")
|
||||
domain = tokens[-2] + "." + tokens[-1]
|
||||
return {"url":link,"host":parsed.netloc,"domain":domain,"status":status,"created_at":dat.utcnow()}
|
||||
|
||||
|
||||
def fetch_page(link:str)->(str,str):
|
||||
@ -738,6 +740,7 @@ def import_html():
|
||||
for l in sys.stdin:
|
||||
hdoc = json.loads(l)
|
||||
url = hdoc["url"]
|
||||
# BeautifulSoup is used to unify encoding
|
||||
html = BeautifulSoup(binascii.a2b_qp(hdoc["quoted_html"])).prettify()
|
||||
doc = extract_page(url,html)
|
||||
if doc is not None:
|
||||
|
Loading…
Reference in New Issue
Block a user