zz
This commit is contained in:
parent
5d45569651
commit
5b887a13c7
27
mongo/cli.py
27
mongo/cli.py
@ -91,15 +91,36 @@ def fetchlinks(start_link):
|
||||
@click.argument("hostname")
|
||||
def processlinks(hostname):
    """Fetch the links given on standard input and save the results under ``data/``.

    One link is read per line from stdin.  The links are passed, together
    with the robot rules obtained for *hostname*, to
    ``mongocrawler.fetch_and_extract``.  The command then writes:

    * ``data/data.jsonl`` -- one JSON document per extracted page,
    * ``data/html/<percent-encoded url>`` -- the HTML of each page,
    * ``data/extracted.links`` -- the links read from stdin, one per line.

    Args:
        hostname: Host name passed to ``mongocrawler.fetch_robot``.
    """
    # Imported once here instead of on every loop iteration; kept local
    # because the module-level imports of this file are not guaranteed to
    # include it.
    import urllib.parse

    rules = mongocrawler.fetch_robot(hostname)

    dname = "data"
    outfile = dname + "/data.jsonl"
    loutfile = dname + "/extracted.links"
    htmldir = dname + "/html/"

    # Creates both "data" and "data/html" and does not fail when the
    # command is run a second time in the same working directory.
    os.makedirs(htmldir, exist_ok=True)

    links = [line.rstrip() for line in sys.stdin]
    extracted_pages, extracted_links = mongocrawler.fetch_and_extract(links, rules)

    # save extracted text
    with open(outfile, "w", encoding="utf-8") as of:
        for url, html, doc in extracted_pages:
            # Keep the requested URL when the extracted document reports a
            # different one; otherwise record the requested URL as "url".
            if "url" in doc and doc["url"] != url:
                doc["original_url"] = url
            else:
                doc["url"] = url
            # safe="" also escapes "/" so the whole URL becomes a single
            # file name inside htmldir.
            # NOTE(review): very long URLs may exceed the file system's
            # file-name length limit -- confirm whether that needs handling.
            hname = htmldir + urllib.parse.quote(url, safe="")
            doc["html_filename"] = hname
            with open(hname, "w", encoding="utf-8") as hf:
                print(html, file=hf)
            print(json.dumps(doc), file=of)

    # save extracted links
    # NOTE(review): this writes the *input* links; ``extracted_links`` is
    # never used.  The file name suggests ``extracted_links`` was intended,
    # but its element shape is not visible here -- confirm before changing.
    with open(loutfile, "w", encoding="utf-8") as of:
        for link in links:
            print(link, file=of)
|
||||
|
||||
|
||||
@cli.command(help="Enqueue a list of links into redis queue for crawling")
|
||||
|
@ -673,7 +673,6 @@ def index_pages(db,hostname,extracted_pages,filter_content):
|
||||
save_batch_info(db,hostname,final_states,docs)
|
||||
|
||||
def fetch_and_extract(links,rules):
|
||||
print("Processing links")
|
||||
responses = []
|
||||
for link in links:
|
||||
responses.append(fetch_page(link))
|
||||
|
Loading…
Reference in New Issue
Block a user