zz
commit 5b887a13c7
parent 5d45569651

mongo/cli.py (27 changed lines)
@@ -91,15 +91,36 @@ def fetchlinks(start_link):
 @click.argument("hostname")
 def processlinks(hostname):
     rules = mongocrawler.fetch_robot(hostname)
-    outfile = "data.jsonl"
+    dname = "data"
+    outfile = dname + "/data.jsonl"
+    loutfile = dname + "/extracted.links"
+    htmldir = dname + "/html/"
     links = []
+    os.mkdir(dname)
+    os.mkdir(htmldir)
     for line in sys.stdin:
         links.append(line.rstrip())
     extracted_pages, extracted_links = mongocrawler.fetch_and_extract(links,rules)
+    # save extracted text
     with open(outfile,"w") as of:
         for page in extracted_pages:
-            doc = json.dumps(page)
-            print(page,file=of)
+            url,html,doc = page
+            if "url" in doc and doc["url"] != url:
+                doc["original_url"] = url
+            else:
+                doc["url"] = url
+            import urllib.parse
+            hname = htmldir + urllib.parse.quote(url,safe="")
+            doc["html_filename"] = hname
+            with open(hname,"w") as hf:
+                print(html,file=hf)
+            ddoc = json.dumps(doc)
+            print(ddoc,file=of)
+
+    # save extracted links
+    with open(loutfile,"w") as of:
+        for link in links:
+            print(link,file=of)
+
+
 @cli.command(help="Enqueue a list of links into redis queue for crawling")
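
The rewritten save loop unpacks each extracted page as a (url, html, doc) tuple, reconciles the fetched URL with the one already in the document, stores the raw HTML under a percent-encoded filename, and writes the enriched doc as one JSON line. A minimal sketch of the filename mapping, for illustration only (the example URL is hypothetical, not from the commit):

    import urllib.parse

    url = "https://example.com/a/b?q=1"  # hypothetical input URL
    # safe="" percent-encodes every reserved character, including "/",
    # so the whole URL collapses into a single flat filename
    hname = "data/html/" + urllib.parse.quote(url, safe="")
    print(hname)
    # data/html/https%3A%2F%2Fexample.com%2Fa%2Fb%3Fq%3D1
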
@@ -673,7 +673,6 @@ def index_pages(db,hostname,extracted_pages,filter_content):
     save_batch_info(db,hostname,final_states,docs)
 
 def fetch_and_extract(links,rules):
-    print("Processing links")
     responses = []
     for link in links:
         responses.append(fetch_page(link))
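
Note that os.mkdir raises FileExistsError when data/ or data/html/ already exists, so processlinks as committed only runs cleanly against a fresh working directory. A sketch of an idempotent alternative (a suggestion, not part of this commit):

    import os

    dname = "data"
    htmldir = os.path.join(dname, "html")
    # makedirs creates parent and child in one call; exist_ok=True
    # makes reruns a no-op instead of an error
    os.makedirs(htmldir, exist_ok=True)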