diff --git a/mongo/cli.py b/mongo/cli.py
index 0475f63..4e48c49 100644
--- a/mongo/cli.py
+++ b/mongo/cli.py
@@ -91,15 +91,40 @@ def fetchlinks(start_link):
 @click.argument("hostname")
 def processlinks(hostname):
     rules = mongocrawler.fetch_robot(hostname)
-    outfile = "data.jsonl"
+    dname = "data"
+    outfile = dname + "/data.jsonl"
+    loutfile = dname + "/extracted.links"
+    htmldir = dname + "/html/"
     links = []
+    # exist_ok avoids a crash when the output directories already exist
+    os.makedirs(dname, exist_ok=True)
+    os.makedirs(htmldir, exist_ok=True)
     for line in sys.stdin:
         links.append(line.rstrip())
     extracted_pages, extracted_links = mongocrawler.fetch_and_extract(links,rules)
+    import urllib.parse
+
+    # save extracted text
     with open(outfile,"w") as of:
         for page in extracted_pages:
-            doc = json.dumps(page)
-            print(page,file=of)
+            url,html,doc = page
+            # record the fetched URL (as original_url if the doc already has a different one)
+            if "url" in doc and doc["url"] != url:
+                doc["original_url"] = url
+            else:
+                doc["url"] = url
+            # store the raw HTML under a percent-encoded, filesystem-safe name
+            hname = htmldir + urllib.parse.quote(url,safe="")
+            doc["html_filename"] = hname
+            with open(hname,"w") as hf:
+                print(html,file=hf)
+            ddoc = json.dumps(doc)
+            print(ddoc,file=of)
+
+    # save extracted links
+    with open(loutfile,"w") as of:
+        for link in extracted_links:
+            print(link,file=of)
 
 
 @cli.command(help="Enqueue a list of links into redis queue for crawling")
diff --git a/mongo/mongocrawler.py b/mongo/mongocrawler.py
index e323ca3..73e19e3 100644
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@@ -673,7 +673,6 @@ def index_pages(db,hostname,extracted_pages,filter_content):
     save_batch_info(db,hostname,final_states,docs)
 
 def fetch_and_extract(links,rules):
-    print("Processing links")
     responses = []
     for link in links:
         responses.append(fetch_page(link))
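
A quick sketch (not part of the patch) of the filename scheme the new code relies on: urllib.parse.quote(url, safe="") percent-encodes every reserved character, so each fetched URL maps to a single flat file under data/html/ and the original URL can be recovered with unquote(). The URL below is a made-up example.

    import urllib.parse

    url = "https://example.com/a?q=1"  # hypothetical example URL
    hname = "data/html/" + urllib.parse.quote(url, safe="")
    print(hname)  # data/html/https%3A%2F%2Fexample.com%2Fa%3Fq%3D1
    # the mapping is reversible, so the source URL can be read back off the file name
    assert urllib.parse.unquote(hname.removeprefix("data/html/")) == url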