zz
commit 5b887a13c7
parent 5d45569651

mongo/cli.py (27 changed lines)
@@ -91,15 +91,36 @@ def fetchlinks(start_link):
 @click.argument("hostname")
 def processlinks(hostname):
     rules = mongocrawler.fetch_robot(hostname)
-    outfile = "data.jsonl"
+    dname = "data"
+    outfile = dname + "/data.jsonl"
+    loutfile = dname + "/extracted.links"
+    htmldir = dname + "/html/"
     links = []
+    os.mkdir(dname)
+    os.mkdir(htmldir)
     for line in sys.stdin:
         links.append(line.rstrip())
     extracted_pages, extracted_links = mongocrawler.fetch_and_extract(links,rules)
+    # save extracted text
     with open(outfile,"w") as of:
         for page in extracted_pages:
-            doc = json.dumps(page)
-            print(page,file=of)
+            url,html,doc = page
+            if "url" in doc and doc["url"] != url:
+                doc["original_url"] = url
+            else:
+                doc["url"] = url
+            import urllib.parse
+            hname = htmldir + urllib.parse.quote(url,safe="")
+            doc["html_filename"] = hname
+            with open(hname,"w") as hf:
+                print(html,file=hf)
+            ddoc = json.dumps(doc)
+            print(ddoc,file=of)
+
+    # save extracted links
+    with open(loutfile,"w") as of:
+        for link in links:
+            print(link,file=of)
+
+
 @cli.command(help="Enqueue a list of links into redis queue for crawling")
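
The rewritten save loop unpacks each extracted page as a (url, html, doc) tuple, reconciles the fetched URL with the one already in the document, stores the raw HTML under a percent-encoded filename, and writes the enriched doc as one JSON line. A minimal sketch of the filename mapping, for illustration only (the example URL is hypothetical, not from the commit):

    import urllib.parse

    url = "https://example.com/a/b?q=1"  # hypothetical input URL
    # safe="" percent-encodes every reserved character, including "/",
    # so the whole URL collapses into a single flat filename
    hname = "data/html/" + urllib.parse.quote(url, safe="")
    print(hname)
    # data/html/https%3A%2F%2Fexample.com%2Fa%2Fb%3Fq%3D1
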
@@ -673,7 +673,6 @@ def index_pages(db,hostname,extracted_pages,filter_content):
     save_batch_info(db,hostname,final_states,docs)
 
 def fetch_and_extract(links,rules):
-    print("Processing links")
     responses = []
     for link in links:
         responses.append(fetch_page(link))
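
Note that os.mkdir raises FileExistsError when data/ or data/html/ already exists, so processlinks as committed only runs cleanly against a fresh working directory. A sketch of an idempotent alternative (a suggestion, not part of this commit):

    import os

    dname = "data"
    htmldir = os.path.join(dname, "html")
    # makedirs creates parent and child in one call; exist_ok=True
    # makes reruns a no-op instead of an error
    os.makedirs(htmldir, exist_ok=True)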