This commit is contained in:
Daniel Hládek 2024-03-21 19:36:59 +01:00
parent 5d45569651
commit 5b887a13c7
2 changed files with 24 additions and 4 deletions

View File

@@ -91,15 +91,36 @@ def fetchlinks(start_link):
 @click.argument("hostname")
 def processlinks(hostname):
     rules = mongocrawler.fetch_robot(hostname)
-    outfile = "data.jsonl"
+    dname = "data"
+    outfile = dname + "/data.jsonl"
+    loutfile = dname + "/extracted.links"
+    htmldir = dname + "/html/"
     links = []
+    os.mkdir(dname)
+    os.mkdir(htmldir)
     for line in sys.stdin:
         links.append(line.rstrip())
     extracted_pages, extracted_links = mongocrawler.fetch_and_extract(links,rules)
     # save extracted text
     with open(outfile,"w") as of:
         for page in extracted_pages:
-            doc = json.dumps(page)
-            print(page,file=of)
+            url,html,doc = page
+            if "url" in doc and doc["url"] != url:
+                doc["original_url"] = url
+            else:
+                doc["url"] = url
+            import urllib.parse
+            hname = htmldir + urllib.parse.quote(url,safe="")
+            doc["html_filename"] = hname
+            with open(hname,"w") as hf:
+                print(html,file=hf)
+            ddoc = json.dumps(doc)
+            print(ddoc,file=of)
+    # save extracted links
+    with open(loutfile,"w") as of:
+        for link in links:
+            print(link,file=of)
 
 @cli.command(help="Enqueue a list of links into redis queue for crawling")
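
For reference, a minimal sketch (not part of the commit) of the new per-page output scheme introduced above: each extracted document becomes one JSON line in data/data.jsonl, and the raw HTML is written under data/html/ using the percent-encoded URL as the file name; urllib.parse.quote with safe="" escapes every reserved character, so the name contains no path separators. The URL and field values below are illustrative only.

import json
import urllib.parse

url = "https://example.com/a/b?x=1"   # illustrative URL, not taken from the commit
hname = "data/html/" + urllib.parse.quote(url, safe="")
# hname == "data/html/https%3A%2F%2Fexample.com%2Fa%2Fb%3Fx%3D1"

doc = {"url": url, "html_filename": hname}
print(json.dumps(doc))                # one JSONL record per extracted page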

View File

@@ -673,7 +673,6 @@ def index_pages(db,hostname,extracted_pages,filter_content):
    save_batch_info(db,hostname,final_states,docs)
def fetch_and_extract(links,rules):
    print("Processing links")
    responses = []
    for link in links:
        responses.append(fetch_page(link))
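
Only the first lines of fetch_and_extract are visible in this hunk. Judging from how processlinks consumes its result in the first file, the function is expected to return a pair (extracted_pages, extracted_links), where each page unpacks as a (url, html, doc) tuple. The stub below is a hypothetical stand-in illustrating that shape only; the real fetching and extraction happens inside mongocrawler.

def fetch_and_extract_stub(links, rules):
    # Hypothetical stand-in, not the project's implementation: it only mimics
    # the return shape that processlinks unpacks above.
    extracted_pages = []
    extracted_links = []
    for link in links:
        html = "<html><body>placeholder</body></html>"  # real code fetches the page
        doc = {"text": "placeholder text"}              # real code runs the extractor
        extracted_pages.append((link, html, doc))
    return extracted_pages, extracted_links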