From 5b887a13c76a7a3a8fdf99600843d154dd6e5ce0 Mon Sep 17 00:00:00 2001
From: Daniel Hladek
Date: Thu, 21 Mar 2024 19:36:59 +0100
Subject: [PATCH] Save extracted text, raw HTML and links into a data directory

---
 mongo/cli.py          | 27 ++++++++++++++++++++++++---
 mongo/mongocrawler.py |  1 -
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/mongo/cli.py b/mongo/cli.py
index 0475f63..4e48c49 100644
--- a/mongo/cli.py
+++ b/mongo/cli.py
@@ -91,15 +91,36 @@ def fetchlinks(start_link):
 @click.argument("hostname")
 def processlinks(hostname):
     rules = mongocrawler.fetch_robot(hostname)
-    outfile = "data.jsonl"
+    dname = "data"
+    outfile = dname + "/data.jsonl"
+    loutfile = dname + "/extracted.links"
+    htmldir = dname + "/html/"
     links = []
+    os.mkdir(dname)
+    os.mkdir(htmldir)
     for line in sys.stdin:
         links.append(line.rstrip())
     extracted_pages, extracted_links = mongocrawler.fetch_and_extract(links,rules)
+    # save extracted text
     with open(outfile,"w") as of:
         for page in extracted_pages:
-            doc = json.dumps(page)
-            print(page,file=of)
+            url,html,doc = page
+            if "url" in doc and doc["url"] != url:
+                doc["original_url"] = url
+            else:
+                doc["url"] = url
+            import urllib.parse
+            hname = htmldir + urllib.parse.quote(url,safe="")
+            doc["html_filename"] = hname
+            with open(hname,"w") as hf:
+                print(html,file=hf)
+            ddoc = json.dumps(doc)
+            print(ddoc,file=of)
+
+    # save extracted links
+    with open(loutfile,"w") as of:
+        for link in links:
+            print(link,file=of)
 
 
 @cli.command(help="Enqueue a list of links into redis queue for crawling")
diff --git a/mongo/mongocrawler.py b/mongo/mongocrawler.py
index e323ca3..73e19e3 100644
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@@ -673,7 +673,6 @@ def index_pages(db,hostname,extracted_pages,filter_content):
     save_batch_info(db,hostname,final_states,docs)
 
 def fetch_and_extract(links,rules):
-    print("Processing links")
     responses = []
     for link in links:
         responses.append(fetch_page(link))
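
Notes on the cli.py hunk: os.mkdir(dname) raises FileExistsError when the
command is re-run, "import urllib.parse" sits inside the per-page loop, and
the block commented "save extracted links" writes the stdin input `links`
rather than the `extracted_links` returned by fetch_and_extract, which looks
unintended. Below is a minimal sketch of the same command with those points
addressed; it assumes only the fetch_robot / fetch_and_extract signatures
visible in the diff and elides the Click decorators that precede the hunk.

import json
import os
import sys
import urllib.parse

import mongocrawler

# The @cli.command(...) / @click.argument("hostname") decorators from
# cli.py are assumed to wrap this function exactly as in the patch.
def processlinks(hostname):
    rules = mongocrawler.fetch_robot(hostname)   # per-host robots.txt rules
    dname = "data"
    htmldir = os.path.join(dname, "html")
    os.makedirs(htmldir, exist_ok=True)          # safe on re-runs; creates "data" too

    links = [line.rstrip() for line in sys.stdin]
    extracted_pages, extracted_links = mongocrawler.fetch_and_extract(links, rules)

    # Save extracted text as JSON Lines, one document per page.
    with open(os.path.join(dname, "data.jsonl"), "w") as of:
        for url, html, doc in extracted_pages:
            if "url" in doc and doc["url"] != url:
                doc["original_url"] = url        # extractor reported a different URL
            else:
                doc["url"] = url
            # Percent-encode the whole URL so it is usable as a filename.
            hname = os.path.join(htmldir, urllib.parse.quote(url, safe=""))
            doc["html_filename"] = hname
            with open(hname, "w") as hf:
                hf.write(html)
            print(json.dumps(doc), file=of)

    # Save the links discovered during extraction (the patch iterates over
    # the stdin `links` here instead).
    with open(os.path.join(dname, "extracted.links"), "w") as of:
        for link in extracted_links:
            print(link, file=of)

One caveat with quote(url, safe="") as a filename: very long URLs can exceed
the usual 255-byte filename limit on common filesystems, so hashing the URL
into the filename is a common fallback when that matters.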
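
For reference, urllib.parse.quote with safe="" percent-encodes every reserved
character, including "/", so the result contains no path separators and the
original URL stays recoverable with unquote:

import urllib.parse

url = "https://example.com/a/b?q=1"
fname = urllib.parse.quote(url, safe="")
print(fname)                                # https%3A%2F%2Fexample.com%2Fa%2Fb%3Fq%3D1
assert urllib.parse.unquote(fname) == url   # round-trips back to the URL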