zz

2022-02-18 13:58:05 +01:00 · 2022-02-18 13:58:05 +01:00 · d37bafb666
commit d37bafb666
parent b75336c8a6
1 changed file with 24 additions and 0 deletions
--- a/websucker/dump.py
+++ b/websucker/dump.py
@@ -4,6 +4,7 @@ import cassandra.query
 import json
 import datetime
 import sys
+from urllib.parse import urlparse

 cassandra_host = sys.argv[1]
 cassandra_port = sys.argv[2]
@@ -16,9 +17,30 @@ select_documents = "select json * from content"
 with cluster.connect(keyspace) as session:
    #session.row_factory = cassandra.query.dict_factory
    select_html = session.prepare("select json * from html where day=? and domain_name=? LIMIT 1")
+    select_link = session.prepare("select link_status from links where domain_name=? and url_path=? and url_query=? LIMIT 1")
    rows = session.execute(select_documents)
    for row in rows:
        doc = json.loads(row["[json]"])
+        target_link = doc["target_link"]
+        parsed_link = urlparse(target_link)
+        netloc = parsed_link[1].strip().lower()
+        path = parsed_link[2].strip()
+        # strip leading /
+        if len(path) > 1 and path[0] == "/":
+            path = path[1:]
+        query = parsed_link[4]
+        lrows = session.execute(select_link,(netloc,path,query))
+        status = None
+        for l in lrows:
+            status = str(l["link_status"])
+            break
+        #assert status is not None
+        if status is None:
+            continue
+        #print(status)
+        # skip bad links
+        if not status == "good":
+            continue
        dt = doc["update_time"]
        d = dt.split()[0]
        hrows = session.execute(select_html,(d,doc["domain_name"]))
@@ -27,4 +49,6 @@ with cluster.connect(keyspace) as session:
            html = json.loads(h["[json]"])
            break
        doc["html_data"] = html
+        del doc["links"]
+        #print(parsed_link)
        print(json.dumps(doc))