Dump only documents whose target link is recorded as "good".

For every row read from the content table, look up the status of the
document's target link in the links table and skip the document unless that
status is "good". The "links" field is removed from each document before it
is printed.

NOTE(review): the line breaks of this patch were reconstructed from the hunk
line counts and Python syntax; the indentation width and the position of the
blank context line in the first hunk are assumptions -- confirm against
websucker/dump.py before applying.

diff --git a/websucker/dump.py b/websucker/dump.py
--- a/websucker/dump.py
+++ b/websucker/dump.py
@@ -4,6 +4,7 @@ import cassandra.query
 import json
 import datetime
 import sys
+from urllib.parse import urlparse
 
 cassandra_host = sys.argv[1]
 cassandra_port = sys.argv[2]
@@ -16,9 +17,24 @@ select_documents = "select json * from content"
 with cluster.connect(keyspace) as session:
     #session.row_factory = cassandra.query.dict_factory
     select_html = session.prepare("select json * from html where day=? and domain_name=? LIMIT 1")
+    select_link = session.prepare("select link_status from links where domain_name=? and url_path=? and url_query=? LIMIT 1")
     rows = session.execute(select_documents)
     for row in rows:
         doc = json.loads(row["[json]"])
+        # Look up the status recorded in the links table for the document's
+        # target link, addressed by host name, path and query string.
+        parsed_link = urlparse(doc["target_link"])
+        netloc = parsed_link.netloc.strip().lower()
+        path = parsed_link.path.strip()
+        # strip leading / (a bare "/" is kept unchanged)
+        if len(path) > 1 and path[0] == "/":
+            path = path[1:]
+        query = parsed_link.query
+        lrows = session.execute(select_link,(netloc,path,query))
+        link_row = next(iter(lrows), None)
+        # skip documents without a link record and documents with bad links
+        if link_row is None or str(link_row["link_status"]) != "good":
+            continue
         dt = doc["update_time"]
         d = dt.split()[0]
         hrows = session.execute(select_html,(d,doc["domain_name"]))
@@ -27,4 +43,6 @@ with cluster.connect(keyspace) as session:
             html = json.loads(h["[json]"])
             break
         doc["html_data"] = html
+        # drop the "links" field from the dumped document
+        doc.pop("links", None)
         print(json.dumps(doc))