This commit is contained in:
Daniel Hládek 2022-02-18 13:58:05 +01:00
parent b75336c8a6
commit d37bafb666

View File

@@ -4,6 +4,7 @@ import cassandra.query
import json
import datetime
import sys
from urllib.parse import urlparse
# Connection settings are read positionally from the command line:
# argv[1] is stored as cassandra_host and argv[2] as cassandra_port.
# Both values are strings as delivered by sys.argv; an IndexError is raised
# here if either argument is missing.
cassandra_host = sys.argv[1]
cassandra_port = sys.argv[2]
@@ -16,9 +17,30 @@ select_documents = "select json * from content"
# Walk every row returned by select_documents and keep only the documents
# whose target link is stored in the links table with status "good".
with cluster.connect(keyspace) as session:
    # NOTE(review): rows are indexed by column name below, which needs a
    # dict-style row factory. The assignment was already commented out in the
    # original, so it is presumably configured elsewhere -- confirm.
    # session.row_factory = cassandra.query.dict_factory
    select_html = session.prepare("select json * from html where day=? and domain_name=? LIMIT 1")
    select_link = session.prepare("select link_status from links where domain_name=? and url_path=? and url_query=? LIMIT 1")
    rows = session.execute(select_documents)
    for row in rows:
        doc = json.loads(row["[json]"])

        # Split the target link into the three values bound to select_link.
        target_link = doc["target_link"]
        parsed_link = urlparse(target_link)
        netloc = parsed_link.netloc.strip().lower()
        path = parsed_link.path.strip()
        # Drop one leading slash, but leave a bare "/" untouched.
        if len(path) > 1 and path.startswith("/"):
            path = path[1:]
        query = parsed_link.query

        # The statement is LIMIT 1, so only the first returned row matters.
        link_row = next(
            iter(session.execute(select_link, (netloc, path, query))),
            None,
        )
        status = None if link_row is None else str(link_row["link_status"])
        # Skip documents whose link has no row or is not marked "good".
        if status != "good":
            continue

        # The first whitespace-separated token of update_time is bound to the
        # "day" parameter of select_html.
        d = doc["update_time"].split()[0]
        hrows = session.execute(select_html, (d, doc["domain_name"]))
@@ -27,4 +49,6 @@ with cluster.connect(keyspace) as session:
# NOTE(review): this hunk begins inside a loop whose header is not part of
# the visible diff (presumably iterating hrows as h) -- confirm against the
# full file.
# Decode the JSON text of the html row and stop after the first one.
html = json.loads(h["[json]"])
break
# Attach the decoded html row to the document under "html_data".
doc["html_data"] = html
# Remove the "links" entry before output; raises KeyError if it is absent.
del doc["links"]
#print(parsed_link)
# Write the document to stdout as a single line of JSON.
print(json.dumps(doc))