"""Dump the websucker ``content`` table to stdout as JSON lines.

Usage: dump.py CASSANDRA_HOST CASSANDRA_PORT

Every row of ``content`` is printed as one JSON object per line.  Each
object gets an extra ``html_data`` key holding the first matching row of
the ``html`` table, or an empty object when no such row exists.
"""
import datetime
import json
import sys

import cassandra
import cassandra.cluster
import cassandra.query

keyspace = "websucker"

# Request timeout, in seconds, of the default execution profile.
REQUEST_TIMEOUT = 240.0

SELECT_DOCUMENTS = "select json * from content"
SELECT_HTML = "select json * from html where day=? and domain_name=? LIMIT 1"


def _make_cluster(host, port):
    """Return an unconnected Cluster whose rows come back as dicts."""
    profile = cassandra.cluster.ExecutionProfile(
        request_timeout=REQUEST_TIMEOUT,
        row_factory=cassandra.query.dict_factory,
    )
    profiles = {cassandra.cluster.EXEC_PROFILE_DEFAULT: profile}
    return cassandra.cluster.Cluster(
        [host], port=port, execution_profiles=profiles
    )


def _dump_documents(session):
    """Print every ``content`` row, joined with its html row, as JSON."""
    select_html = session.prepare(SELECT_HTML)
    for row in session.execute(SELECT_DOCUMENTS):
        # ``select json`` returns the whole row in a single "[json]" column.
        doc = json.loads(row["[json]"])
        # The html lookup uses the first whitespace-separated token of
        # update_time as its ``day`` value.
        day = doc["update_time"].split()[0]
        html_row = session.execute(
            select_html, (day, doc["domain_name"])
        ).one()
        if html_row is None:
            doc["html_data"] = {}
        else:
            doc["html_data"] = json.loads(html_row["[json]"])
        print(json.dumps(doc))


def main(argv=None):
    """Parse ``HOST PORT`` from *argv* (default ``sys.argv[1:]``) and dump.

    Exits with a message on stderr when the arguments are missing or the
    port is not an integer.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        sys.exit("usage: dump.py CASSANDRA_HOST CASSANDRA_PORT")
    cassandra_host, port_text = args[0], args[1]
    try:
        # Command-line arguments are strings; the driver expects an int port.
        cassandra_port = int(port_text)
    except ValueError:
        sys.exit("invalid port: {!r}".format(port_text))

    cluster = _make_cluster(cassandra_host, cassandra_port)
    try:
        with cluster.connect(keyspace) as session:
            _dump_documents(session)
    finally:
        # Closing the session alone leaves the cluster's connections open.
        cluster.shutdown()


if __name__ == "__main__":
    main()