This commit is contained in:
Daniel Hládek 2022-02-16 14:01:29 +01:00
parent a22fa87537
commit b5bab45113

30
websucker/dump.py Normal file
View File

@ -0,0 +1,30 @@
import cassandra
import cassandra.cluster
import cassandra.query
import json
import datetime
import sys
# Keyspace and top-level query are kept under their original module-level names.
keyspace = "websucker"
select_documents = "select json * from content"


def parse_args(argv):
    """Return ``(host, port)`` parsed from a ``sys.argv``-style list.

    Args:
        argv: Argument list whose items 1 and 2 are the Cassandra host and
            the Cassandra port.

    Returns:
        A ``(host, port)`` tuple with ``port`` converted to ``int``.

    Raises:
        SystemExit: If an argument is missing or the port is not an integer.
    """
    if len(argv) < 3:
        raise SystemExit(f"usage: {argv[0] if argv else 'dump.py'} CASSANDRA_HOST CASSANDRA_PORT")
    try:
        port = int(argv[2])
    except ValueError:
        raise SystemExit(f"invalid Cassandra port: {argv[2]!r}") from None
    return argv[1], port


def update_day(doc):
    """Return the first whitespace-separated token of ``doc["update_time"]``.

    The token is used as the ``day`` value when looking up the html row.
    NOTE(review): assumes ``update_time`` is rendered as "<date> <time>" by
    ``select json`` -- confirm against the table schema.

    Returns:
        The token, or ``None`` when ``update_time`` is missing, null or blank.
    """
    stamp = doc.get("update_time")
    if not stamp:
        return None
    parts = stamp.split()
    return parts[0] if parts else None


def first_json_row(rows):
    """Decode the ``[json]`` column of the first row in *rows*.

    Returns:
        The decoded JSON value of the first row, or ``{}`` if *rows* is empty.
    """
    for row in rows:
        return json.loads(row["[json]"])
    return {}


def main(argv=None):
    """Print every ``content`` row as one JSON document per line.

    Each document gets an ``html_data`` key holding the first ``html`` row
    matching the document's day and ``domain_name`` (``{}`` if none matches or
    the document has no usable ``update_time``).

    Args:
        argv: ``sys.argv``-style argument list; defaults to ``sys.argv``.
    """
    host, port = parse_args(sys.argv if argv is None else argv)
    # Rows are returned as dicts so the "[json]" column can be read by name.
    profile = cassandra.cluster.ExecutionProfile(
        request_timeout=240.0,
        row_factory=cassandra.query.dict_factory,
    )
    profiles = {cassandra.cluster.EXEC_PROFILE_DEFAULT: profile}
    cluster = cassandra.cluster.Cluster(
        [host],
        port=port,
        execution_profiles=profiles,
    )
    # Entering the cluster as a context manager shuts it down on exit, not
    # just the session.
    with cluster, cluster.connect(keyspace) as session:
        select_html = session.prepare(
            "select json * from html where day=? and domain_name=? LIMIT 1"
        )
        for row in session.execute(select_documents):
            doc = json.loads(row["[json]"])
            day = update_day(doc)
            html = {}
            if day is not None:
                html = first_json_row(
                    session.execute(select_html, (day, doc["domain_name"]))
                )
            doc["html_data"] = html
            print(json.dumps(doc))


if __name__ == "__main__":
    main()