zz
This commit is contained in:
parent
a22fa87537
commit
b5bab45113
30
websucker/dump.py
Normal file
30
websucker/dump.py
Normal file
@ -0,0 +1,30 @@
|
||||
import cassandra
|
||||
import cassandra.cluster
|
||||
import cassandra.query
|
||||
import json
|
||||
import datetime
|
||||
import sys
|
||||
|
||||
# Dump every row of the `content` table as one JSON document per line on
# stdout, attaching the matching row of the `html` table (if any) under the
# "html_data" key.
#
# Usage: dump.py <cassandra_host> <cassandra_port>
if len(sys.argv) < 3:
    sys.exit("usage: dump.py <cassandra_host> <cassandra_port>")

cassandra_host = sys.argv[1]
# Command-line arguments are strings; the driver's `port` is an integer.
cassandra_port = int(sys.argv[2])
keyspace = "websucker"

# The row factory is configured on the default execution profile, so every
# query returns rows as dicts (needed for the row["[json]"] lookups below).
ep = cassandra.cluster.ExecutionProfile(
    request_timeout=240.0,
    row_factory=cassandra.query.dict_factory,
)
profiles = {cassandra.cluster.EXEC_PROFILE_DEFAULT: ep}
cluster = cassandra.cluster.Cluster(
    [cassandra_host],
    port=cassandra_port,
    execution_profiles=profiles,
)
select_documents = "select json * from content"

try:
    with cluster.connect(keyspace) as session:
        select_html = session.prepare(
            "select json * from html where day=? and domain_name=? LIMIT 1"
        )
        for row in session.execute(select_documents):
            # `select json` returns each row as a JSON string in "[json]".
            doc = json.loads(row["[json]"])
            # Assumes update_time is "<date> <time>" separated by whitespace,
            # so the first token is the day used to look up the html row.
            day = doc["update_time"].split()[0]
            html_row = session.execute(
                select_html, (day, doc["domain_name"])
            ).one()
            # No matching html row: keep an empty object so the key is
            # always present in the output.
            html = json.loads(html_row["[json]"]) if html_row is not None else {}
            doc["html_data"] = html
            print(json.dumps(doc))
finally:
    # Closing the session does not release the cluster's connections and
    # background threads; shut the cluster down explicitly.
    cluster.shutdown()
|
Loading…
Reference in New Issue
Block a user