Compare commits
No commits in common. "d37bafb666f1e2984304e52afd9dc85cc6553896" and "b5bab45113c7c460fc86093b3aa77a6a55d2e3ae" have entirely different histories.
d37bafb666
...
b5bab45113
@ -4,7 +4,6 @@ import cassandra.query
|
|||||||
import json
|
import json
|
||||||
import datetime
|
import datetime
|
||||||
import sys
|
import sys
|
||||||
from urllib.parse import urlparse
|
|
||||||
|
|
||||||
cassandra_host = sys.argv[1]
|
cassandra_host = sys.argv[1]
|
||||||
cassandra_port = sys.argv[2]
|
cassandra_port = sys.argv[2]
|
||||||
@ -17,30 +16,9 @@ select_documents = "select json * from content"
|
|||||||
with cluster.connect(keyspace) as session:
|
with cluster.connect(keyspace) as session:
|
||||||
#session.row_factory = cassandra.query.dict_factory
|
#session.row_factory = cassandra.query.dict_factory
|
||||||
select_html = session.prepare("select json * from html where day=? and domain_name=? LIMIT 1")
|
select_html = session.prepare("select json * from html where day=? and domain_name=? LIMIT 1")
|
||||||
select_link = session.prepare("select link_status from links where domain_name=? and url_path=? and url_query=? LIMIT 1")
|
|
||||||
rows = session.execute(select_documents)
|
rows = session.execute(select_documents)
|
||||||
for row in rows:
|
for row in rows:
|
||||||
doc = json.loads(row["[json]"])
|
doc = json.loads(row["[json]"])
|
||||||
target_link = doc["target_link"]
|
|
||||||
parsed_link = urlparse(target_link)
|
|
||||||
netloc = parsed_link[1].strip().lower()
|
|
||||||
path = parsed_link[2].strip()
|
|
||||||
# strip leading /
|
|
||||||
if len(path) > 1 and path[0] == "/":
|
|
||||||
path = path[1:]
|
|
||||||
query = parsed_link[4]
|
|
||||||
lrows = session.execute(select_link,(netloc,path,query))
|
|
||||||
status = None
|
|
||||||
for l in lrows:
|
|
||||||
status = str(l["link_status"])
|
|
||||||
break
|
|
||||||
#assert status is not None
|
|
||||||
if status is None:
|
|
||||||
continue
|
|
||||||
#print(status)
|
|
||||||
# skip bad links
|
|
||||||
if not status == "good":
|
|
||||||
continue
|
|
||||||
dt = doc["update_time"]
|
dt = doc["update_time"]
|
||||||
d = dt.split()[0]
|
d = dt.split()[0]
|
||||||
hrows = session.execute(select_html,(d,doc["domain_name"]))
|
hrows = session.execute(select_html,(d,doc["domain_name"]))
|
||||||
@ -49,6 +27,4 @@ with cluster.connect(keyspace) as session:
|
|||||||
html = json.loads(h["[json]"])
|
html = json.loads(h["[json]"])
|
||||||
break
|
break
|
||||||
doc["html_data"] = html
|
doc["html_data"] = html
|
||||||
del doc["links"]
|
|
||||||
#print(parsed_link)
|
|
||||||
print(json.dumps(doc))
|
print(json.dumps(doc))
|
||||||
|
Loading…
Reference in New Issue
Block a user