zz
This commit is contained in:
		
							parent
							
								
									b75336c8a6
								
							
						
					
					
						commit
						d37bafb666
					
				@ -4,6 +4,7 @@ import cassandra.query
 | 
				
			|||||||
import json
 | 
					import json
 | 
				
			||||||
import datetime
 | 
					import datetime
 | 
				
			||||||
import sys
 | 
					import sys
 | 
				
			||||||
 | 
					from urllib.parse import urlparse
 | 
				
			||||||
 | 
					
 | 
				
			||||||
cassandra_host = sys.argv[1]
 | 
					cassandra_host = sys.argv[1]
 | 
				
			||||||
cassandra_port = sys.argv[2]
 | 
					cassandra_port = sys.argv[2]
 | 
				
			||||||
@ -16,9 +17,30 @@ select_documents = "select json * from content"
 | 
				
			|||||||
with cluster.connect(keyspace) as session:
 | 
					with cluster.connect(keyspace) as session:
 | 
				
			||||||
    #session.row_factory = cassandra.query.dict_factory
 | 
					    #session.row_factory = cassandra.query.dict_factory
 | 
				
			||||||
    select_html = session.prepare("select json * from html where day=? and domain_name=? LIMIT 1")
 | 
					    select_html = session.prepare("select json * from html where day=? and domain_name=? LIMIT 1")
 | 
				
			||||||
 | 
					    select_link = session.prepare("select link_status from links where domain_name=? and url_path=? and url_query=? LIMIT 1")
 | 
				
			||||||
    rows = session.execute(select_documents)
 | 
					    rows = session.execute(select_documents)
 | 
				
			||||||
    for row in rows:
 | 
					    for row in rows:
 | 
				
			||||||
        doc = json.loads(row["[json]"])
 | 
					        doc = json.loads(row["[json]"])
 | 
				
			||||||
 | 
					        target_link = doc["target_link"]
 | 
				
			||||||
 | 
					        parsed_link = urlparse(target_link)
 | 
				
			||||||
 | 
					        netloc = parsed_link[1].strip().lower()
 | 
				
			||||||
 | 
					        path = parsed_link[2].strip()
 | 
				
			||||||
 | 
					        # strip leading /
 | 
				
			||||||
 | 
					        if len(path) > 1 and path[0] == "/":
 | 
				
			||||||
 | 
					            path = path[1:]
 | 
				
			||||||
 | 
					        query = parsed_link[4]
 | 
				
			||||||
 | 
					        lrows = session.execute(select_link,(netloc,path,query))
 | 
				
			||||||
 | 
					        status = None
 | 
				
			||||||
 | 
					        for l in lrows:
 | 
				
			||||||
 | 
					            status = str(l["link_status"])
 | 
				
			||||||
 | 
					            break
 | 
				
			||||||
 | 
					        #assert status is not None
 | 
				
			||||||
 | 
					        if status is None:
 | 
				
			||||||
 | 
					            continue
 | 
				
			||||||
 | 
					        #print(status)
 | 
				
			||||||
 | 
					        # skip bad links
 | 
				
			||||||
 | 
					        if not status == "good":
 | 
				
			||||||
 | 
					            continue
 | 
				
			||||||
        dt = doc["update_time"]
 | 
					        dt = doc["update_time"]
 | 
				
			||||||
        d = dt.split()[0]
 | 
					        d = dt.split()[0]
 | 
				
			||||||
        hrows = session.execute(select_html,(d,doc["domain_name"]))
 | 
					        hrows = session.execute(select_html,(d,doc["domain_name"]))
 | 
				
			||||||
@ -27,4 +49,6 @@ with cluster.connect(keyspace) as session:
 | 
				
			|||||||
            html = json.loads(h["[json]"])
 | 
					            html = json.loads(h["[json]"])
 | 
				
			||||||
            break
 | 
					            break
 | 
				
			||||||
        doc["html_data"] = html
 | 
					        doc["html_data"] = html
 | 
				
			||||||
 | 
					        del doc["links"]
 | 
				
			||||||
 | 
					        #print(parsed_link)
 | 
				
			||||||
        print(json.dumps(doc))
 | 
					        print(json.dumps(doc))
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
		Reference in New Issue
	
	Block a user