zz
This commit is contained in:
		
							parent
							
								
									b75336c8a6
								
							
						
					
					
						commit
						d37bafb666
					
				@ -4,6 +4,7 @@ import cassandra.query
 | 
			
		||||
import json
 | 
			
		||||
import datetime
 | 
			
		||||
import sys
 | 
			
		||||
from urllib.parse import urlparse
 | 
			
		||||
 | 
			
		||||
cassandra_host = sys.argv[1]
 | 
			
		||||
cassandra_port = sys.argv[2]
 | 
			
		||||
@ -16,9 +17,30 @@ select_documents = "select json * from content"
 | 
			
		||||
with cluster.connect(keyspace) as session:
 | 
			
		||||
    #session.row_factory = cassandra.query.dict_factory
 | 
			
		||||
    select_html = session.prepare("select json * from html where day=? and domain_name=? LIMIT 1")
 | 
			
		||||
    select_link = session.prepare("select link_status from links where domain_name=? and url_path=? and url_query=? LIMIT 1")
 | 
			
		||||
    rows = session.execute(select_documents)
 | 
			
		||||
    for row in rows:
 | 
			
		||||
        doc = json.loads(row["[json]"])
 | 
			
		||||
        target_link = doc["target_link"]
 | 
			
		||||
        parsed_link = urlparse(target_link)
 | 
			
		||||
        netloc = parsed_link[1].strip().lower()
 | 
			
		||||
        path = parsed_link[2].strip()
 | 
			
		||||
        # strip leading /
 | 
			
		||||
        if len(path) > 1 and path[0] == "/":
 | 
			
		||||
            path = path[1:]
 | 
			
		||||
        query = parsed_link[4]
 | 
			
		||||
        lrows = session.execute(select_link,(netloc,path,query))
 | 
			
		||||
        status = None
 | 
			
		||||
        for l in lrows:
 | 
			
		||||
            status = str(l["link_status"])
 | 
			
		||||
            break
 | 
			
		||||
        #assert status is not None
 | 
			
		||||
        if status is None:
 | 
			
		||||
            continue
 | 
			
		||||
        #print(status)
 | 
			
		||||
        # skip bad links
 | 
			
		||||
        if not status == "good":
 | 
			
		||||
            continue
 | 
			
		||||
        dt = doc["update_time"]
 | 
			
		||||
        d = dt.split()[0]
 | 
			
		||||
        hrows = session.execute(select_html,(d,doc["domain_name"]))
 | 
			
		||||
@ -27,4 +49,6 @@ with cluster.connect(keyspace) as session:
 | 
			
		||||
            html = json.loads(h["[json]"])
 | 
			
		||||
            break
 | 
			
		||||
        doc["html_data"] = html
 | 
			
		||||
        del doc["links"]
 | 
			
		||||
        #print(parsed_link)
 | 
			
		||||
        print(json.dumps(doc))
 | 
			
		||||
 | 
			
		||||
		Loading…
	
		Reference in New Issue
	
	Block a user