-- Schema bootstrap for the `websucker` keyspace (Apache Cassandra CQL).
-- WARNING: destructive. The keyspace is dropped and recreated from scratch,
-- so every table below starts empty after this script runs.

-- IF EXISTS lets the script run on a fresh cluster where the keyspace
-- has not been created yet.
DROP KEYSPACE IF EXISTS websucker;

-- SimpleStrategy with replication_factor 1 keeps a single copy of each row.
CREATE KEYSPACE websucker
    WITH replication = {'class':'SimpleStrategy', 'replication_factor' : 1};

USE websucker;

-- One row per (domain_name, url_path, url_query).
-- Partition key: domain_name; clustering columns: url_path, url_query.
CREATE TABLE links (
    domain_name TEXT,
    url_path TEXT,
    url_query TEXT,
    url_schema TEXT,
    redirect_target TEXT,
    link_status TEXT,
    link_originality FLOAT,
    body_size INT,
    update_time TIMESTAMP,
    PRIMARY KEY(domain_name,url_path,url_query)
);

-- Secondary index so that links can be filtered by link_status.
CREATE INDEX link_status_index ON links(link_status);

-- Link rows keyed by day first.
-- Partition key: day; clustering columns: domain_name, link_status,
-- url_path, url_query.
CREATE TABLE daily_links (
    day DATE,
    domain_name TEXT,
    link_status TEXT,
    url_path TEXT,
    url_query TEXT,
    body_size INT,
    link_originality FLOAT,
    update_time TIMESTAMP,
    PRIMARY KEY(day,domain_name,link_status,url_path,url_query)
);

-- Per-domain, per-day counters and ratios.
-- Partition key: domain_name; clustering column: day, stored newest first.
-- update_time is STATIC: a single value shared by all rows of one
-- domain_name partition.
CREATE TABLE domain_quality (
    domain_name TEXT,
    day DATE,
    seen_count INT,
    good_size INT,
    good_count INT,
    good_probability FLOAT,
    good_originality FLOAT,
    average_good_characters FLOAT,
    content_size INT,
    content_count INT,
    content_probability FLOAT,
    content_originality FLOAT,
    average_content_characters FLOAT,
    fetched_count INT,
    average_fetched_good_characters FLOAT,
    gain_ratio FLOAT,
    update_time TIMESTAMP STATIC,
    PRIMARY KEY(domain_name,day)
) WITH CLUSTERING ORDER BY (day DESC);

-- One row per (domain_name, target_link).
-- Collection columns need an explicit element type in CQL; TEXT is used
-- here to match the other string columns.
-- NOTE(review): element type was missing in the original script — confirm
-- TEXT matches what the writers insert.
-- article_published_time and text_date are kept as TEXT, as in the
-- original definition.
CREATE TABLE content (
    domain_name TEXT,
    target_link TEXT,
    agent_version TEXT,
    title TEXT,
    links SET<TEXT>,
    authors SET<TEXT>,
    tags SET<TEXT>,
    description TEXT,
    section TEXT,
    article_published_time TEXT,
    text_date TEXT,
    body TEXT,
    body_size INT,
    update_time TIMESTAMP,
    PRIMARY KEY(domain_name,target_link)
);

-- Pairs of (checksum, url_hash); both columns form the primary key,
-- with checksum as the partition key.
CREATE TABLE paragraph_checksums (
    checksum BIGINT,
    url_hash BIGINT,
    PRIMARY KEY(checksum,url_hash)
);

-- One row per (day, domain_name, source_link).
-- Partition key: day; clustering columns: domain_name, source_link.
-- NOTE(review): redirect_links element type was missing in the original
-- script — confirm TEXT matches what the writers insert.
CREATE TABLE html (
    day DATE,
    domain_name TEXT,
    source_link TEXT,
    target_link TEXT,
    redirect_links LIST<TEXT>,
    status INT,
    content TEXT,
    headers TEXT,
    agent_version TEXT,
    update_time TIMESTAMP,
    PRIMARY KEY(day,domain_name,source_link)
);

-- Pairs of (domain_name, linked_domain); partition key: domain_name.
CREATE TABLE domain_connections (
    domain_name TEXT,
    linked_domain TEXT,
    PRIMARY KEY (domain_name,linked_domain)
);