"""Cassandra schema definitions: cqlengine models plus raw CQL DDL.

``Links`` and ``DailyLinks`` are declared as cqlengine models.  The other
tables are only available as CQL ``CREATE TABLE`` statements, kept in
``RAW_TABLES_CQL`` so that this module remains importable Python.
"""

from cassandra.cqlengine import columns
from cassandra.cqlengine.models import Model


class Links(Model):
    """One row per link, keyed by ``(domain_name, url_path, url_query)``."""

    # Primary key columns.  cqlengine uses the first ``primary_key=True``
    # column as the partition key and the following ones as clustering
    # columns, in declaration order.
    domain_name = columns.Text(primary_key=True)
    url_path = columns.Text(primary_key=True)
    url_query = columns.Text(primary_key=True)

    # Regular (non-key) columns.
    url_schema = columns.Text()
    redirect_target = columns.Text()
    link_status = columns.Text()
    link_originality = columns.Float()
    body_size = columns.Integer()
    update_time = columns.DateTime()


class DailyLinks(Model):
    """Same columns as ``Links`` with a leading ``day`` primary-key column."""

    # Primary key columns; ``day`` is declared first, so it is the
    # partition key and the URL parts are clustering columns.
    # NOTE(review): ``day`` is an Integer here while the raw CQL tables
    # below use DATE for their ``day`` columns -- confirm this is intended.
    day = columns.Integer(primary_key=True)
    domain_name = columns.Text(primary_key=True)
    url_path = columns.Text(primary_key=True)
    url_query = columns.Text(primary_key=True)

    # Regular (non-key) columns.
    url_schema = columns.Text()
    redirect_target = columns.Text()
    link_status = columns.Text()
    link_originality = columns.Float()
    body_size = columns.Integer()
    update_time = columns.DateTime()


# CQL DDL for the tables that have no cqlengine model in this module.
# It is kept as a string because bare CQL at module level is a Python
# syntax error.
# NOTE(review): the collection columns (content.links, content.authors,
# content.tags, html.redirect_links) had no element type in the original
# text, which CQL does not accept; TEXT is assumed here -- confirm against
# the live schema.
RAW_TABLES_CQL = """
CREATE TABLE domain_quality (
    domain_name TEXT,
    day DATE,
    seen_count INT,
    good_size INT,
    good_count INT,
    good_probability FLOAT,
    good_originality FLOAT,
    average_good_characters FLOAT,
    content_size INT,
    content_count INT,
    content_probability FLOAT,
    content_originality FLOAT,
    average_content_characters FLOAT,
    fetched_count INT,
    average_fetched_good_characters FLOAT,
    gain_ratio FLOAT,
    update_time TIMESTAMP STATIC,
    PRIMARY KEY (domain_name, day)
) WITH CLUSTERING ORDER BY (day DESC);

CREATE TABLE content (
    domain_name TEXT,
    target_link TEXT,
    agent_version TEXT,
    title TEXT,
    links SET<TEXT>,
    authors SET<TEXT>,
    tags SET<TEXT>,
    description TEXT,
    section TEXT,
    article_published_time TEXT,
    text_date TEXT,
    body TEXT,
    body_size INT,
    update_time TIMESTAMP,
    PRIMARY KEY (domain_name, target_link)
);

CREATE TABLE paragraph_checksums (
    checksum BIGINT,
    url_hash BIGINT,
    PRIMARY KEY (checksum, url_hash)
);

CREATE TABLE html (
    day DATE,
    domain_name TEXT,
    source_link TEXT,
    target_link TEXT,
    redirect_links LIST<TEXT>,
    status INT,
    content TEXT,
    headers TEXT,
    agent_version TEXT,
    update_time TIMESTAMP,
    PRIMARY KEY (day, domain_name, source_link)
);

CREATE TABLE domain_connections (
    domain_name TEXT,
    linked_domain TEXT,
    PRIMARY KEY (domain_name, linked_domain)
);
"""