from cassandra.cqlengine import columns
from cassandra.cqlengine.models import Model
from cassandra.cqlengine.management import sync_table


class Links(Model):
    """Row of the ``links`` table.

    Primary key columns, in order: ``domain_name``, ``url_path``,
    ``url_query``.  ``link_status`` carries a secondary index.
    """

    __table_name__ = "links"

    # Primary key (declaration order is significant).
    domain_name = columns.Text(primary_key=True)
    url_path = columns.Text(primary_key=True)
    url_query = columns.Text(primary_key=True)

    # Regular columns.
    url_schema = columns.Text()
    redirect_target = columns.Text()
    link_status = columns.Text(index=True)
    link_originality = columns.Float()
    body_size = columns.Integer()
    update_time = columns.DateTime()


class DailyLinks(Model):
    """Row of the ``daily_links`` table.

    Primary key columns, in order: ``day``, ``domain_name``,
    ``link_status``, ``url_path``, ``url_query``.
    """

    __table_name__ = "daily_links"

    # Primary key (declaration order is significant).
    day = columns.Date(primary_key=True)
    domain_name = columns.Text(primary_key=True)
    link_status = columns.Text(primary_key=True)
    url_path = columns.Text(primary_key=True)
    url_query = columns.Text(primary_key=True)

    # Regular columns.
    link_originality = columns.Float()
    body_size = columns.Integer()
    update_time = columns.DateTime()


class DomainQuality(Model):
    """Row of the ``domain_quality`` table.

    Primary key columns, in order: ``domain_name``, ``day``.
    ``update_time`` is declared as a static column.
    """

    __table_name__ = "domain_quality"

    # Primary key (declaration order is significant).
    domain_name = columns.Text(primary_key=True)
    day = columns.Date(primary_key=True)

    # Regular columns.
    seen_count = columns.Integer()
    good_size = columns.Integer()
    good_count = columns.Integer()
    good_probability = columns.Float()
    good_originality = columns.Float()
    average_good_characters = columns.Float()
    content_size = columns.Integer()
    content_count = columns.Integer()
    content_probability = columns.Float()
    content_originality = columns.Float()
    average_content_characters = columns.Float()
    fetched_count = columns.Integer()
    average_fetched_good_characters = columns.Float()
    gain_ratio = columns.Float()

    # Leftover from the CQL schema this model mirrors: the column was
    # ``TIMESTAMP STATIC`` and the table was declared
    # ``WITH CLUSTERING ORDER BY (day DESC)``.
    # NOTE(review): no clustering order is set on ``day`` in this model;
    # confirm whether the DESC ordering still needs to be expressed here.
    update_time = columns.DateTime(static=True)


class Content(Model):
    """Row of the ``content`` table.

    Primary key columns, in order: ``domain_name``, ``target_link``
    (CQL schema note: ``PRIMARY KEY(domain_name,target_link)``).
    """

    __table_name__ = "content"

    # Primary key (declaration order is significant).
    domain_name = columns.Text(primary_key=True)
    target_link = columns.Text(primary_key=True)

    # Regular columns.
    agent_version = columns.Text()
    title = columns.Text()
    links = columns.Set(value_type=columns.Text)
    authors = columns.Set(value_type=columns.Text)
    tags = columns.Set(value_type=columns.Text)
    description = columns.Text()
    section = columns.Text()
    article_published_time = columns.Text()
    text_date = columns.Text()
    body = columns.Text()
    # NOTE(review): ``body_size`` is Text here but Integer in ``Links`` and
    # ``DailyLinks``; verify against the deployed table before changing.
    body_size = columns.Text()
    update_time = columns.DateTime()


class ParagraphChecksums(Model):
    """Row of the ``paragraph_checksums`` table.

    Primary key columns, in order: ``checksum``, ``url_hash``.
    """

    __table_name__ = "paragraph_checksums"

    checksum = columns.BigInt(primary_key=True)
    url_hash = columns.BigInt(primary_key=True)


class Html(Model):
    """Row of the ``html`` table.

    Primary key columns, in order: ``day``, ``domain_name``,
    ``source_link``.
    """

    __table_name__ = "html"

    # Primary key (declaration order is significant).
    day = columns.Date(primary_key=True)
    domain_name = columns.Text(primary_key=True)
    source_link = columns.Text(primary_key=True)

    # Regular columns.
    target_link = columns.Text()
    redirect_links = columns.List(value_type=columns.Text)
    status = columns.Integer()
    content = columns.Text()
    headers = columns.Text()
    agent_version = columns.Text()
    update_time = columns.DateTime()


class DomainConnections(Model):
    """Row of the ``domain_connections`` table.

    Primary key columns, in order: ``domain_name``, ``linked_domain``.
    """

    __table_name__ = "domain_connections"

    domain_name = columns.Text(primary_key=True)
    linked_domain = columns.Text(primary_key=True)


def create_database(keyspace, session):
    """Call ``sync_table`` once for each model defined above.

    Args:
        keyspace: Accepted for the caller's convenience; not used here.
        session: Accepted for the caller's convenience; not used here.

    NOTE(review): presumably the cqlengine connection and default keyspace
    are configured by the caller before this runs -- confirm.
    """
    # Order matches the order in which the models are declared.
    models_to_sync = (
        Links,
        DailyLinks,
        DomainQuality,
        Content,
        ParagraphChecksums,
        Html,
        DomainConnections,
    )
    for model in models_to_sync:
        sync_table(model)