102 lines
3.7 KiB
Python
102 lines
3.7 KiB
Python
from cassandra.cqlengine import columns
|
|
from cassandra.cqlengine.models import Model
|
|
from cassandra.cqlengine.management import sync_table
|
|
|
|
class Links(Model):
    """cqlengine model mapped to the ``links`` table.

    The primary key is ``(domain_name, url_path, url_query)``. With
    cqlengine, the first ``primary_key`` column is the partition key and
    the following ones are clustering keys, in declaration order.
    """

    __table_name__ = "links"

    # --- primary key: partition key first, then clustering keys ---
    domain_name = columns.Text(primary_key=True)
    url_path = columns.Text(primary_key=True)
    url_query = columns.Text(primary_key=True)

    # --- regular (non-key) columns ---
    url_schema = columns.Text()
    redirect_target = columns.Text()
    link_status = columns.Text()
    link_originality = columns.Float()
    body_size = columns.Integer()
    update_time = columns.DateTime()
|
|
|
|
|
|
class DailyLinks(Model):
    """cqlengine model mapped to the ``daily_links`` table.

    Declares the same columns as ``Links`` with an additional leading
    integer ``day`` column, which becomes the partition key. The primary
    key is ``(day, domain_name, url_path, url_query)``.
    """

    __table_name__ = "daily_links"

    # --- primary key: partition key first, then clustering keys ---
    day = columns.Integer(primary_key=True)
    domain_name = columns.Text(primary_key=True)
    url_path = columns.Text(primary_key=True)
    url_query = columns.Text(primary_key=True)

    # --- regular (non-key) columns ---
    url_schema = columns.Text()
    redirect_target = columns.Text()
    link_status = columns.Text()
    link_originality = columns.Float()
    body_size = columns.Integer()
    update_time = columns.DateTime()
|
|
|
|
|
|
class DomainQuality(Model):
    """cqlengine model mapped to the ``domain_quality`` table.

    The primary key is ``(domain_name, day)``: ``domain_name`` is the
    partition key and ``day`` is the clustering key, ordered descending
    within each partition.

    The class must derive from ``Model``; cqlengine's ``sync_table`` only
    accepts ``Model`` subclasses.
    """

    __table_name__ = "domain_quality"

    # --- primary key ---
    domain_name = columns.Text(primary_key=True)
    # Schema intent: WITH CLUSTERING ORDER BY (day DESC)
    day = columns.Date(primary_key=True, clustering_order="DESC")

    # --- per-row counters and statistics ---
    seen_count = columns.Integer()
    good_size = columns.Integer()
    good_count = columns.Integer()
    good_probability = columns.Float()
    good_originality = columns.Float()
    average_good_characters = columns.Float()
    content_size = columns.Integer()
    content_count = columns.Integer()
    content_probability = columns.Float()
    content_originality = columns.Float()
    average_content_characters = columns.Float()
    fetched_count = columns.Integer()
    average_fetched_good_characters = columns.Float()
    gain_ratio = columns.Float()

    # Static column: one value shared by every row of a partition.
    # NOTE(review): the original schema comment described this column as
    # "TIMESTAMP STATIC", but it is declared as TimeUUID here. Confirm
    # which type the table really uses before changing it.
    update_time = columns.TimeUUID(static=True)
|
|
|
|
class Content(Model):
    """cqlengine model mapped to the ``content`` table.

    The primary key is ``(domain_name, target_link)``: ``domain_name`` is
    the partition key and ``target_link`` is the clustering key.
    """

    __table_name__ = "content"

    # --- primary key ---
    domain_name = columns.Text(primary_key=True)
    target_link = columns.Text(primary_key=True)

    # --- regular (non-key) columns ---
    agent_version = columns.Text()
    title = columns.Text()
    # Sets of text values.
    links = columns.Set(columns.Text)
    authors = columns.Set(columns.Text)
    tags = columns.Set(columns.Text)
    description = columns.Text()
    section = columns.Text()
    article_published_time = columns.Text()
    text_date = columns.Text()
    body = columns.Text()
    # NOTE(review): declared as text here, while Links.body_size and
    # DailyLinks.body_size are integers. Confirm this is intended.
    body_size = columns.Text()
    update_time = columns.DateTime()
|
|
|
|
class ParagraphChecksums(Model):
    """cqlengine model mapped to the ``paragraph_checksums`` table.

    Both columns belong to the primary key: ``checksum`` is the partition
    key and ``url_hash`` is the clustering key. There are no other columns.
    """

    __table_name__ = "paragraph_checksums"

    # --- primary key (64-bit integers) ---
    checksum = columns.BigInt(primary_key=True)
    url_hash = columns.BigInt(primary_key=True)
|
|
|
|
class Html(Model):
    """cqlengine model mapped to the ``html`` table.

    The primary key is ``(day, domain_name, source_link)``: ``day`` is the
    partition key, ``domain_name`` and ``source_link`` are clustering keys.
    """

    __table_name__ = "html"

    # --- primary key: partition key first, then clustering keys ---
    day = columns.Date(primary_key=True)
    domain_name = columns.Text(primary_key=True)
    source_link = columns.Text(primary_key=True)

    # --- regular (non-key) columns ---
    target_link = columns.Text()
    # Ordered list of text values.
    redirect_links = columns.List(columns.Text)
    status = columns.Integer()
    content = columns.Text()
    headers = columns.Text()
    agent_version = columns.Text()
    # NOTE(review): declared as text here, while Links, DailyLinks and
    # Content declare update_time as DateTime. Confirm this is intended.
    update_time = columns.Text()
|
|
|
|
class DomainConnections(Model):
    """cqlengine model mapped to the ``domain_connections`` table.

    Both columns belong to the primary key: ``domain_name`` is the
    partition key and ``linked_domain`` is the clustering key.
    """

    __table_name__ = "domain_connections"

    # --- primary key ---
    domain_name = columns.Text(primary_key=True)
    linked_domain = columns.Text(primary_key=True)
|
|
|
|
def create_database(keyspace, session):
    """Run ``sync_table`` for every model declared in this module.

    Models are synced in declaration order. An exception raised by
    ``sync_table`` for one model propagates and the remaining models are
    not synced.

    Args:
        keyspace: Keyspace passed to ``sync_table`` as the only entry of
            its ``keyspaces`` list.
        session: Value passed to ``sync_table`` as the only entry of its
            ``connections`` list.
            NOTE(review): cqlengine documents ``connections`` as a list of
            registered connection names. Confirm callers pass such a name
            here rather than a driver ``Session`` object.
    """
    all_models = (
        Links,
        DailyLinks,
        DomainQuality,
        Content,
        ParagraphChecksums,
        Html,
        DomainConnections,
    )
    for model in all_models:
        sync_table(model, keyspaces=[keyspace], connections=[session])
|