2023-02-23 15:08:14 +00:00
|
|
|
from cassandra.cqlengine import columns
|
|
|
|
from cassandra.cqlengine.models import Model
|
2023-02-28 07:56:35 +00:00
|
|
|
from cassandra.cqlengine.management import sync_table
|
2023-02-23 15:08:14 +00:00
|
|
|
|
|
|
|
class Links(Model):
    """cqlengine model mapped to the ``links`` table.

    Primary key: ``(domain_name, url_path, url_query)``.  Following the
    cqlengine convention, the first primary-key column (``domain_name``) is
    the partition key and the remaining ones are clustering columns, in
    declaration order.
    """

    __table_name__ = "links"

    # -- primary key: declaration order defines the key order, keep it --
    domain_name = columns.Text(primary_key=True)
    url_path = columns.Text(primary_key=True)
    url_query = columns.Text(primary_key=True)

    # -- regular columns --
    url_schema = columns.Text()
    redirect_target = columns.Text()
    # index=True asks cqlengine to create a secondary index on this column.
    link_status = columns.Text(index=True)
    link_originality = columns.Float()
    body_size = columns.Integer()
    update_time = columns.DateTime()
|
2023-02-23 15:08:14 +00:00
|
|
|
|
|
|
|
|
|
|
|
class DailyLinks(Model):
    """cqlengine model mapped to the ``daily_links`` table.

    Primary key:
    ``(day, domain_name, link_status, url_path, url_query)``.  ``day`` is the
    first primary-key column and therefore the partition key under the
    cqlengine convention; the others are clustering columns in declaration
    order.
    """

    __table_name__ = "daily_links"

    # -- primary key: declaration order defines the key order, keep it --
    day = columns.Date(primary_key=True)
    domain_name = columns.Text(primary_key=True)
    link_status = columns.Text(primary_key=True)
    url_path = columns.Text(primary_key=True)
    url_query = columns.Text(primary_key=True)

    # -- regular columns --
    link_originality = columns.Float()
    body_size = columns.Integer()
    update_time = columns.DateTime()
|
2023-02-23 15:08:14 +00:00
|
|
|
|
|
|
|
|
2023-02-28 12:57:13 +00:00
|
|
|
class DomainQuality(Model):
    """cqlengine model mapped to the ``domain_quality`` table.

    Primary key: ``(domain_name, day)`` -- ``domain_name`` is the partition
    key and ``day`` the clustering column (cqlengine convention: first
    primary-key column is the partition key).

    NOTE(review): the CQL this model was transcribed from ended with
    ``WITH CLUSTERING ORDER BY (day DESC)``.  No ``clustering_order`` is
    declared on ``day`` here, so the default order is used -- confirm
    whether descending order is still wanted.
    """

    __table_name__ = "domain_quality"

    # -- primary key: declaration order defines the key order, keep it --
    domain_name = columns.Text(primary_key=True)
    day = columns.Date(primary_key=True)

    # -- counters and statistics for "good" documents --
    seen_count = columns.Integer()
    good_size = columns.Integer()
    good_count = columns.Integer()
    good_probability = columns.Float()
    good_originality = columns.Float()
    average_good_characters = columns.Float()

    # -- counters and statistics for "content" documents --
    content_size = columns.Integer()
    content_count = columns.Integer()
    content_probability = columns.Float()
    content_originality = columns.Float()
    average_content_characters = columns.Float()

    # -- fetch statistics --
    fetched_count = columns.Integer()
    average_fetched_good_characters = columns.Float()
    gain_ratio = columns.Float()

    # Declared static (CQL: TIMESTAMP STATIC).
    update_time = columns.DateTime(static=True)
|
2023-02-23 15:08:14 +00:00
|
|
|
|
2023-02-28 07:56:35 +00:00
|
|
|
class Content(Model):
    """cqlengine model mapped to the ``content`` table.

    Primary key: ``(domain_name, target_link)`` -- ``domain_name`` is the
    partition key and ``target_link`` the clustering column (cqlengine
    convention: first primary-key column is the partition key).
    """

    __table_name__ = "content"

    # -- primary key: PRIMARY KEY(domain_name, target_link) --
    domain_name = columns.Text(primary_key=True)
    target_link = columns.Text(primary_key=True)

    # -- regular columns --
    agent_version = columns.Text()
    title = columns.Text()

    # Set-valued text columns.
    links = columns.Set(value_type=columns.Text)
    authors = columns.Set(value_type=columns.Text)
    tags = columns.Set(value_type=columns.Text)

    description = columns.Text()
    section = columns.Text()
    # Both date-like fields are stored as plain text, not as date types.
    article_published_time = columns.Text()
    text_date = columns.Text()
    body = columns.Text()
    body_size = columns.Integer()
    update_time = columns.DateTime()
|
2023-02-23 15:08:14 +00:00
|
|
|
|
2023-02-28 07:56:35 +00:00
|
|
|
class ParagraphChecksums(Model):
    """cqlengine model mapped to the ``paragraph_checksums`` table.

    The table consists only of its primary key ``(checksum, url_hash)``:
    ``checksum`` is the partition key and ``url_hash`` the clustering column
    (cqlengine convention: first primary-key column is the partition key).
    """

    __table_name__ = "paragraph_checksums"

    # -- primary key: declaration order defines the key order, keep it --
    checksum = columns.BigInt(primary_key=True)
    url_hash = columns.BigInt(primary_key=True)
|
2023-02-23 15:08:14 +00:00
|
|
|
|
2023-02-28 07:56:35 +00:00
|
|
|
class Html(Model):
    """cqlengine model mapped to the ``html`` table.

    Primary key: ``(day, domain_name, source_link)`` -- ``day`` is the
    partition key and the other two are clustering columns, in declaration
    order (cqlengine convention: first primary-key column is the partition
    key).
    """

    __table_name__ = "html"

    # -- primary key: declaration order defines the key order, keep it --
    day = columns.Date(primary_key=True)
    domain_name = columns.Text(primary_key=True)
    source_link = columns.Text(primary_key=True)

    # -- regular columns --
    target_link = columns.Text()
    # List-valued text column.
    redirect_links = columns.List(value_type=columns.Text)
    status = columns.Integer()
    content = columns.Text()
    headers = columns.Text()  # stored as a single text value
    agent_version = columns.Text()
    update_time = columns.DateTime()
|
2023-02-23 15:08:14 +00:00
|
|
|
|
2023-02-28 07:56:35 +00:00
|
|
|
class DomainConnections(Model):
    """cqlengine model mapped to the ``domain_connections`` table.

    The table consists only of its primary key
    ``(domain_name, linked_domain)``: ``domain_name`` is the partition key
    and ``linked_domain`` the clustering column (cqlengine convention: first
    primary-key column is the partition key).
    """

    __table_name__ = "domain_connections"

    # -- primary key: declaration order defines the key order, keep it --
    domain_name = columns.Text(primary_key=True)
    linked_domain = columns.Text(primary_key=True)
|
2023-02-23 15:08:14 +00:00
|
|
|
|
2023-02-28 11:55:44 +00:00
|
|
|
def create_database(keyspace, session):
    """Run ``sync_table`` for every model declared in this module.

    Tables are synchronised in a fixed order: ``Links``, ``DailyLinks``,
    ``DomainQuality``, ``Content``, ``ParagraphChecksums``, ``Html``,
    ``DomainConnections``.

    Args:
        keyspace: Accepted but not used by this function.
        session: Accepted but not used by this function.

    NOTE(review): since neither argument is forwarded, ``sync_table`` relies
    on whatever cqlengine connection/keyspace is already configured --
    presumably set up by the caller; confirm.
    """
    ordered_models = (
        Links,
        DailyLinks,
        DomainQuality,
        Content,
        ParagraphChecksums,
        Html,
        DomainConnections,
    )
    for model_cls in ordered_models:
        sync_table(model_cls)
|