websucker-pip/websucker/schema.py

100 lines
3.4 KiB
Python
Raw Normal View History

2023-02-23 15:08:14 +00:00
from cassandra.cqlengine import columns
from cassandra.cqlengine.models import Model
2023-02-28 07:56:35 +00:00
from cassandra.cqlengine.management import sync_table
2023-02-23 15:08:14 +00:00
class Links(Model):
    """cqlengine model mapped to the ``links`` table.

    The primary key is ``(domain_name, url_path, url_query)``:
    ``domain_name`` is the partition key and the two URL parts are
    clustering columns.  ``link_status`` is declared with a secondary
    index (``index=True``).
    """

    __table_name__ = "links"

    # --- primary key (declaration order defines the key order) ---
    domain_name = columns.Text(partition_key=True)
    url_path = columns.Text(primary_key=True)
    url_query = columns.Text(primary_key=True)

    # --- regular columns ---
    url_schema = columns.Text()
    redirect_target = columns.Text()
    link_status = columns.Text(index=True)  # secondary index
    link_originality = columns.Float()
    body_size = columns.Integer()
    update_time = columns.DateTime()
2023-02-23 15:08:14 +00:00
class DailyLinks(Model):
    """cqlengine model mapped to the ``daily_links`` table.

    Rows are partitioned by ``day``; within a partition they are
    clustered by ``domain_name``, ``link_status``, ``url_path`` and
    ``url_query``, in that order.
    """

    __table_name__ = "daily_links"

    # --- primary key (declaration order defines the key order) ---
    day = columns.Date(partition_key=True)
    domain_name = columns.Text(primary_key=True)
    link_status = columns.Text(primary_key=True)
    url_path = columns.Text(primary_key=True)
    url_query = columns.Text(primary_key=True)

    # --- regular columns ---
    link_originality = columns.Float()
    body_size = columns.Integer()
    update_time = columns.DateTime()
2023-02-23 15:08:14 +00:00
2023-02-28 12:57:13 +00:00
class DomainQuality(Model):
    """cqlengine model mapped to the ``domain_quality`` table.

    The primary key is ``(domain_name, day)``: ``domain_name`` is the
    partition key and ``day`` is the clustering column.  The table is meant
    to be declared ``WITH CLUSTERING ORDER BY (day DESC)``, which is
    expressed here through ``clustering_order="DESC"`` on ``day``.

    ``update_time`` is a static column, i.e. a single value shared by all
    rows of one ``domain_name`` partition.

    NOTE(review): the clustering order is only emitted when the table is
    created; Cassandra cannot change the clustering order of a table that
    already exists, so existing deployments keep their current order.
    """

    __table_name__ = "domain_quality"

    # --- primary key ---
    domain_name = columns.Text(partition_key=True)
    # Descending order, as required by the intended table definition.
    day = columns.Date(primary_key=True, clustering_order="DESC")

    # --- per-day statistics ---
    seen_count = columns.Integer()
    good_size = columns.Integer()
    good_count = columns.Integer()
    good_probability = columns.Float()
    good_originality = columns.Float()
    average_good_characters = columns.Float()
    content_size = columns.Integer()
    content_count = columns.Integer()
    content_probability = columns.Float()
    content_originality = columns.Float()
    average_content_characters = columns.Float()
    fetched_count = columns.Integer()
    average_fetched_good_characters = columns.Float()
    gain_ratio = columns.Float()

    # TIMESTAMP STATIC: one value per domain_name partition.
    update_time = columns.DateTime(static=True)
2023-02-23 15:08:14 +00:00
2023-02-28 07:56:35 +00:00
class Content(Model):
    """cqlengine model mapped to the ``content`` table.

    The primary key is ``(domain_name, target_link)``: ``domain_name`` is
    the partition key and ``target_link`` is the clustering column.
    ``links``, ``authors`` and ``tags`` are sets of text values.
    """

    __table_name__ = "content"

    # --- primary key ---
    domain_name = columns.Text(partition_key=True)
    target_link = columns.Text(primary_key=True)

    # --- regular columns ---
    agent_version = columns.Text()
    title = columns.Text()
    links = columns.Set(value_type=columns.Text)
    authors = columns.Set(value_type=columns.Text)
    tags = columns.Set(value_type=columns.Text)
    description = columns.Text()
    section = columns.Text()
    # Both date-like fields below are stored as plain text, not as dates.
    article_published_time = columns.Text()
    text_date = columns.Text()
    body = columns.Text()
    body_size = columns.Integer()
    update_time = columns.DateTime()
2023-02-23 15:08:14 +00:00
2023-02-28 07:56:35 +00:00
class ParagraphChecksums(Model):
    """cqlengine model mapped to the ``paragraph_checksums`` table.

    The table consists of its primary key only: ``checksum`` is the
    partition key and ``url_hash`` is the clustering column, both stored
    as 64-bit integers.
    """

    __table_name__ = "paragraph_checksums"

    checksum = columns.BigInt(partition_key=True)
    url_hash = columns.BigInt(primary_key=True)
2023-02-23 15:08:14 +00:00
2023-02-28 07:56:35 +00:00
class Html(Model):
    """cqlengine model mapped to the ``html`` table.

    Rows are partitioned by ``day`` and clustered by ``domain_name`` and
    ``source_link``.  ``redirect_links`` is an ordered list of text
    values; ``content`` and ``headers`` are stored as text.
    """

    __table_name__ = "html"

    # --- primary key (declaration order defines the key order) ---
    day = columns.Date(partition_key=True)
    domain_name = columns.Text(primary_key=True)
    source_link = columns.Text(primary_key=True)

    # --- regular columns ---
    target_link = columns.Text()
    redirect_links = columns.List(value_type=columns.Text)
    status = columns.Integer()
    content = columns.Text()
    headers = columns.Text()
    agent_version = columns.Text()
    update_time = columns.DateTime()
2023-02-23 15:08:14 +00:00
2023-02-28 07:56:35 +00:00
class DomainConnections(Model):
    """cqlengine model mapped to the ``domain_connections`` table.

    The table consists of its primary key only: ``domain_name`` is the
    partition key and ``linked_domain`` is the clustering column.
    """

    __table_name__ = "domain_connections"

    domain_name = columns.Text(partition_key=True)
    linked_domain = columns.Text(primary_key=True)
2023-02-23 15:08:14 +00:00
2023-02-28 11:55:44 +00:00
def create_database(keyspace, session):
    """Synchronise every model of this module with the database.

    ``sync_table`` is invoked once per model, in declaration order, using
    whatever cqlengine connection and keyspace are currently configured.

    Args:
        keyspace: Accepted for the caller's convenience; not used by this
            function.
        session: Accepted for the caller's convenience; not used by this
            function.
    """
    all_models = (
        Links,
        DailyLinks,
        DomainQuality,
        Content,
        ParagraphChecksums,
        Html,
        DomainConnections,
    )
    for model in all_models:
        sync_table(model)