From 1e7ac17d903a136674146a0f9a2ea94cca93d2d4 Mon Sep 17 00:00:00 2001
From: Daniel Hladek
Date: Thu, 23 Feb 2023 16:08:14 +0100
Subject: [PATCH] wip schema

---
Review note: the payload below was revised in review, so the blob id on
the "index" line predates this revision.

 websucker/schema.py | 121 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 121 insertions(+)
 create mode 100644 websucker/schema.py

diff --git a/websucker/schema.py b/websucker/schema.py
new file mode 100644
index 0000000..ac14e5a
--- /dev/null
+++ b/websucker/schema.py
@@ -0,0 +1,121 @@
+"""Cassandra cqlengine models for the websucker schema.
+
+The first ``primary_key=True`` column of a model is its partition key; any
+further ``primary_key=True`` columns are clustering columns, in definition
+order.
+"""
+
+from cassandra.cqlengine import columns
+from cassandra.cqlengine.models import Model
+
+
+class Links(Model):
+    """Link row keyed by (domain_name, url_path, url_query)."""
+
+    domain_name = columns.Text(primary_key=True)
+    url_path = columns.Text(primary_key=True)
+    url_query = columns.Text(primary_key=True)
+    url_schema = columns.Text()
+    redirect_target = columns.Text()
+    link_status = columns.Text()
+    link_originality = columns.Float()
+    body_size = columns.Integer()
+    update_time = columns.DateTime()
+
+
+class DailyLinks(Model):
+    """Link row keyed by (day, domain_name, url_path, url_query)."""
+
+    day = columns.Integer(primary_key=True)
+    domain_name = columns.Text(primary_key=True)
+    url_path = columns.Text(primary_key=True)
+    url_query = columns.Text(primary_key=True)
+    url_schema = columns.Text()
+    redirect_target = columns.Text()
+    link_status = columns.Text()
+    link_originality = columns.Float()
+    body_size = columns.Integer()
+    update_time = columns.DateTime()
+
+
+class DomainQuality(Model):
+    """Row keyed by (domain_name, day); ``day`` clusters in descending order."""
+
+    __table_name__ = "domain_quality"
+
+    domain_name = columns.Text(primary_key=True)
+    day = columns.Date(primary_key=True, clustering_order="DESC")
+    seen_count = columns.Integer()
+    good_size = columns.Integer()
+    good_count = columns.Integer()
+    good_probability = columns.Float()
+    good_originality = columns.Float()
+    average_good_characters = columns.Float()
+    content_size = columns.Integer()
+    content_count = columns.Integer()
+    content_probability = columns.Float()
+    content_originality = columns.Float()
+    average_content_characters = columns.Float()
+    fetched_count = columns.Integer()
+    average_fetched_good_characters = columns.Float()
+    gain_ratio = columns.Float()
+    # Static column: stored once per domain_name partition, not once per day.
+    update_time = columns.DateTime(static=True)
+
+
+class Content(Model):
+    """Content row keyed by (domain_name, target_link)."""
+
+    __table_name__ = "content"
+
+    domain_name = columns.Text(primary_key=True)
+    target_link = columns.Text(primary_key=True)
+    agent_version = columns.Text()
+    title = columns.Text()
+    # NOTE(review): the DDL declared SET with no element type; Text assumed.
+    links = columns.Set(columns.Text)
+    authors = columns.Set(columns.Text)
+    tags = columns.Set(columns.Text)
+    description = columns.Text()
+    section = columns.Text()
+    article_published_time = columns.Text()
+    text_date = columns.Text()
+    body = columns.Text()
+    body_size = columns.Integer()
+    update_time = columns.DateTime()
+
+
+class ParagraphChecksums(Model):
+    """Pair of BIGINT values; both columns form the primary key."""
+
+    __table_name__ = "paragraph_checksums"
+
+    checksum = columns.BigInt(primary_key=True)
+    url_hash = columns.BigInt(primary_key=True)
+
+
+class Html(Model):
+    """Row keyed by (day, domain_name, source_link)."""
+
+    __table_name__ = "html"
+
+    day = columns.Date(primary_key=True)
+    domain_name = columns.Text(primary_key=True)
+    source_link = columns.Text(primary_key=True)
+    target_link = columns.Text()
+    # NOTE(review): the DDL declared LIST with no element type; Text assumed.
+    redirect_links = columns.List(columns.Text)
+    status = columns.Integer()
+    content = columns.Text()
+    headers = columns.Text()
+    agent_version = columns.Text()
+    update_time = columns.DateTime()
+
+
+class DomainConnections(Model):
+    """Pair (domain_name, linked_domain); both columns form the primary key."""
+
+    __table_name__ = "domain_connections"
+
+    domain_name = columns.Text(primary_key=True)
+    linked_domain = columns.Text(primary_key=True)