websucker-pip/websucker/schema.py

94 lines
2.3 KiB
Python
Raw Normal View History

2023-02-23 15:08:14 +00:00
from cassandra.cqlengine import columns
from cassandra.cqlengine.models import Model
class Links(Model):
    """cqlengine model with one row per (domain_name, url_path, url_query).

    With several ``primary_key=True`` columns and no explicit
    ``partition_key=True``, cqlengine uses the first one (``domain_name``)
    as the partition key and the remaining ones as clustering columns.
    """

    # Primary key columns, in declaration order.
    domain_name = columns.Text(primary_key=True)
    url_path = columns.Text(primary_key=True)
    url_query = columns.Text(primary_key=True)

    # Regular (non-key) columns.
    url_schema = columns.Text()
    redirect_target = columns.Text()
    link_status = columns.Text()
    link_originality = columns.Float()
    body_size = columns.Integer()
    update_time = columns.DateTime()
class DailyLinks(Model):
    """cqlengine model keyed by (day, domain_name, url_path, url_query).

    With several ``primary_key=True`` columns and no explicit
    ``partition_key=True``, cqlengine uses the first one (``day``) as the
    partition key and the remaining ones as clustering columns.
    """

    # Primary key columns, in declaration order.
    day = columns.Integer(primary_key=True)
    domain_name = columns.Text(primary_key=True)
    url_path = columns.Text(primary_key=True)
    url_query = columns.Text(primary_key=True)

    # Regular (non-key) columns.
    url_schema = columns.Text()
    redirect_target = columns.Text()
    link_status = columns.Text()
    link_originality = columns.Float()
    body_size = columns.Integer()
    update_time = columns.DateTime()
-- One row per (domain_name, day).
-- Partition key: domain_name. Clustering column: day, stored newest-first.
CREATE TABLE domain_quality (
    domain_name                     TEXT,
    day                             DATE,
    seen_count                      INT,
    good_size                       INT,
    good_count                      INT,
    good_probability                FLOAT,
    good_originality                FLOAT,
    average_good_characters         FLOAT,
    content_size                    INT,
    content_count                   INT,
    content_probability             FLOAT,
    content_originality             FLOAT,
    average_content_characters      FLOAT,
    fetched_count                   INT,
    average_fetched_good_characters FLOAT,
    gain_ratio                      FLOAT,
    -- STATIC: one value shared by all rows of the same domain_name partition.
    update_time                     TIMESTAMP STATIC,
    PRIMARY KEY (domain_name, day)
) WITH CLUSTERING ORDER BY (day DESC);
-- One row per (domain_name, target_link).
-- Partition key: domain_name. Clustering column: target_link.
CREATE TABLE content (
    domain_name            TEXT,
    target_link            TEXT,
    agent_version          TEXT,
    title                  TEXT,
    links                  SET<TEXT>,
    authors                SET<TEXT>,
    tags                   SET<TEXT>,
    description            TEXT,
    section                TEXT,
    -- Both date-like fields are stored as TEXT, not as DATE/TIMESTAMP.
    article_published_time TEXT,
    text_date              TEXT,
    body                   TEXT,
    body_size              INT,
    update_time            TIMESTAMP,
    PRIMARY KEY (domain_name, target_link)
);
-- One row per (checksum, url_hash) pair; the table has no non-key columns.
-- Partition key: checksum. Clustering column: url_hash.
CREATE TABLE paragraph_checksums (
    checksum BIGINT,
    url_hash BIGINT,
    PRIMARY KEY (checksum, url_hash)
);
-- One row per (day, domain_name, source_link).
-- Partition key: day. Clustering columns: domain_name, source_link.
CREATE TABLE html (
    day            DATE,
    domain_name    TEXT,
    source_link    TEXT,
    target_link    TEXT,
    redirect_links LIST<TEXT>,
    status         INT,
    content        TEXT,
    headers        TEXT,
    agent_version  TEXT,
    update_time    TIMESTAMP,
    PRIMARY KEY (day, domain_name, source_link)
);
-- One row per (domain_name, linked_domain) pair; the table has no non-key columns.
-- Partition key: domain_name. Clustering column: linked_domain.
CREATE TABLE domain_connections (
    domain_name   TEXT,
    linked_domain TEXT,
    PRIMARY KEY (domain_name, linked_domain)
);