wip schema
This commit is contained in:
parent
56bf59e5e9
commit
1e7ac17d90
93
websucker/schema.py
Normal file
93
websucker/schema.py
Normal file
@@ -0,0 +1,93 @@
|
||||
from cassandra.cqlengine import columns
|
||||
from cassandra.cqlengine.models import Model
|
||||
|
||||
class Links(Model):
    """cqlengine model holding one row per (domain_name, url_path, url_query).

    cqlengine takes the first ``primary_key=True`` column (``domain_name``)
    as the partition key and the following ones (``url_path``,
    ``url_query``) as clustering keys, in declaration order.
    """

    # Key columns -- declaration order defines the primary-key order.
    domain_name = columns.Text(primary_key=True)
    url_path = columns.Text(primary_key=True)
    url_query = columns.Text(primary_key=True)

    # Non-key columns.
    url_schema = columns.Text()
    redirect_target = columns.Text()
    link_status = columns.Text()
    link_originality = columns.Float()
    body_size = columns.Integer()
    update_time = columns.DateTime()
|
||||
|
||||
|
||||
class DailyLinks(Model):
    """cqlengine model holding one row per (day, domain_name, url_path, url_query).

    cqlengine takes the first ``primary_key=True`` column (``day``) as the
    partition key and the following ones (``domain_name``, ``url_path``,
    ``url_query``) as clustering keys, in declaration order.
    """

    # Key columns -- declaration order defines the primary-key order.
    # NOTE(review): ``day`` is an Integer here; its encoding is not visible
    # in this file -- confirm with the code that writes it.
    day = columns.Integer(primary_key=True)
    domain_name = columns.Text(primary_key=True)
    url_path = columns.Text(primary_key=True)
    url_query = columns.Text(primary_key=True)

    # Non-key columns.
    url_schema = columns.Text()
    redirect_target = columns.Text()
    link_status = columns.Text()
    link_originality = columns.Float()
    body_size = columns.Integer()
    update_time = columns.DateTime()
|
||||
|
||||
|
||||
class DomainQuality(Model):
    """cqlengine model for the ``domain_quality`` table.

    Equivalent CQL key definition::

        PRIMARY KEY (domain_name, day)
        WITH CLUSTERING ORDER BY (day DESC)

    ``domain_name`` is the partition key; ``day`` is the single clustering
    key, stored in descending order. ``update_time`` is a static column,
    i.e. one value shared by all rows of a ``domain_name`` partition.
    """

    # Pin the table name explicitly rather than relying on cqlengine's
    # name derivation from the class name.
    __table_name__ = "domain_quality"

    # Key columns -- declaration order defines the primary-key order.
    domain_name = columns.Text(primary_key=True)
    day = columns.Date(primary_key=True, clustering_order="DESC")

    # Per-(domain, day) counters and statistics.
    seen_count = columns.Integer()
    good_size = columns.Integer()
    good_count = columns.Integer()
    good_probability = columns.Float()
    good_originality = columns.Float()
    average_good_characters = columns.Float()
    content_size = columns.Integer()
    content_count = columns.Integer()
    content_probability = columns.Float()
    content_originality = columns.Float()
    average_content_characters = columns.Float()
    fetched_count = columns.Integer()
    average_fetched_good_characters = columns.Float()
    gain_ratio = columns.Float()

    # Static: stored once per partition, not once per clustering row.
    update_time = columns.DateTime(static=True)
|
||||
|
||||
|
||||
class Content(Model):
    """cqlengine model for the ``content`` table.

    Equivalent CQL key definition: ``PRIMARY KEY (domain_name, target_link)``
    -- ``domain_name`` is the partition key and ``target_link`` the
    clustering key.
    """

    # Pin the table name explicitly rather than relying on cqlengine's
    # name derivation from the class name.
    __table_name__ = "content"

    # Key columns -- declaration order defines the primary-key order.
    domain_name = columns.Text(primary_key=True)
    target_link = columns.Text(primary_key=True)

    # Non-key columns.
    agent_version = columns.Text()
    title = columns.Text()
    links = columns.Set(columns.Text)
    authors = columns.Set(columns.Text)
    tags = columns.Set(columns.Text)
    description = columns.Text()
    section = columns.Text()
    # Both date-like fields are kept as text, as in the CQL definition.
    article_published_time = columns.Text()
    text_date = columns.Text()
    body = columns.Text()
    body_size = columns.Integer()
    update_time = columns.DateTime()
|
||||
|
||||
class ParagraphChecksums(Model):
    """cqlengine model for the ``paragraph_checksums`` table.

    Equivalent CQL key definition: ``PRIMARY KEY (checksum, url_hash)`` --
    ``checksum`` is the partition key and ``url_hash`` the clustering key.
    The table has no non-key columns.
    """

    # Pin the table name explicitly rather than relying on cqlengine's
    # name derivation from the class name.
    __table_name__ = "paragraph_checksums"

    # Key columns -- declaration order defines the primary-key order.
    checksum = columns.BigInt(primary_key=True)
    url_hash = columns.BigInt(primary_key=True)
|
||||
|
||||
class Html(Model):
    """cqlengine model for the ``html`` table.

    Equivalent CQL key definition:
    ``PRIMARY KEY (day, domain_name, source_link)`` -- ``day`` is the
    partition key; ``domain_name`` and ``source_link`` are clustering keys.
    """

    # Pin the table name explicitly rather than relying on cqlengine's
    # name derivation from the class name.
    __table_name__ = "html"

    # Key columns -- declaration order defines the primary-key order.
    day = columns.Date(primary_key=True)
    domain_name = columns.Text(primary_key=True)
    source_link = columns.Text(primary_key=True)

    # Non-key columns.
    target_link = columns.Text()
    redirect_links = columns.List(columns.Text)
    status = columns.Integer()
    content = columns.Text()
    headers = columns.Text()
    agent_version = columns.Text()
    update_time = columns.DateTime()
|
||||
|
||||
class DomainConnections(Model):
    """cqlengine model for the ``domain_connections`` table.

    Equivalent CQL key definition:
    ``PRIMARY KEY (domain_name, linked_domain)`` -- ``domain_name`` is the
    partition key and ``linked_domain`` the clustering key. The table has
    no non-key columns.
    """

    # Pin the table name explicitly rather than relying on cqlengine's
    # name derivation from the class name.
    __table_name__ = "domain_connections"

    # Key columns -- declaration order defines the primary-key order.
    domain_name = columns.Text(primary_key=True)
    linked_domain = columns.Text(primary_key=True)
|
Loading…
Reference in New Issue
Block a user