95 lines
2.0 KiB
SQL
95 lines
2.0 KiB
SQL
-- Reset step: remove the whole websucker keyspace (all of its tables and data)
-- so the schema below can be recreated from scratch.
-- IF EXISTS keeps the script runnable on a fresh cluster where the keyspace
-- has never been created; a bare DROP KEYSPACE fails in that case.
DROP KEYSPACE IF EXISTS websucker;
|
|
|
|
-- Keyspace holding every table defined in this script.
-- SimpleStrategy with replication_factor 1 keeps a single copy of each row.
-- NOTE(review): presumably intended for a single-node / development setup;
-- confirm before using this on a multi-node cluster.
CREATE KEYSPACE websucker
    WITH replication = {
        'class': 'SimpleStrategy',
        'replication_factor': 1
    };
|
|
|
|
-- Make websucker the session's current keyspace so the unqualified table and
-- index names in the statements that follow resolve inside it.
USE websucker;
|
|
|
|
-- Links table: one row per (domain_name, url_path, url_query).
-- domain_name is the partition key; url_path and url_query are clustering
-- columns, so all links of a domain live in the same partition.
CREATE TABLE links (
    domain_name      TEXT,
    url_path         TEXT,
    url_query        TEXT,
    url_schema       TEXT,       -- presumably the URL scheme (e.g. http/https); confirm
    redirect_target  TEXT,
    link_status      TEXT,       -- also covered by the secondary index link_status_index
    link_originality FLOAT,
    body_size        INT,
    update_time      TIMESTAMP,
    PRIMARY KEY (domain_name, url_path, url_query)
);
|
|
|
|
-- Secondary index so rows in links can be selected by link_status without
-- supplying the partition key.
-- NOTE(review): link_status looks like a low-cardinality column; verify that
-- the index performs acceptably at the expected data volume.
CREATE INDEX link_status_index
    ON links (link_status);
|
|
|
|
-- Per-day view of links. The partition key is day alone, so every link
-- recorded for a given day is stored in that day's partition; rows are
-- clustered by domain_name, then link_status, then url_path and url_query.
CREATE TABLE daily_links (
    day              DATE,
    domain_name      TEXT,
    url_path         TEXT,
    url_query        TEXT,
    link_status      TEXT,
    body_size        INT,
    link_originality FLOAT,
    update_time      TIMESTAMP,
    PRIMARY KEY (day, domain_name, link_status, url_path, url_query)
);
|
|
|
|
-- Per-domain, per-day quality statistics.
-- Partitioned by domain_name and clustered by day in descending order, so the
-- most recent day comes first within each domain's partition.
CREATE TABLE domain_quality (
    domain_name                     TEXT,
    day                             DATE,
    seen_count                      INT,
    good_size                       INT,
    good_count                      INT,
    good_probability                FLOAT,
    good_originality                FLOAT,
    average_good_characters         FLOAT,
    content_size                    INT,
    content_count                   INT,
    content_probability             FLOAT,
    content_originality             FLOAT,
    average_content_characters      FLOAT,
    fetched_count                   INT,
    average_fetched_good_characters FLOAT,
    gain_ratio                      FLOAT,
    -- STATIC: a single value shared by all rows (days) of one domain partition.
    update_time                     TIMESTAMP STATIC,
    PRIMARY KEY (domain_name, day)
) WITH CLUSTERING ORDER BY (day DESC);
|
|
|
|
|
|
-- Extracted page content: one row per (domain_name, target_link).
-- domain_name is the partition key and target_link the clustering column.
CREATE TABLE content (
    domain_name            TEXT,
    target_link            TEXT,
    agent_version          TEXT,
    title                  TEXT,
    links                  SET<TEXT>,
    authors                SET<TEXT>,
    tags                   SET<TEXT>,
    description            TEXT,
    section                TEXT,
    -- NOTE(review): the two date-like columns below are TEXT, not TIMESTAMP;
    -- presumably they hold raw, unparsed values - confirm with the writer code.
    article_published_time TEXT,
    text_date              TEXT,
    body                   TEXT,
    body_size              INT,
    update_time            TIMESTAMP,
    -- No trailing comma after the key clause: it is outside the documented
    -- CREATE TABLE grammar and the other tables in this script omit it.
    PRIMARY KEY (domain_name, target_link)
);
|
|
|
|
-- Maps a checksum to the url_hash values stored with it.
-- checksum is the partition key and url_hash the clustering column; the table
-- has no non-key columns, so a row only records that the pair exists.
-- NOTE(review): presumably used to detect paragraphs repeated across pages;
-- confirm against the code that writes to this table.
CREATE TABLE paragraph_checksums (
    checksum BIGINT,
    url_hash BIGINT,
    -- No trailing comma after the key clause: it is outside the documented
    -- CREATE TABLE grammar and the other tables in this script omit it.
    PRIMARY KEY (checksum, url_hash)
);
|
|
|
|
-- Fetched responses, partitioned by day and clustered by domain_name and
-- source_link: one row per (day, domain_name, source_link).
CREATE TABLE html (
    day            DATE,
    domain_name    TEXT,
    source_link    TEXT,
    target_link    TEXT,
    redirect_links LIST<TEXT>,   -- ordered collection (LIST, not SET)
    status         INT,          -- presumably the HTTP status code; confirm
    content        TEXT,
    headers        TEXT,
    agent_version  TEXT,
    update_time    TIMESTAMP,
    PRIMARY KEY (day, domain_name, source_link)
);
|
|
|