wip schema
This commit is contained in:
parent
56bf59e5e9
commit
1e7ac17d90
93
websucker/schema.py
Normal file
93
websucker/schema.py
Normal file
@ -0,0 +1,93 @@
|
|||||||
|
from cassandra.cqlengine import columns
|
||||||
|
from cassandra.cqlengine.models import Model
|
||||||
|
|
||||||
|
class Links(Model):
    """cqlengine model for a link record keyed by its URL components.

    The primary key is (domain_name, url_path, url_query). In cqlengine the
    first ``primary_key=True`` column is the partition key and the remaining
    ones are clustering keys, so rows are partitioned by ``domain_name``.
    """

    # Primary key columns (declaration order matters: partition key first).
    domain_name = columns.Text(primary_key=True)
    url_path = columns.Text(primary_key=True)
    url_query = columns.Text(primary_key=True)

    # Regular (non-key) columns.
    url_schema = columns.Text()
    redirect_target = columns.Text()
    link_status = columns.Text()
    link_originality = columns.Float()
    body_size = columns.Integer()
    update_time = columns.DateTime()
|
||||||
|
|
||||||
|
|
||||||
|
class DailyLinks(Model):
    """cqlengine model for a link record keyed by day and URL components.

    The primary key is (day, domain_name, url_path, url_query). In cqlengine
    the first ``primary_key=True`` column is the partition key and the
    remaining ones are clustering keys, so rows are partitioned by ``day``.
    """

    # Primary key columns (declaration order matters: partition key first).
    day = columns.Integer(primary_key=True)
    domain_name = columns.Text(primary_key=True)
    url_path = columns.Text(primary_key=True)
    url_query = columns.Text(primary_key=True)

    # Regular (non-key) columns.
    url_schema = columns.Text()
    redirect_target = columns.Text()
    link_status = columns.Text()
    link_originality = columns.Float()
    body_size = columns.Integer()
    update_time = columns.DateTime()
|
||||||
|
|
||||||
|
|
||||||
|
-- Table domain_quality: one row per (domain_name, day).
-- Partition key: domain_name. Clustering column: day, stored in descending order.
-- NOTE(review): this is raw CQL; it will not parse as Python if this file is
-- imported as a module -- confirm where this DDL is meant to live.
CREATE TABLE domain_quality (
    domain_name                     TEXT,
    day                             DATE,
    seen_count                      INT,
    good_size                       INT,
    good_count                      INT,
    good_probability                FLOAT,
    good_originality                FLOAT,
    average_good_characters         FLOAT,
    content_size                    INT,
    content_count                   INT,
    content_probability             FLOAT,
    content_originality             FLOAT,
    average_content_characters      FLOAT,
    fetched_count                   INT,
    average_fetched_good_characters FLOAT,
    gain_ratio                      FLOAT,
    -- STATIC: a single value per domain_name partition, shared by all of its rows.
    update_time                     TIMESTAMP STATIC,
    PRIMARY KEY (domain_name, day)
) WITH CLUSTERING ORDER BY (day DESC);
|
||||||
|
|
||||||
|
|
||||||
|
-- Table content: one row per (domain_name, target_link).
-- Partition key: domain_name. Clustering column: target_link.
CREATE TABLE content (
    domain_name            TEXT,
    target_link            TEXT,
    agent_version          TEXT,
    title                  TEXT,
    -- Collection columns: unordered sets of text values.
    links                  SET<TEXT>,
    authors                SET<TEXT>,
    tags                   SET<TEXT>,
    description            TEXT,
    section                TEXT,
    -- Both date-like fields are stored as TEXT, not as DATE/TIMESTAMP.
    article_published_time TEXT,
    text_date              TEXT,
    body                   TEXT,
    body_size              INT,
    update_time            TIMESTAMP,
    -- Trailing comma after the key clause is kept as in the original statement.
    PRIMARY KEY (domain_name, target_link),
);
|
||||||
|
|
||||||
|
-- Table paragraph_checksums: one row per (checksum, url_hash) pair, no other columns.
-- Partition key: checksum. Clustering column: url_hash.
CREATE TABLE paragraph_checksums (
    checksum BIGINT,
    url_hash BIGINT,
    -- Trailing comma after the key clause is kept as in the original statement.
    PRIMARY KEY (checksum, url_hash),
);
|
||||||
|
|
||||||
|
-- Table html: one row per (day, domain_name, source_link).
-- Partition key: day. Clustering columns: domain_name, then source_link.
CREATE TABLE html (
    day            DATE,
    domain_name    TEXT,
    source_link    TEXT,
    target_link    TEXT,
    -- Ordered collection (LIST), unlike the SET collections used elsewhere.
    redirect_links LIST<TEXT>,
    status         INT,
    content        TEXT,
    headers        TEXT,
    agent_version  TEXT,
    update_time    TIMESTAMP,
    PRIMARY KEY (day, domain_name, source_link)
);
|
||||||
|
|
||||||
|
-- Table domain_connections: one row per (domain_name, linked_domain) pair, no other columns.
-- Partition key: domain_name. Clustering column: linked_domain.
CREATE TABLE domain_connections (
    domain_name   TEXT,
    linked_domain TEXT,
    PRIMARY KEY (domain_name, linked_domain)
);
|
Loading…
Reference in New Issue
Block a user