94 lines
2.3 KiB
Python
94 lines
2.3 KiB
Python
from cassandra.cqlengine import columns
|
|
from cassandra.cqlengine.models import Model
|
|
|
|
class Links(Model):
    """cqlengine model for a table of links keyed by domain, path and query.

    Key layout (cqlengine rule: the first ``primary_key=True`` column is the
    partition key, the following ones are clustering columns, in declaration
    order):

    * partition key: ``domain_name``
    * clustering columns: ``url_path``, ``url_query``

    All remaining columns are regular (non-key) columns.
    """

    # --- primary key -------------------------------------------------------
    domain_name = columns.Text(primary_key=True)
    url_path = columns.Text(primary_key=True)
    url_query = columns.Text(primary_key=True)

    # --- regular columns ---------------------------------------------------
    # NOTE(review): presumably the URL scheme (http/https) -- confirm with
    # the code that writes this column.
    url_schema = columns.Text()
    redirect_target = columns.Text()
    link_status = columns.Text()
    link_originality = columns.Float()
    body_size = columns.Integer()
    update_time = columns.DateTime()
|
|
|
|
|
|
class DailyLinks(Model):
    """cqlengine model for links bucketed by day.

    Key layout (cqlengine rule: the first ``primary_key=True`` column is the
    partition key, the following ones are clustering columns, in declaration
    order):

    * partition key: ``day``
    * clustering columns: ``domain_name``, ``url_path``, ``url_query``

    The non-key columns are the same set declared on ``Links``.
    """

    # --- primary key -------------------------------------------------------
    # NOTE(review): ``day`` is an integer here; its encoding (e.g. days since
    # epoch vs. YYYYMMDD) is not visible in this file -- confirm with writers.
    day = columns.Integer(primary_key=True)
    domain_name = columns.Text(primary_key=True)
    url_path = columns.Text(primary_key=True)
    url_query = columns.Text(primary_key=True)

    # --- regular columns ---------------------------------------------------
    url_schema = columns.Text()
    redirect_target = columns.Text()
    link_status = columns.Text()
    link_originality = columns.Float()
    body_size = columns.Integer()
    update_time = columns.DateTime()
|
|
|
|
|
|
-- domain_quality: one partition per domain_name, one row per day.
-- Rows inside a partition are stored newest day first (CLUSTERING ORDER).
-- update_time is STATIC, i.e. a single value shared by the whole partition.
CREATE TABLE domain_quality (
    domain_name                     text,
    day                             date,
    seen_count                      int,
    good_size                       int,
    good_count                      int,
    good_probability                float,
    good_originality                float,
    average_good_characters         float,
    content_size                    int,
    content_count                   int,
    content_probability             float,
    content_originality             float,
    average_content_characters      float,
    fetched_count                   int,
    average_fetched_good_characters float,
    gain_ratio                      float,
    update_time                     timestamp STATIC,
    PRIMARY KEY ((domain_name), day)
) WITH CLUSTERING ORDER BY (day DESC);
|
|
|
|
|
|
-- content: one partition per domain_name, one row per target_link.
-- links / authors / tags are unordered sets of text values.
CREATE TABLE content (
    domain_name            text,
    target_link            text,
    agent_version          text,
    title                  text,
    links                  set<text>,
    authors                set<text>,
    tags                   set<text>,
    description            text,
    section                text,
    article_published_time text,
    text_date              text,
    body                   text,
    body_size              int,
    update_time            timestamp,
    PRIMARY KEY ((domain_name), target_link)
);
|
|
|
|
-- paragraph_checksums: key-only table; partitioned by checksum, with
-- url_hash as the clustering column (one row per checksum/url_hash pair).
CREATE TABLE paragraph_checksums (
    checksum bigint,
    url_hash bigint,
    PRIMARY KEY ((checksum), url_hash)
);
|
|
|
|
-- html: one partition per day; rows are clustered by domain_name and then
-- source_link. redirect_links is an ordered list of text values.
CREATE TABLE html (
    day            date,
    domain_name    text,
    source_link    text,
    target_link    text,
    redirect_links list<text>,
    status         int,
    content        text,
    headers        text,
    agent_version  text,
    update_time    timestamp,
    PRIMARY KEY ((day), domain_name, source_link)
);
|
|
|
|
-- domain_connections: key-only table; partitioned by domain_name, with
-- linked_domain as the clustering column (one row per domain pair).
CREATE TABLE domain_connections (
    domain_name   text,
    linked_domain text,
    PRIMARY KEY ((domain_name), linked_domain)
);
|