This commit is contained in:
Daniel Hládek 2023-02-28 08:56:35 +01:00
parent bd6e8e40ed
commit eb69ad8b7c
4 changed files with 118 additions and 91 deletions

View File

@ -1,10 +1,10 @@
version: '2' version: '3'
services: services:
cassandra: cassandra:
image: bitnami/cassandra:3.11 image: bitnami/cassandra:3.11
environment: environment:
- CASSANDRA_CLUSTER_NAME=cassandra-cluster - CASSANDRA_CLUSTER_NAME=cassandra-cluster
ports: ports:
- "89042:9042" - "8942:9042"
- "87000:7000" - "8700:7000"

View File

@ -5,11 +5,14 @@ from websucker.parser import normalize_link,urlunparse
from websucker.parser import load_parser from websucker.parser import load_parser
from websucker.db import Data from websucker.db import Data
from websucker.db import get_schema from websucker.db import get_schema
import websucker.db
import click import click
import pprint import pprint
import greenstalk import greenstalk
import os import os
from websucker import schema
def create_database_from_context(ctx): def create_database_from_context(ctx):
return Data(ctx.obj["cassandra_keyspace"],ctx.obj["cassandra_host"],ctx.obj["cassandra_port"],ctx.obj["cassandra_username"],ctx.obj["cassandra_password"]) return Data(ctx.obj["cassandra_keyspace"],ctx.obj["cassandra_host"],ctx.obj["cassandra_port"],ctx.obj["cassandra_username"],ctx.obj["cassandra_password"])
@ -23,8 +26,8 @@ def create_queue_from_context(ctx):
@click.option("--cassandra-keyspace",metavar="CASSANDRA_KEYSPACE",help="cassandra keyspace (if defined, value read from CASSANDRA_KEYSPACE env variable)",envvar="CASSANDRA_KEYSPACE",default="websucker",show_default=True) @click.option("--cassandra-keyspace",metavar="CASSANDRA_KEYSPACE",help="cassandra keyspace (if defined, value read from CASSANDRA_KEYSPACE env variable)",envvar="CASSANDRA_KEYSPACE",default="websucker",show_default=True)
@click.option("--cassandra-host",metavar="CASSANDRA_HOST",help="cassandra host (if defined, value read from CASSANDRA_HOST env variable)",envvar="CASSANDRA_HOST",default="127.0.0.1",show_default=True) @click.option("--cassandra-host",metavar="CASSANDRA_HOST",help="cassandra host (if defined, value read from CASSANDRA_HOST env variable)",envvar="CASSANDRA_HOST",default="127.0.0.1",show_default=True)
@click.option("--cassandra-port",metavar="CASSANDRA_PORT",help="cassandra port (if defined, value read from CASSANDRA_PORT env variable)",envvar="CASSANDRA_PORT",default=9042,show_default=True) @click.option("--cassandra-port",metavar="CASSANDRA_PORT",help="cassandra port (if defined, value read from CASSANDRA_PORT env variable)",envvar="CASSANDRA_PORT",default=9042,show_default=True)
@click.option("--cassandra-username",metavar="CASSANDRA_USERNAME",help="cassandra username (if defined, value read from CASSANDRA_USERNAME env variable)",envvar="CASSANDRA_USERNAME") @click.option("--cassandra-username",metavar="CASSANDRA_USERNAME",help="cassandra username (if defined, value read from CASSANDRA_USERNAME env variable)",envvar="CASSANDRA_USERNAME",default="cassandra",show_default=True)
@click.option("--cassandra-password",metavar="CASSANDRA_PASSWORD",help="cassandra password (if defined, value read from CASSANDRA_PASSWORD env variable)",envvar="CASSANDRA_PASSWORD") @click.option("--cassandra-password",metavar="CASSANDRA_PASSWORD",help="cassandra password (if defined, value read from CASSANDRA_PASSWORD env variable)",envvar="CASSANDRA_PASSWORD",default="cassandra",show_default=True)
@click.option("--beanstalkd-tube",metavar="BEANSTALKD_TUBE",help="beanstalkd keyspace (if defined, value read from BEANSTALKD_TUBE env variable)",envvar="BEANSTALKD_TUBE",default="websucker",show_default=True) @click.option("--beanstalkd-tube",metavar="BEANSTALKD_TUBE",help="beanstalkd keyspace (if defined, value read from BEANSTALKD_TUBE env variable)",envvar="BEANSTALKD_TUBE",default="websucker",show_default=True)
@click.option("--beanstalkd-host",metavar="BEANSTALKD_HOST",help="beanstalkd host (if defined, value read from beanstalkd_HOST env variable)",envvar="BEANSTALKD_HOST",default="127.0.0.1",show_default=True) @click.option("--beanstalkd-host",metavar="BEANSTALKD_HOST",help="beanstalkd host (if defined, value read from beanstalkd_HOST env variable)",envvar="BEANSTALKD_HOST",default="127.0.0.1",show_default=True)
@click.option("--beanstalkd-port",metavar="BEANSTALKD_PORT",help="beanstalkd port (if defined, value read from BEANSTALKD_PORT env variable)",envvar="BEANSTALKD_PORT",default=11300,show_default=True) @click.option("--beanstalkd-port",metavar="BEANSTALKD_PORT",help="beanstalkd port (if defined, value read from BEANSTALKD_PORT env variable)",envvar="BEANSTALKD_PORT",default=11300,show_default=True)
@ -187,6 +190,18 @@ def summary(ctx):
p = ctx.obj["parser"] p = ctx.obj["parser"]
db.summary(p) db.summary(p)
@cli.command(help="Create database")
@click.pass_context
@click.argument("replication",default=1)
@click.argument("strategy",default="SimpleStrategy")
def create_database(ctx,replication,strategy):
cluster = websucker.db.connect_cluster(ctx.obj["cassandra_host"],ctx.obj["cassandra_port"],ctx.obj["cassandra_username"],ctx.obj["cassandra_password"])
with cluster.connect() as session:
query = "CREATE KEYSPACE {} WITH replication = {'class':'{}', 'replication_factor' : {}}".format(ctx.obj["cassandra_keyspace"],strategy,replication)
session.execute(query)
session.set_keyspace(ctx.obj["cassandra_keyspace"])
model.create_database()
@cli.command(help="Print keyspace schema") @cli.command(help="Print keyspace schema")
def schema(): def schema():
schema = get_schema() schema = get_schema()

View File

@ -19,19 +19,23 @@ def get_schema():
schema = f.read() schema = f.read()
return str(schema,encoding="utf8") return str(schema,encoding="utf8")
class Data: def connect_cluster(cassandra_host,cassandra_port,username,password):
"""
Database of text documents
"""
def __init__(self,keyspace="websucker",cassandra_host="127.0.0.1",cassandra_port=9042,username=None,password=None):
print("Database {}@{}:{}".format(keyspace,cassandra_host, cassandra_port))
auth_provider = None auth_provider = None
if username is not None and password is not None: if username is not None and password is not None:
auth_provider = PlainTextAuthProvider(username=username, password=password) auth_provider = PlainTextAuthProvider(username=username, password=password)
# execution profile # execution profile
ep = cassandra.cluster.ExecutionProfile(request_timeout=240.0) ep = cassandra.cluster.ExecutionProfile(request_timeout=240.0)
profiles = {cassandra.cluster.EXEC_PROFILE_DEFAULT:ep} profiles = {cassandra.cluster.EXEC_PROFILE_DEFAULT:ep}
self.cluster = cassandra.cluster.Cluster([cassandra_host],port=cassandra_port,execution_profiles=profiles,auth_provider=auth_provider) cluster = cassandra.cluster.Cluster([cassandra_host],port=cassandra_port,execution_profiles=profiles,auth_provider=auth_provider)
return cluster
class Data:
"""
Database of text documents
"""
def __init__(self,keyspace="websucker",cassandra_host="127.0.0.1",cassandra_port=9042,username=None,password=None):
print("Database {}@{}:{}".format(keyspace,cassandra_host, cassandra_port))
self.cluster = connect_cluster(cassandra_host,cassandra_port,username,password)
self.session = self.cluster.connect(keyspace) self.session = self.cluster.connect(keyspace)
self.check_document_select_query = self.session.prepare("SELECT count(url_hash) FROM paragraph_checksums WHERE checksum=?" ) self.check_document_select_query = self.session.prepare("SELECT count(url_hash) FROM paragraph_checksums WHERE checksum=?" )

View File

@ -1,93 +1,101 @@
from cassandra.cqlengine import columns from cassandra.cqlengine import columns
from cassandra.cqlengine.models import Model from cassandra.cqlengine.models import Model
from cassandra.cqlengine.management import sync_table
class Links(Model): class Links(Model):
domain_name = column.Text(primary_key=True) __table_name__ = "links"
url_path = column.Text(primary_key=True) domain_name = columns.Text(primary_key=True)
url_query = column.Text(primary_key=True) url_path = columns.Text(primary_key=True)
url_schema = column.Text() url_query = columns.Text(primary_key=True)
redirect_target = column.Text() url_schema = columns.Text()
link_status = column.Text() redirect_target = columns.Text()
link_originality = column.Float() link_status = columns.Text()
body_size = column.Integer() link_originality = columns.Float()
update_time = column.DateTime() body_size = columns.Integer()
update_time = columns.DateTime()
class DailyLinks(Model): class DailyLinks(Model):
day = column.Integer(primary_key=True) __table_name__ = "daily_links"
domain_name = column.Text(primary_key=True) day = columns.Integer(primary_key=True)
url_path = column.Text(primary_key=True) domain_name = columns.Text(primary_key=True)
url_query = column.Text(primary_key=True) url_path = columns.Text(primary_key=True)
url_schema = column.Text() url_query = columns.Text(primary_key=True)
redirect_target = column.Text() url_schema = columns.Text()
link_status = column.Text() redirect_target = columns.Text()
link_originality = column.Float() link_status = columns.Text()
body_size = column.Integer() link_originality = columns.Float()
update_time = column.DateTime() body_size = columns.Integer()
update_time = columns.DateTime()
CREATE TABLE domain_quality ( class DomainQuality:
domain_name TEXT, __table_name__ = "domain_quality"
day DATE, domain_name = columns.Text(primary_key=True)
seen_count INT, day = columns.Date(primary_key=True)
good_size INT, seen_count = columns.Integer()
good_count INT, good_size = columns.Integer()
good_probability FLOAT, good_count = columns.Integer()
good_originality FLOAT, good_probability = columns.Float()
average_good_characters FLOAT, good_originality = columns.Float()
content_size INT, average_good_characters = columns.Float()
content_count INT, content_size = columns.Integer()
content_probability FLOAT, content_count = columns.Integer()
content_originality FLOAT, content_probability = columns.Float()
average_content_characters FLOAT, content_originality = columns.Float()
fetched_count INT, average_content_characters = columns.Float()
average_fetched_good_characters FLOAT, fetched_count = columns.Integer()
gain_ratio FLOAT, average_fetched_good_characters = columns.Float()
update_time TIMESTAMP STATIC , gain_ratio = columns.Float()
PRIMARY KEY(domain_name,day) update_time = columns.TimeUUID(static=True) #TIMESTAMP STATIC ,
) WITH CLUSTERING ORDER BY (day DESC); #) WITH CLUSTERING ORDER BY (day DESC);
class Content(Model):
__table_name__ = "content"
domain_name = columns.Text(primary_key=True)
target_link = columns.Text(primary_key=True)
agent_version = columns.Text()
title = columns.Text()
links = columns.Set(value_type=columns.Text)
authors = columns.Set(value_type=columns.Text)
tags = columns.Set(value_type=columns.Text)
description = columns.Text()
section = columns.Text()
article_published_time = columns.Text()
text_date = columns.Text()
body = columns.Text()
body_size = columns.Text()
update_time = columns.DateTime()
# PRIMARY KEY(domain_name,target_link),
CREATE TABLE content ( class ParagraphChecksums(Model):
domain_name TEXT, __table_name__ = "paragraph_checksums"
target_link TEXT, checksum = columns.BigInt(primary_key=True)
agent_version TEXT, url_hash = columns.BigInt(primary_key=True)
title TEXT,
links SET<TEXT>,
authors SET<TEXT>,
tags SET<TEXT>,
description TEXT,
section TEXT,
article_published_time TEXT,
text_date TEXT,
body TEXT,
body_size INT,
update_time TIMESTAMP,
PRIMARY KEY(domain_name,target_link),
);
CREATE TABLE paragraph_checksums ( class Html(Model):
checksum BIGINT, __table_name__ = "html"
url_hash BIGINT, day = columns.Date(primary_key=True)
PRIMARY KEY(checksum,url_hash), domain_name = columns.Text(primary_key=True)
); source_link = columns.Text(primary_key=True)
target_link = columns.Text()
redirect_links = columns.List(value_type=columns.Text)
status = columns.Integer()
content = columns.Text()
headers = columns.Text()
agent_version = columns.Text()
update_time = columns.Text()
CREATE TABLE html ( class DomainConnections(Model):
day DATE, __table_name__ = "domain_connections"
domain_name TEXT, domain_name = columns.Text(primary_key=True)
source_link TEXT, linked_domain = columns.Text(primary_key=True)
target_link TEXT,
redirect_links LIST<TEXT>,
status INT,
content TEXT,
headers TEXT,
agent_version TEXT,
update_time TIMESTAMP,
PRIMARY KEY(day,domain_name,source_link)
);
CREATE TABLE domain_connections ( def create_db():
domain_name TEXT, sync_table(Links)
linked_domain TEXT, sync_table(DailyLinks)
PRIMARY KEY (domain_name,linked_domain) sync_table(DomainQuality)
); sync_table(Content)
sync_table(ParagraphChecksums)
sync_table(Html)
sync_table(DomainConnections)