From 30d51944f43a81260be1eab7cd7b8e028402f479 Mon Sep 17 00:00:00 2001 From: Daniel Hladek Date: Tue, 28 Feb 2023 13:57:13 +0100 Subject: [PATCH] zz --- websucker/agent.py | 5 ++++- websucker/cli.py | 14 +++++++------- websucker/schema.py | 28 +++++++++++++--------------- websucker/schema.sql | 2 +- 4 files changed, 25 insertions(+), 24 deletions(-) diff --git a/websucker/agent.py b/websucker/agent.py index e3f8e8e..96a050e 100755 --- a/websucker/agent.py +++ b/websucker/agent.py @@ -306,6 +306,7 @@ class ParsedDocument: self.article_published_time = None self.current_time = datetime.date.today() + def extract(self,content,bs): """ Parse content and fill the object @@ -321,7 +322,7 @@ class ParsedDocument: self.paragraph_sizes = pszs if bs is None: return - self.tags,self.authors,self.title,self.article_publilshed_time, self.description,self.section = self.parser.extract_meta(bs) + self.tags,self.authors,self.title,self.article_published_time, self.description,self.section = self.parser.extract_meta(bs) # Extrakcia linkov zo stranky base = self.work_link @@ -361,6 +362,8 @@ class ParsedDocument: def __str__(self): r = [] + if self.authors is not None: + r.append(",".join(self.authors)) if self.title is not None: r.append(self.title) if self.body is not None: diff --git a/websucker/cli.py b/websucker/cli.py index 32a1a41..dcdeae8 100644 --- a/websucker/cli.py +++ b/websucker/cli.py @@ -193,16 +193,16 @@ def summary(ctx): @cli.command(help="Create database") @click.pass_context @click.argument("replication",default=1) -@click.argument("strategy",default="SimpleStrategy") -def create_database(ctx,replication,strategy): +def create_database(ctx,replication): cluster = websucker.db.connect_cluster(ctx.obj["cassandra_host"],ctx.obj["cassandra_port"],ctx.obj["cassandra_username"],ctx.obj["cassandra_password"]) with cluster.connect() as session: - session.execute("DROP keyspace IF EXISTS {}".format(ctx.obj["cassandra_keyspace"])) - repo = "{{'class':'{}','replication_factor':{}}}".format(strategy,replication) - query = "CREATE KEYSPACE {} WITH replication = {}".format(ctx.obj["cassandra_keyspace"],repo) - print(query) - session.execute(query) + import cassandra.cqlengine.connection + import cassandra.cqlengine.management as man session.set_keyspace(ctx.obj["cassandra_keyspace"]) + cassandra.cqlengine.connection.set_session(session) + keyspace = ctx.obj["cassandra_keyspace"] + man.drop_keyspace(keyspace) + man.create_keyspace_simple(keyspace,replication) websucker.schema.create_database(ctx.obj["cassandra_keyspace"],session) @cli.command(help="Print keyspace schema") diff --git a/websucker/schema.py b/websucker/schema.py index 8a15b2c..cb81f7f 100644 --- a/websucker/schema.py +++ b/websucker/schema.py @@ -9,7 +9,7 @@ class Links(Model): url_query = columns.Text(primary_key=True) url_schema = columns.Text() redirect_target = columns.Text() - link_status = columns.Text() + link_status = columns.Text(index=True) link_originality = columns.Float() body_size = columns.Integer() update_time = columns.DateTime() @@ -17,19 +17,17 @@ class Links(Model): class DailyLinks(Model): __table_name__ = "daily_links" - day = columns.Integer(primary_key=True) + day = columns.Date(primary_key=True) domain_name = columns.Text(primary_key=True) + link_status = columns.Text(primary_key=True) url_path = columns.Text(primary_key=True) url_query = columns.Text(primary_key=True) - url_schema = columns.Text() - redirect_target = columns.Text() - link_status = columns.Text() link_originality = columns.Float() body_size = columns.Integer() update_time = columns.DateTime() -class DomainQuality: +class DomainQuality(Model): __table_name__ = "domain_quality" domain_name = columns.Text(primary_key=True) day = columns.Date(primary_key=True) @@ -47,7 +45,7 @@ class DomainQuality: fetched_count = columns.Integer() average_fetched_good_characters = columns.Float() gain_ratio = columns.Float() - update_time = columns.TimeUUID(static=True) #TIMESTAMP STATIC , + update_time = columns.DateTime(static=True) #TIMESTAMP STATIC , #) WITH CLUSTERING ORDER BY (day DESC); class Content(Model): @@ -84,7 +82,7 @@ class Html(Model): content = columns.Text() headers = columns.Text() agent_version = columns.Text() - update_time = columns.Text() + update_time = columns.DateTime() class DomainConnections(Model): __table_name__ = "domain_connections" @@ -92,10 +90,10 @@ class DomainConnections(Model): linked_domain = columns.Text(primary_key=True) def create_database(keyspace,session): - sync_table(Links,keyspaces=[keyspace],connections=[session]) - sync_table(DailyLinks,keyspaces=[keyspace],connections=[session]) - sync_table(DomainQuality,keyspaces=[keyspace],connections=[session]) - sync_table(Content,keyspaces=[keyspace],connections=[session]) - sync_table(ParagraphChecksums,keyspaces=[keyspace],connections=[session]) - sync_table(Html,keyspaces=[keyspace],connections=[session]) - sync_table(DomainConnections,keyspaces=[keyspace],connections=[session]) + sync_table(Links) + sync_table(DailyLinks) + sync_table(DomainQuality) + sync_table(Content) + sync_table(ParagraphChecksums) + sync_table(Html) + sync_table(DomainConnections) diff --git a/websucker/schema.sql b/websucker/schema.sql index 783757b..3ac348c 100644 --- a/websucker/schema.sql +++ b/websucker/schema.sql @@ -23,9 +23,9 @@ CREATE INDEX link_status_index ON links(link_status); CREATE TABLE daily_links ( day DATE, domain_name TEXT, + link_status TEXT, url_path TEXT, url_query TEXT, - link_status TEXT, body_size INT, link_originality FLOAT, update_time TIMESTAMP,