This commit is contained in:
Daniel Hládek 2023-02-28 13:57:13 +01:00
parent 135fe2b1f6
commit 30d51944f4
4 changed files with 25 additions and 24 deletions

View File

@@ -306,6 +306,7 @@ class ParsedDocument:
self.article_published_time = None self.article_published_time = None
self.current_time = datetime.date.today() self.current_time = datetime.date.today()
def extract(self,content,bs): def extract(self,content,bs):
""" """
Parse content and fill the object Parse content and fill the object
@@ -321,7 +322,7 @@ class ParsedDocument:
self.paragraph_sizes = pszs self.paragraph_sizes = pszs
if bs is None: if bs is None:
return return
self.tags,self.authors,self.title,self.article_publilshed_time, self.description,self.section = self.parser.extract_meta(bs) self.tags,self.authors,self.title,self.article_published_time, self.description,self.section = self.parser.extract_meta(bs)
# Extrakcia linkov zo stranky # Extrakcia linkov zo stranky
base = self.work_link base = self.work_link
@@ -361,6 +362,8 @@ class ParsedDocument:
def __str__(self): def __str__(self):
r = [] r = []
if self.authors is not None:
r.append(",".join(self.authors))
if self.title is not None: if self.title is not None:
r.append(self.title) r.append(self.title)
if self.body is not None: if self.body is not None:

View File

@@ -193,16 +193,16 @@ def summary(ctx):
@cli.command(help="Create database") @cli.command(help="Create database")
@click.pass_context @click.pass_context
@click.argument("replication",default=1) @click.argument("replication",default=1)
@click.argument("strategy",default="SimpleStrategy") def create_database(ctx,replication):
def create_database(ctx,replication,strategy):
cluster = websucker.db.connect_cluster(ctx.obj["cassandra_host"],ctx.obj["cassandra_port"],ctx.obj["cassandra_username"],ctx.obj["cassandra_password"]) cluster = websucker.db.connect_cluster(ctx.obj["cassandra_host"],ctx.obj["cassandra_port"],ctx.obj["cassandra_username"],ctx.obj["cassandra_password"])
with cluster.connect() as session: with cluster.connect() as session:
session.execute("DROP keyspace IF EXISTS {}".format(ctx.obj["cassandra_keyspace"])) import cassandra.cqlengine.connection
repo = "{{'class':'{}','replication_factor':{}}}".format(strategy,replication) import cassandra.cqlengine.management as man
query = "CREATE KEYSPACE {} WITH replication = {}".format(ctx.obj["cassandra_keyspace"],repo)
print(query)
session.execute(query)
session.set_keyspace(ctx.obj["cassandra_keyspace"]) session.set_keyspace(ctx.obj["cassandra_keyspace"])
cassandra.cqlengine.connection.set_session(session)
keyspace = ctx.obj["cassandra_keyspace"]
man.drop_keyspace(keyspace)
man.create_keyspace_simple(keyspace,replication)
websucker.schema.create_database(ctx.obj["cassandra_keyspace"],session) websucker.schema.create_database(ctx.obj["cassandra_keyspace"],session)
@cli.command(help="Print keyspace schema") @cli.command(help="Print keyspace schema")

View File

@@ -9,7 +9,7 @@ class Links(Model):
url_query = columns.Text(primary_key=True) url_query = columns.Text(primary_key=True)
url_schema = columns.Text() url_schema = columns.Text()
redirect_target = columns.Text() redirect_target = columns.Text()
link_status = columns.Text() link_status = columns.Text(index=True)
link_originality = columns.Float() link_originality = columns.Float()
body_size = columns.Integer() body_size = columns.Integer()
update_time = columns.DateTime() update_time = columns.DateTime()
@@ -17,19 +17,17 @@ class Links(Model):
class DailyLinks(Model): class DailyLinks(Model):
__table_name__ = "daily_links" __table_name__ = "daily_links"
day = columns.Integer(primary_key=True) day = columns.Date(primary_key=True)
domain_name = columns.Text(primary_key=True) domain_name = columns.Text(primary_key=True)
link_status = columns.Text(primary_key=True)
url_path = columns.Text(primary_key=True) url_path = columns.Text(primary_key=True)
url_query = columns.Text(primary_key=True) url_query = columns.Text(primary_key=True)
url_schema = columns.Text()
redirect_target = columns.Text()
link_status = columns.Text()
link_originality = columns.Float() link_originality = columns.Float()
body_size = columns.Integer() body_size = columns.Integer()
update_time = columns.DateTime() update_time = columns.DateTime()
class DomainQuality: class DomainQuality(Model):
__table_name__ = "domain_quality" __table_name__ = "domain_quality"
domain_name = columns.Text(primary_key=True) domain_name = columns.Text(primary_key=True)
day = columns.Date(primary_key=True) day = columns.Date(primary_key=True)
@@ -47,7 +45,7 @@ class DomainQuality:
fetched_count = columns.Integer() fetched_count = columns.Integer()
average_fetched_good_characters = columns.Float() average_fetched_good_characters = columns.Float()
gain_ratio = columns.Float() gain_ratio = columns.Float()
update_time = columns.TimeUUID(static=True) #TIMESTAMP STATIC , update_time = columns.DateTime(static=True) #TIMESTAMP STATIC ,
#) WITH CLUSTERING ORDER BY (day DESC); #) WITH CLUSTERING ORDER BY (day DESC);
class Content(Model): class Content(Model):
@@ -84,7 +82,7 @@ class Html(Model):
content = columns.Text() content = columns.Text()
headers = columns.Text() headers = columns.Text()
agent_version = columns.Text() agent_version = columns.Text()
update_time = columns.Text() update_time = columns.DateTime()
class DomainConnections(Model): class DomainConnections(Model):
__table_name__ = "domain_connections" __table_name__ = "domain_connections"
@@ -92,10 +90,10 @@ class DomainConnections(Model):
linked_domain = columns.Text(primary_key=True) linked_domain = columns.Text(primary_key=True)
def create_database(keyspace,session): def create_database(keyspace,session):
sync_table(Links,keyspaces=[keyspace],connections=[session]) sync_table(Links)
sync_table(DailyLinks,keyspaces=[keyspace],connections=[session]) sync_table(DailyLinks)
sync_table(DomainQuality,keyspaces=[keyspace],connections=[session]) sync_table(DomainQuality)
sync_table(Content,keyspaces=[keyspace],connections=[session]) sync_table(Content)
sync_table(ParagraphChecksums,keyspaces=[keyspace],connections=[session]) sync_table(ParagraphChecksums)
sync_table(Html,keyspaces=[keyspace],connections=[session]) sync_table(Html)
sync_table(DomainConnections,keyspaces=[keyspace],connections=[session]) sync_table(DomainConnections)

View File

@@ -23,9 +23,9 @@ CREATE INDEX link_status_index ON links(link_status);
CREATE TABLE daily_links ( CREATE TABLE daily_links (
day DATE, day DATE,
domain_name TEXT, domain_name TEXT,
link_status TEXT,
url_path TEXT, url_path TEXT,
url_query TEXT, url_query TEXT,
link_status TEXT,
body_size INT, body_size INT,
link_originality FLOAT, link_originality FLOAT,
update_time TIMESTAMP, update_time TIMESTAMP,