zz
This commit is contained in:
parent
135fe2b1f6
commit
30d51944f4
@ -306,6 +306,7 @@ class ParsedDocument:
|
||||
self.article_published_time = None
|
||||
self.current_time = datetime.date.today()
|
||||
|
||||
|
||||
def extract(self,content,bs):
|
||||
"""
|
||||
Parse content and fill the object
|
||||
@ -321,7 +322,7 @@ class ParsedDocument:
|
||||
self.paragraph_sizes = pszs
|
||||
if bs is None:
|
||||
return
|
||||
self.tags,self.authors,self.title,self.article_publilshed_time, self.description,self.section = self.parser.extract_meta(bs)
|
||||
self.tags,self.authors,self.title,self.article_published_time, self.description,self.section = self.parser.extract_meta(bs)
|
||||
|
||||
# Extrakcia linkov zo stranky
|
||||
base = self.work_link
|
||||
@ -361,6 +362,8 @@ class ParsedDocument:
|
||||
|
||||
def __str__(self):
|
||||
r = []
|
||||
if self.authors is not None:
|
||||
r.append(",".join(self.authors))
|
||||
if self.title is not None:
|
||||
r.append(self.title)
|
||||
if self.body is not None:
|
||||
|
@ -193,16 +193,16 @@ def summary(ctx):
|
||||
@cli.command(help="Create database")
|
||||
@click.pass_context
|
||||
@click.argument("replication",default=1)
|
||||
@click.argument("strategy",default="SimpleStrategy")
|
||||
def create_database(ctx,replication,strategy):
|
||||
def create_database(ctx,replication):
|
||||
cluster = websucker.db.connect_cluster(ctx.obj["cassandra_host"],ctx.obj["cassandra_port"],ctx.obj["cassandra_username"],ctx.obj["cassandra_password"])
|
||||
with cluster.connect() as session:
|
||||
session.execute("DROP keyspace IF EXISTS {}".format(ctx.obj["cassandra_keyspace"]))
|
||||
repo = "{{'class':'{}','replication_factor':{}}}".format(strategy,replication)
|
||||
query = "CREATE KEYSPACE {} WITH replication = {}".format(ctx.obj["cassandra_keyspace"],repo)
|
||||
print(query)
|
||||
session.execute(query)
|
||||
import cassandra.cqlengine.connection
|
||||
import cassandra.cqlengine.management as man
|
||||
session.set_keyspace(ctx.obj["cassandra_keyspace"])
|
||||
cassandra.cqlengine.connection.set_session(session)
|
||||
keyspace = ctx.obj["cassandra_keyspace"]
|
||||
man.drop_keyspace(keyspace)
|
||||
man.create_keyspace_simple(keyspace,replication)
|
||||
websucker.schema.create_database(ctx.obj["cassandra_keyspace"],session)
|
||||
|
||||
@cli.command(help="Print keyspace schema")
|
||||
|
@ -9,7 +9,7 @@ class Links(Model):
|
||||
url_query = columns.Text(primary_key=True)
|
||||
url_schema = columns.Text()
|
||||
redirect_target = columns.Text()
|
||||
link_status = columns.Text()
|
||||
link_status = columns.Text(index=True)
|
||||
link_originality = columns.Float()
|
||||
body_size = columns.Integer()
|
||||
update_time = columns.DateTime()
|
||||
@ -17,19 +17,17 @@ class Links(Model):
|
||||
|
||||
class DailyLinks(Model):
|
||||
__table_name__ = "daily_links"
|
||||
day = columns.Integer(primary_key=True)
|
||||
day = columns.Date(primary_key=True)
|
||||
domain_name = columns.Text(primary_key=True)
|
||||
link_status = columns.Text(primary_key=True)
|
||||
url_path = columns.Text(primary_key=True)
|
||||
url_query = columns.Text(primary_key=True)
|
||||
url_schema = columns.Text()
|
||||
redirect_target = columns.Text()
|
||||
link_status = columns.Text()
|
||||
link_originality = columns.Float()
|
||||
body_size = columns.Integer()
|
||||
update_time = columns.DateTime()
|
||||
|
||||
|
||||
class DomainQuality:
|
||||
class DomainQuality(Model):
|
||||
__table_name__ = "domain_quality"
|
||||
domain_name = columns.Text(primary_key=True)
|
||||
day = columns.Date(primary_key=True)
|
||||
@ -47,7 +45,7 @@ class DomainQuality:
|
||||
fetched_count = columns.Integer()
|
||||
average_fetched_good_characters = columns.Float()
|
||||
gain_ratio = columns.Float()
|
||||
update_time = columns.TimeUUID(static=True) #TIMESTAMP STATIC ,
|
||||
update_time = columns.DateTime(static=True) #TIMESTAMP STATIC ,
|
||||
#) WITH CLUSTERING ORDER BY (day DESC);
|
||||
|
||||
class Content(Model):
|
||||
@ -84,7 +82,7 @@ class Html(Model):
|
||||
content = columns.Text()
|
||||
headers = columns.Text()
|
||||
agent_version = columns.Text()
|
||||
update_time = columns.Text()
|
||||
update_time = columns.DateTime()
|
||||
|
||||
class DomainConnections(Model):
|
||||
__table_name__ = "domain_connections"
|
||||
@ -92,10 +90,10 @@ class DomainConnections(Model):
|
||||
linked_domain = columns.Text(primary_key=True)
|
||||
|
||||
def create_database(keyspace,session):
|
||||
sync_table(Links,keyspaces=[keyspace],connections=[session])
|
||||
sync_table(DailyLinks,keyspaces=[keyspace],connections=[session])
|
||||
sync_table(DomainQuality,keyspaces=[keyspace],connections=[session])
|
||||
sync_table(Content,keyspaces=[keyspace],connections=[session])
|
||||
sync_table(ParagraphChecksums,keyspaces=[keyspace],connections=[session])
|
||||
sync_table(Html,keyspaces=[keyspace],connections=[session])
|
||||
sync_table(DomainConnections,keyspaces=[keyspace],connections=[session])
|
||||
sync_table(Links)
|
||||
sync_table(DailyLinks)
|
||||
sync_table(DomainQuality)
|
||||
sync_table(Content)
|
||||
sync_table(ParagraphChecksums)
|
||||
sync_table(Html)
|
||||
sync_table(DomainConnections)
|
||||
|
@ -23,9 +23,9 @@ CREATE INDEX link_status_index ON links(link_status);
|
||||
CREATE TABLE daily_links (
|
||||
day DATE,
|
||||
domain_name TEXT,
|
||||
link_status TEXT,
|
||||
url_path TEXT,
|
||||
url_query TEXT,
|
||||
link_status TEXT,
|
||||
body_size INT,
|
||||
link_originality FLOAT,
|
||||
update_time TIMESTAMP,
|
||||
|
Loading…
Reference in New Issue
Block a user