zz
This commit is contained in:
parent
135fe2b1f6
commit
30d51944f4
@ -306,6 +306,7 @@ class ParsedDocument:
|
|||||||
self.article_published_time = None
|
self.article_published_time = None
|
||||||
self.current_time = datetime.date.today()
|
self.current_time = datetime.date.today()
|
||||||
|
|
||||||
|
|
||||||
def extract(self,content,bs):
|
def extract(self,content,bs):
|
||||||
"""
|
"""
|
||||||
Parse content and fill the object
|
Parse content and fill the object
|
||||||
@ -321,7 +322,7 @@ class ParsedDocument:
|
|||||||
self.paragraph_sizes = pszs
|
self.paragraph_sizes = pszs
|
||||||
if bs is None:
|
if bs is None:
|
||||||
return
|
return
|
||||||
self.tags,self.authors,self.title,self.article_publilshed_time, self.description,self.section = self.parser.extract_meta(bs)
|
self.tags,self.authors,self.title,self.article_published_time, self.description,self.section = self.parser.extract_meta(bs)
|
||||||
|
|
||||||
# Extrakcia linkov zo stranky
|
# Extrakcia linkov zo stranky
|
||||||
base = self.work_link
|
base = self.work_link
|
||||||
@ -361,6 +362,8 @@ class ParsedDocument:
|
|||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
r = []
|
r = []
|
||||||
|
if self.authors is not None:
|
||||||
|
r.append(",".join(self.authors))
|
||||||
if self.title is not None:
|
if self.title is not None:
|
||||||
r.append(self.title)
|
r.append(self.title)
|
||||||
if self.body is not None:
|
if self.body is not None:
|
||||||
|
@ -193,16 +193,16 @@ def summary(ctx):
|
|||||||
@cli.command(help="Create database")
|
@cli.command(help="Create database")
|
||||||
@click.pass_context
|
@click.pass_context
|
||||||
@click.argument("replication",default=1)
|
@click.argument("replication",default=1)
|
||||||
@click.argument("strategy",default="SimpleStrategy")
|
def create_database(ctx,replication):
|
||||||
def create_database(ctx,replication,strategy):
|
|
||||||
cluster = websucker.db.connect_cluster(ctx.obj["cassandra_host"],ctx.obj["cassandra_port"],ctx.obj["cassandra_username"],ctx.obj["cassandra_password"])
|
cluster = websucker.db.connect_cluster(ctx.obj["cassandra_host"],ctx.obj["cassandra_port"],ctx.obj["cassandra_username"],ctx.obj["cassandra_password"])
|
||||||
with cluster.connect() as session:
|
with cluster.connect() as session:
|
||||||
session.execute("DROP keyspace IF EXISTS {}".format(ctx.obj["cassandra_keyspace"]))
|
import cassandra.cqlengine.connection
|
||||||
repo = "{{'class':'{}','replication_factor':{}}}".format(strategy,replication)
|
import cassandra.cqlengine.management as man
|
||||||
query = "CREATE KEYSPACE {} WITH replication = {}".format(ctx.obj["cassandra_keyspace"],repo)
|
|
||||||
print(query)
|
|
||||||
session.execute(query)
|
|
||||||
session.set_keyspace(ctx.obj["cassandra_keyspace"])
|
session.set_keyspace(ctx.obj["cassandra_keyspace"])
|
||||||
|
cassandra.cqlengine.connection.set_session(session)
|
||||||
|
keyspace = ctx.obj["cassandra_keyspace"]
|
||||||
|
man.drop_keyspace(keyspace)
|
||||||
|
man.create_keyspace_simple(keyspace,replication)
|
||||||
websucker.schema.create_database(ctx.obj["cassandra_keyspace"],session)
|
websucker.schema.create_database(ctx.obj["cassandra_keyspace"],session)
|
||||||
|
|
||||||
@cli.command(help="Print keyspace schema")
|
@cli.command(help="Print keyspace schema")
|
||||||
|
@ -9,7 +9,7 @@ class Links(Model):
|
|||||||
url_query = columns.Text(primary_key=True)
|
url_query = columns.Text(primary_key=True)
|
||||||
url_schema = columns.Text()
|
url_schema = columns.Text()
|
||||||
redirect_target = columns.Text()
|
redirect_target = columns.Text()
|
||||||
link_status = columns.Text()
|
link_status = columns.Text(index=True)
|
||||||
link_originality = columns.Float()
|
link_originality = columns.Float()
|
||||||
body_size = columns.Integer()
|
body_size = columns.Integer()
|
||||||
update_time = columns.DateTime()
|
update_time = columns.DateTime()
|
||||||
@ -17,19 +17,17 @@ class Links(Model):
|
|||||||
|
|
||||||
class DailyLinks(Model):
|
class DailyLinks(Model):
|
||||||
__table_name__ = "daily_links"
|
__table_name__ = "daily_links"
|
||||||
day = columns.Integer(primary_key=True)
|
day = columns.Date(primary_key=True)
|
||||||
domain_name = columns.Text(primary_key=True)
|
domain_name = columns.Text(primary_key=True)
|
||||||
|
link_status = columns.Text(primary_key=True)
|
||||||
url_path = columns.Text(primary_key=True)
|
url_path = columns.Text(primary_key=True)
|
||||||
url_query = columns.Text(primary_key=True)
|
url_query = columns.Text(primary_key=True)
|
||||||
url_schema = columns.Text()
|
|
||||||
redirect_target = columns.Text()
|
|
||||||
link_status = columns.Text()
|
|
||||||
link_originality = columns.Float()
|
link_originality = columns.Float()
|
||||||
body_size = columns.Integer()
|
body_size = columns.Integer()
|
||||||
update_time = columns.DateTime()
|
update_time = columns.DateTime()
|
||||||
|
|
||||||
|
|
||||||
class DomainQuality:
|
class DomainQuality(Model):
|
||||||
__table_name__ = "domain_quality"
|
__table_name__ = "domain_quality"
|
||||||
domain_name = columns.Text(primary_key=True)
|
domain_name = columns.Text(primary_key=True)
|
||||||
day = columns.Date(primary_key=True)
|
day = columns.Date(primary_key=True)
|
||||||
@ -47,7 +45,7 @@ class DomainQuality:
|
|||||||
fetched_count = columns.Integer()
|
fetched_count = columns.Integer()
|
||||||
average_fetched_good_characters = columns.Float()
|
average_fetched_good_characters = columns.Float()
|
||||||
gain_ratio = columns.Float()
|
gain_ratio = columns.Float()
|
||||||
update_time = columns.TimeUUID(static=True) #TIMESTAMP STATIC ,
|
update_time = columns.DateTime(static=True) #TIMESTAMP STATIC ,
|
||||||
#) WITH CLUSTERING ORDER BY (day DESC);
|
#) WITH CLUSTERING ORDER BY (day DESC);
|
||||||
|
|
||||||
class Content(Model):
|
class Content(Model):
|
||||||
@ -84,7 +82,7 @@ class Html(Model):
|
|||||||
content = columns.Text()
|
content = columns.Text()
|
||||||
headers = columns.Text()
|
headers = columns.Text()
|
||||||
agent_version = columns.Text()
|
agent_version = columns.Text()
|
||||||
update_time = columns.Text()
|
update_time = columns.DateTime()
|
||||||
|
|
||||||
class DomainConnections(Model):
|
class DomainConnections(Model):
|
||||||
__table_name__ = "domain_connections"
|
__table_name__ = "domain_connections"
|
||||||
@ -92,10 +90,10 @@ class DomainConnections(Model):
|
|||||||
linked_domain = columns.Text(primary_key=True)
|
linked_domain = columns.Text(primary_key=True)
|
||||||
|
|
||||||
def create_database(keyspace,session):
|
def create_database(keyspace,session):
|
||||||
sync_table(Links,keyspaces=[keyspace],connections=[session])
|
sync_table(Links)
|
||||||
sync_table(DailyLinks,keyspaces=[keyspace],connections=[session])
|
sync_table(DailyLinks)
|
||||||
sync_table(DomainQuality,keyspaces=[keyspace],connections=[session])
|
sync_table(DomainQuality)
|
||||||
sync_table(Content,keyspaces=[keyspace],connections=[session])
|
sync_table(Content)
|
||||||
sync_table(ParagraphChecksums,keyspaces=[keyspace],connections=[session])
|
sync_table(ParagraphChecksums)
|
||||||
sync_table(Html,keyspaces=[keyspace],connections=[session])
|
sync_table(Html)
|
||||||
sync_table(DomainConnections,keyspaces=[keyspace],connections=[session])
|
sync_table(DomainConnections)
|
||||||
|
@ -23,9 +23,9 @@ CREATE INDEX link_status_index ON links(link_status);
|
|||||||
CREATE TABLE daily_links (
|
CREATE TABLE daily_links (
|
||||||
day DATE,
|
day DATE,
|
||||||
domain_name TEXT,
|
domain_name TEXT,
|
||||||
|
link_status TEXT,
|
||||||
url_path TEXT,
|
url_path TEXT,
|
||||||
url_query TEXT,
|
url_query TEXT,
|
||||||
link_status TEXT,
|
|
||||||
body_size INT,
|
body_size INT,
|
||||||
link_originality FLOAT,
|
link_originality FLOAT,
|
||||||
update_time TIMESTAMP,
|
update_time TIMESTAMP,
|
||||||
|
Loading…
Reference in New Issue
Block a user