zz
This commit is contained in:
		
							parent
							
								
									135fe2b1f6
								
							
						
					
					
						commit
						30d51944f4
					
				@ -306,6 +306,7 @@ class ParsedDocument:
 | 
				
			|||||||
        self.article_published_time = None
 | 
					        self.article_published_time = None
 | 
				
			||||||
        self.current_time = datetime.date.today()
 | 
					        self.current_time = datetime.date.today()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def extract(self,content,bs):
 | 
					    def extract(self,content,bs):
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        Parse content and fill the object
 | 
					        Parse content and fill the object
 | 
				
			||||||
@ -321,7 +322,7 @@ class ParsedDocument:
 | 
				
			|||||||
        self.paragraph_sizes = pszs
 | 
					        self.paragraph_sizes = pszs
 | 
				
			||||||
        if bs is  None:
 | 
					        if bs is  None:
 | 
				
			||||||
            return
 | 
					            return
 | 
				
			||||||
        self.tags,self.authors,self.title,self.article_publilshed_time, self.description,self.section = self.parser.extract_meta(bs)
 | 
					        self.tags,self.authors,self.title,self.article_published_time, self.description,self.section = self.parser.extract_meta(bs)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # Extrakcia linkov zo stranky
 | 
					        # Extrakcia linkov zo stranky
 | 
				
			||||||
        base = self.work_link
 | 
					        base = self.work_link
 | 
				
			||||||
@ -361,6 +362,8 @@ class ParsedDocument:
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    def __str__(self):
 | 
					    def __str__(self):
 | 
				
			||||||
        r = []
 | 
					        r = []
 | 
				
			||||||
 | 
					        if self.authors is not None:
 | 
				
			||||||
 | 
					            r.append(",".join(self.authors))
 | 
				
			||||||
        if self.title is not None:
 | 
					        if self.title is not None:
 | 
				
			||||||
            r.append(self.title)
 | 
					            r.append(self.title)
 | 
				
			||||||
        if self.body is not None:
 | 
					        if self.body is not None:
 | 
				
			||||||
 | 
				
			|||||||
@ -193,16 +193,16 @@ def summary(ctx):
 | 
				
			|||||||
@cli.command(help="Create database")
 | 
					@cli.command(help="Create database")
 | 
				
			||||||
@click.pass_context
 | 
					@click.pass_context
 | 
				
			||||||
@click.argument("replication",default=1)
 | 
					@click.argument("replication",default=1)
 | 
				
			||||||
@click.argument("strategy",default="SimpleStrategy")
 | 
					def create_database(ctx,replication):
 | 
				
			||||||
def create_database(ctx,replication,strategy):
 | 
					 | 
				
			||||||
    cluster = websucker.db.connect_cluster(ctx.obj["cassandra_host"],ctx.obj["cassandra_port"],ctx.obj["cassandra_username"],ctx.obj["cassandra_password"])
 | 
					    cluster = websucker.db.connect_cluster(ctx.obj["cassandra_host"],ctx.obj["cassandra_port"],ctx.obj["cassandra_username"],ctx.obj["cassandra_password"])
 | 
				
			||||||
    with cluster.connect() as session:
 | 
					    with cluster.connect() as session:
 | 
				
			||||||
        session.execute("DROP keyspace IF EXISTS {}".format(ctx.obj["cassandra_keyspace"]))
 | 
					        import cassandra.cqlengine.connection
 | 
				
			||||||
        repo = "{{'class':'{}','replication_factor':{}}}".format(strategy,replication)
 | 
					        import cassandra.cqlengine.management as man
 | 
				
			||||||
        query = "CREATE KEYSPACE {} WITH replication = {}".format(ctx.obj["cassandra_keyspace"],repo)
 | 
					 | 
				
			||||||
        print(query)
 | 
					 | 
				
			||||||
        session.execute(query)
 | 
					 | 
				
			||||||
        session.set_keyspace(ctx.obj["cassandra_keyspace"])
 | 
					        session.set_keyspace(ctx.obj["cassandra_keyspace"])
 | 
				
			||||||
 | 
					        cassandra.cqlengine.connection.set_session(session)
 | 
				
			||||||
 | 
					        keyspace = ctx.obj["cassandra_keyspace"]
 | 
				
			||||||
 | 
					        man.drop_keyspace(keyspace)
 | 
				
			||||||
 | 
					        man.create_keyspace_simple(keyspace,replication)
 | 
				
			||||||
        websucker.schema.create_database(ctx.obj["cassandra_keyspace"],session)
 | 
					        websucker.schema.create_database(ctx.obj["cassandra_keyspace"],session)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@cli.command(help="Print keyspace schema")
 | 
					@cli.command(help="Print keyspace schema")
 | 
				
			||||||
 | 
				
			|||||||
@ -9,7 +9,7 @@ class Links(Model):
 | 
				
			|||||||
    url_query = columns.Text(primary_key=True)
 | 
					    url_query = columns.Text(primary_key=True)
 | 
				
			||||||
    url_schema = columns.Text()
 | 
					    url_schema = columns.Text()
 | 
				
			||||||
    redirect_target = columns.Text()
 | 
					    redirect_target = columns.Text()
 | 
				
			||||||
    link_status = columns.Text()
 | 
					    link_status = columns.Text(index=True)
 | 
				
			||||||
    link_originality = columns.Float()
 | 
					    link_originality = columns.Float()
 | 
				
			||||||
    body_size = columns.Integer()
 | 
					    body_size = columns.Integer()
 | 
				
			||||||
    update_time = columns.DateTime()
 | 
					    update_time = columns.DateTime()
 | 
				
			||||||
@ -17,19 +17,17 @@ class Links(Model):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
class DailyLinks(Model):
 | 
					class DailyLinks(Model):
 | 
				
			||||||
    __table_name__ = "daily_links"
 | 
					    __table_name__ = "daily_links"
 | 
				
			||||||
    day = columns.Integer(primary_key=True)
 | 
					    day = columns.Date(primary_key=True)
 | 
				
			||||||
    domain_name = columns.Text(primary_key=True)
 | 
					    domain_name = columns.Text(primary_key=True)
 | 
				
			||||||
 | 
					    link_status = columns.Text(primary_key=True)
 | 
				
			||||||
    url_path = columns.Text(primary_key=True)
 | 
					    url_path = columns.Text(primary_key=True)
 | 
				
			||||||
    url_query = columns.Text(primary_key=True)
 | 
					    url_query = columns.Text(primary_key=True)
 | 
				
			||||||
    url_schema = columns.Text()
 | 
					 | 
				
			||||||
    redirect_target = columns.Text()
 | 
					 | 
				
			||||||
    link_status = columns.Text()
 | 
					 | 
				
			||||||
    link_originality = columns.Float()
 | 
					    link_originality = columns.Float()
 | 
				
			||||||
    body_size = columns.Integer()
 | 
					    body_size = columns.Integer()
 | 
				
			||||||
    update_time = columns.DateTime()
 | 
					    update_time = columns.DateTime()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class DomainQuality:
 | 
					class DomainQuality(Model):
 | 
				
			||||||
    __table_name__ = "domain_quality"
 | 
					    __table_name__ = "domain_quality"
 | 
				
			||||||
    domain_name = columns.Text(primary_key=True)
 | 
					    domain_name = columns.Text(primary_key=True)
 | 
				
			||||||
    day = columns.Date(primary_key=True)
 | 
					    day = columns.Date(primary_key=True)
 | 
				
			||||||
@ -47,7 +45,7 @@ class DomainQuality:
 | 
				
			|||||||
    fetched_count = columns.Integer()
 | 
					    fetched_count = columns.Integer()
 | 
				
			||||||
    average_fetched_good_characters = columns.Float()
 | 
					    average_fetched_good_characters = columns.Float()
 | 
				
			||||||
    gain_ratio = columns.Float()
 | 
					    gain_ratio = columns.Float()
 | 
				
			||||||
    update_time = columns.TimeUUID(static=True) #TIMESTAMP STATIC ,
 | 
					    update_time = columns.DateTime(static=True) #TIMESTAMP STATIC ,
 | 
				
			||||||
#) WITH CLUSTERING ORDER BY (day DESC);
 | 
					#) WITH CLUSTERING ORDER BY (day DESC);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class Content(Model):
 | 
					class Content(Model):
 | 
				
			||||||
@ -84,7 +82,7 @@ class Html(Model):
 | 
				
			|||||||
    content = columns.Text()
 | 
					    content = columns.Text()
 | 
				
			||||||
    headers = columns.Text()
 | 
					    headers = columns.Text()
 | 
				
			||||||
    agent_version = columns.Text()
 | 
					    agent_version = columns.Text()
 | 
				
			||||||
    update_time = columns.Text()
 | 
					    update_time = columns.DateTime()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class DomainConnections(Model):
 | 
					class DomainConnections(Model):
 | 
				
			||||||
    __table_name__ = "domain_connections"
 | 
					    __table_name__ = "domain_connections"
 | 
				
			||||||
@ -92,10 +90,10 @@ class DomainConnections(Model):
 | 
				
			|||||||
    linked_domain = columns.Text(primary_key=True)
 | 
					    linked_domain = columns.Text(primary_key=True)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def create_database(keyspace,session):
 | 
					def create_database(keyspace,session):
 | 
				
			||||||
    sync_table(Links,keyspaces=[keyspace],connections=[session])
 | 
					    sync_table(Links)
 | 
				
			||||||
    sync_table(DailyLinks,keyspaces=[keyspace],connections=[session])
 | 
					    sync_table(DailyLinks)
 | 
				
			||||||
    sync_table(DomainQuality,keyspaces=[keyspace],connections=[session])
 | 
					    sync_table(DomainQuality)
 | 
				
			||||||
    sync_table(Content,keyspaces=[keyspace],connections=[session])
 | 
					    sync_table(Content)
 | 
				
			||||||
    sync_table(ParagraphChecksums,keyspaces=[keyspace],connections=[session])
 | 
					    sync_table(ParagraphChecksums)
 | 
				
			||||||
    sync_table(Html,keyspaces=[keyspace],connections=[session])
 | 
					    sync_table(Html)
 | 
				
			||||||
    sync_table(DomainConnections,keyspaces=[keyspace],connections=[session])
 | 
					    sync_table(DomainConnections)
 | 
				
			||||||
 | 
				
			|||||||
@ -23,9 +23,9 @@ CREATE INDEX link_status_index ON links(link_status);
 | 
				
			|||||||
CREATE TABLE daily_links (
 | 
					CREATE TABLE daily_links (
 | 
				
			||||||
    day DATE,
 | 
					    day DATE,
 | 
				
			||||||
    domain_name TEXT,
 | 
					    domain_name TEXT,
 | 
				
			||||||
 | 
					    link_status TEXT,
 | 
				
			||||||
    url_path TEXT,
 | 
					    url_path TEXT,
 | 
				
			||||||
    url_query TEXT,
 | 
					    url_query TEXT,
 | 
				
			||||||
    link_status TEXT,
 | 
					 | 
				
			||||||
    body_size INT,
 | 
					    body_size INT,
 | 
				
			||||||
    link_originality FLOAT,
 | 
					    link_originality FLOAT,
 | 
				
			||||||
    update_time TIMESTAMP,
 | 
					    update_time TIMESTAMP,
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
		Reference in New Issue
	
	Block a user