zz
This commit is contained in:
		
							parent
							
								
									135fe2b1f6
								
							
						
					
					
						commit
						30d51944f4
					
				| @ -306,6 +306,7 @@ class ParsedDocument: | |||||||
|         self.article_published_time = None |         self.article_published_time = None | ||||||
|         self.current_time = datetime.date.today() |         self.current_time = datetime.date.today() | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
|     def extract(self,content,bs): |     def extract(self,content,bs): | ||||||
|         """ |         """ | ||||||
|         Parse content and fill the object |         Parse content and fill the object | ||||||
| @ -321,7 +322,7 @@ class ParsedDocument: | |||||||
|         self.paragraph_sizes = pszs |         self.paragraph_sizes = pszs | ||||||
|         if bs is  None: |         if bs is  None: | ||||||
|             return |             return | ||||||
|         self.tags,self.authors,self.title,self.article_publilshed_time, self.description,self.section = self.parser.extract_meta(bs) |         self.tags,self.authors,self.title,self.article_published_time, self.description,self.section = self.parser.extract_meta(bs) | ||||||
| 
 | 
 | ||||||
|         # Extrakcia linkov zo stranky |         # Extrakcia linkov zo stranky | ||||||
|         base = self.work_link |         base = self.work_link | ||||||
| @ -361,6 +362,8 @@ class ParsedDocument: | |||||||
| 
 | 
 | ||||||
|     def __str__(self): |     def __str__(self): | ||||||
|         r = [] |         r = [] | ||||||
|  |         if self.authors is not None: | ||||||
|  |             r.append(",".join(self.authors)) | ||||||
|         if self.title is not None: |         if self.title is not None: | ||||||
|             r.append(self.title) |             r.append(self.title) | ||||||
|         if self.body is not None: |         if self.body is not None: | ||||||
|  | |||||||
| @ -193,16 +193,16 @@ def summary(ctx): | |||||||
| @cli.command(help="Create database") | @cli.command(help="Create database") | ||||||
| @click.pass_context | @click.pass_context | ||||||
| @click.argument("replication",default=1) | @click.argument("replication",default=1) | ||||||
| @click.argument("strategy",default="SimpleStrategy") | def create_database(ctx,replication): | ||||||
| def create_database(ctx,replication,strategy): |  | ||||||
|     cluster = websucker.db.connect_cluster(ctx.obj["cassandra_host"],ctx.obj["cassandra_port"],ctx.obj["cassandra_username"],ctx.obj["cassandra_password"]) |     cluster = websucker.db.connect_cluster(ctx.obj["cassandra_host"],ctx.obj["cassandra_port"],ctx.obj["cassandra_username"],ctx.obj["cassandra_password"]) | ||||||
|     with cluster.connect() as session: |     with cluster.connect() as session: | ||||||
|         session.execute("DROP keyspace IF EXISTS {}".format(ctx.obj["cassandra_keyspace"])) |         import cassandra.cqlengine.connection | ||||||
|         repo = "{{'class':'{}','replication_factor':{}}}".format(strategy,replication) |         import cassandra.cqlengine.management as man | ||||||
|         query = "CREATE KEYSPACE {} WITH replication = {}".format(ctx.obj["cassandra_keyspace"],repo) |  | ||||||
|         print(query) |  | ||||||
|         session.execute(query) |  | ||||||
|         session.set_keyspace(ctx.obj["cassandra_keyspace"]) |         session.set_keyspace(ctx.obj["cassandra_keyspace"]) | ||||||
|  |         cassandra.cqlengine.connection.set_session(session) | ||||||
|  |         keyspace = ctx.obj["cassandra_keyspace"] | ||||||
|  |         man.drop_keyspace(keyspace) | ||||||
|  |         man.create_keyspace_simple(keyspace,replication) | ||||||
|         websucker.schema.create_database(ctx.obj["cassandra_keyspace"],session) |         websucker.schema.create_database(ctx.obj["cassandra_keyspace"],session) | ||||||
| 
 | 
 | ||||||
| @cli.command(help="Print keyspace schema") | @cli.command(help="Print keyspace schema") | ||||||
|  | |||||||
| @ -9,7 +9,7 @@ class Links(Model): | |||||||
|     url_query = columns.Text(primary_key=True) |     url_query = columns.Text(primary_key=True) | ||||||
|     url_schema = columns.Text() |     url_schema = columns.Text() | ||||||
|     redirect_target = columns.Text() |     redirect_target = columns.Text() | ||||||
|     link_status = columns.Text() |     link_status = columns.Text(index=True) | ||||||
|     link_originality = columns.Float() |     link_originality = columns.Float() | ||||||
|     body_size = columns.Integer() |     body_size = columns.Integer() | ||||||
|     update_time = columns.DateTime() |     update_time = columns.DateTime() | ||||||
| @ -17,19 +17,17 @@ class Links(Model): | |||||||
| 
 | 
 | ||||||
| class DailyLinks(Model): | class DailyLinks(Model): | ||||||
|     __table_name__ = "daily_links" |     __table_name__ = "daily_links" | ||||||
|     day = columns.Integer(primary_key=True) |     day = columns.Date(primary_key=True) | ||||||
|     domain_name = columns.Text(primary_key=True) |     domain_name = columns.Text(primary_key=True) | ||||||
|  |     link_status = columns.Text(primary_key=True) | ||||||
|     url_path = columns.Text(primary_key=True) |     url_path = columns.Text(primary_key=True) | ||||||
|     url_query = columns.Text(primary_key=True) |     url_query = columns.Text(primary_key=True) | ||||||
|     url_schema = columns.Text() |  | ||||||
|     redirect_target = columns.Text() |  | ||||||
|     link_status = columns.Text() |  | ||||||
|     link_originality = columns.Float() |     link_originality = columns.Float() | ||||||
|     body_size = columns.Integer() |     body_size = columns.Integer() | ||||||
|     update_time = columns.DateTime() |     update_time = columns.DateTime() | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class DomainQuality: | class DomainQuality(Model): | ||||||
|     __table_name__ = "domain_quality" |     __table_name__ = "domain_quality" | ||||||
|     domain_name = columns.Text(primary_key=True) |     domain_name = columns.Text(primary_key=True) | ||||||
|     day = columns.Date(primary_key=True) |     day = columns.Date(primary_key=True) | ||||||
| @ -47,7 +45,7 @@ class DomainQuality: | |||||||
|     fetched_count = columns.Integer() |     fetched_count = columns.Integer() | ||||||
|     average_fetched_good_characters = columns.Float() |     average_fetched_good_characters = columns.Float() | ||||||
|     gain_ratio = columns.Float() |     gain_ratio = columns.Float() | ||||||
|     update_time = columns.TimeUUID(static=True) #TIMESTAMP STATIC , |     update_time = columns.DateTime(static=True) #TIMESTAMP STATIC , | ||||||
| #) WITH CLUSTERING ORDER BY (day DESC); | #) WITH CLUSTERING ORDER BY (day DESC); | ||||||
| 
 | 
 | ||||||
| class Content(Model): | class Content(Model): | ||||||
| @ -84,7 +82,7 @@ class Html(Model): | |||||||
|     content = columns.Text() |     content = columns.Text() | ||||||
|     headers = columns.Text() |     headers = columns.Text() | ||||||
|     agent_version = columns.Text() |     agent_version = columns.Text() | ||||||
|     update_time = columns.Text() |     update_time = columns.DateTime() | ||||||
| 
 | 
 | ||||||
| class DomainConnections(Model): | class DomainConnections(Model): | ||||||
|     __table_name__ = "domain_connections" |     __table_name__ = "domain_connections" | ||||||
| @ -92,10 +90,10 @@ class DomainConnections(Model): | |||||||
|     linked_domain = columns.Text(primary_key=True) |     linked_domain = columns.Text(primary_key=True) | ||||||
| 
 | 
 | ||||||
| def create_database(keyspace,session): | def create_database(keyspace,session): | ||||||
|     sync_table(Links,keyspaces=[keyspace],connections=[session]) |     sync_table(Links) | ||||||
|     sync_table(DailyLinks,keyspaces=[keyspace],connections=[session]) |     sync_table(DailyLinks) | ||||||
|     sync_table(DomainQuality,keyspaces=[keyspace],connections=[session]) |     sync_table(DomainQuality) | ||||||
|     sync_table(Content,keyspaces=[keyspace],connections=[session]) |     sync_table(Content) | ||||||
|     sync_table(ParagraphChecksums,keyspaces=[keyspace],connections=[session]) |     sync_table(ParagraphChecksums) | ||||||
|     sync_table(Html,keyspaces=[keyspace],connections=[session]) |     sync_table(Html) | ||||||
|     sync_table(DomainConnections,keyspaces=[keyspace],connections=[session]) |     sync_table(DomainConnections) | ||||||
|  | |||||||
| @ -23,9 +23,9 @@ CREATE INDEX link_status_index ON links(link_status); | |||||||
| CREATE TABLE daily_links ( | CREATE TABLE daily_links ( | ||||||
|     day DATE, |     day DATE, | ||||||
|     domain_name TEXT, |     domain_name TEXT, | ||||||
|  |     link_status TEXT, | ||||||
|     url_path TEXT, |     url_path TEXT, | ||||||
|     url_query TEXT, |     url_query TEXT, | ||||||
|     link_status TEXT, |  | ||||||
|     body_size INT, |     body_size INT, | ||||||
|     link_originality FLOAT, |     link_originality FLOAT, | ||||||
|     update_time TIMESTAMP, |     update_time TIMESTAMP, | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user