Daniel Hladek 2020-05-13 15:20:20 +02:00
parent ae760c8e7c
commit 52118b38d3
4 changed files with 46 additions and 3 deletions


@@ -1 +1,33 @@
 # Websucker
+Agent for sucking the Web
+
+## Features
+- Crawling of best domains
+- Crawling of unvisited domains
+- Text mining
+- Evaluation of domains
+- Daily report
+- Database summary
+
+## Requirements
+- Python 3
+- a running Cassandra 3.11 instance
+- optional: Beanstalkd for the work queue
+
+## Installation
+Activate a virtual environment:
+
+    python -m virtualenv ./venv
+    source ./venv/bin/activate
+
+Install the package:
+
+    pip install https://git.kemt.fei.tuke.sk/dano/websucker-pip/archive/master.zip
+
+## Usage
+
+    websuck --help

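Before running the crawler it can help to verify that the two required services are actually reachable. The sketch below is not part of this commit; the hosts, ports, the use of the cassandra-driver package, and the raw TCP check for Beanstalkd are all illustrative assumptions.

```python
# Hypothetical pre-flight check for the README's requirements.
# Assumes cassandra-driver is installed and default ports are in use.
import socket

from cassandra.cluster import Cluster  # pip install cassandra-driver

def check_cassandra(host="127.0.0.1"):
    # Connecting without a keyspace is enough to prove the node is up.
    cluster = Cluster([host])
    session = cluster.connect()
    row = session.execute("SELECT release_version FROM system.local").one()
    print("Cassandra release:", row[0])
    cluster.shutdown()

def check_beanstalkd(host="127.0.0.1", port=11300):
    # Beanstalkd speaks a plain-text protocol; a TCP connect suffices here.
    with socket.create_connection((host, port), timeout=5):
        print("Beanstalkd reachable at {}:{}".format(host, port))

if __name__ == "__main__":
    check_cassandra()
    check_beanstalkd()
```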

@@ -226,7 +226,8 @@ class Connection:
                 # 7 Connection refused
                 link_status = "bad_connection"
             else:
-                raise e
+                link_status = "bad_connection"
+                #raise e
         except UnicodeDecodeError as e:
             content = None
             link_status = "bad_unicode"

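The change above swaps an unhandled `raise` for a recorded status, so a single refused connection no longer aborts the whole crawl. A minimal sketch of that pattern, using `urllib` purely for illustration (the project's real `Connection` class wraps more logic):

```python
# Minimal sketch of the "record, don't raise" pattern from this hunk.
# urllib stands in for the project's actual fetch code.
import urllib.error
import urllib.request

def fetch(url, timeout=10):
    """Return (content, link_status) and never raise on I/O problems."""
    content = None
    try:
        with urllib.request.urlopen(url, timeout=timeout) as resp:
            content = resp.read().decode("utf-8")
        link_status = "good"
    except urllib.error.URLError:
        # Refused connections, DNS failures, and timeouts all land here.
        link_status = "bad_connection"
    except UnicodeDecodeError:
        # The page bytes were not valid UTF-8.
        link_status = "bad_unicode"
    return content, link_status
```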

@@ -135,6 +135,7 @@ def report(ctx):
         stats = q.stats_tube(ctx.obj["beanstalkd_tube"])
         buried = stats["current-jobs-buried"]
         ready = stats["current-jobs-ready"]
+        print("queue {} at {}:{}".format(ctx.obj["beanstalkd_tube"],ctx.obj["beanstalkd_host"],ctx.obj["beanstalkd_port"]))
         print("{} ready jobs, {} buried jobs".format(ready,buried))
     except Error as err:
         print(err)

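The added `print` names the tube and host being inspected; `current-jobs-ready` and `current-jobs-buried` are standard fields of Beanstalkd's `stats-tube` response. A hedged sketch of the same report with a beanstalkc-style client, where the host, port, and tube name are made up:

```python
# Hedged sketch of reading tube statistics, assuming a beanstalkc-style
# client; the project's actual queue object may be configured differently.
import beanstalkc  # assumption: pip install beanstalkc (or a compatible fork)

host, port, tube = "127.0.0.1", 11300, "default"  # illustrative values
q = beanstalkc.Connection(host=host, port=port)
stats = q.stats_tube(tube)  # dict of beanstalkd stats-tube fields
print("queue {} at {}:{}".format(tube, host, port))
print("{} ready jobs, {} buried jobs".format(
    stats["current-jobs-ready"], stats["current-jobs-buried"]))
q.close()
```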

@@ -205,7 +205,10 @@ INSERT INTO content(
             domains.append(list(row))
         total_count = 0
         total_size = 0
-        for domain,count in sorted(domains,key=lambda x:x[1]):
+        out = []
+        for domain,count in domains:
+            if count < 2:
+                continue
             total_count += count
             rows = self.session.execute("SELECT link_status,count(link_status),sum(body_size) FROM daily_links WHERE day=toDate(now()) AND domain_name=%s GROUP BY day,domain_name,link_status",(domain,))
             gc = 0
@@ -215,7 +218,13 @@ INSERT INTO content(
                 gc = row[1]
                 bs = row[2]
                 total_size += bs
-            print(domain,gc/count,bs,count)
+            out.append((domain,bs,gc/count,count))
+        print("Domain, characters, good ratio, documents")
+        for i,value in enumerate(reversed(sorted(out,key=lambda x: x[3]))):
+            if i < 20:
+                print(value)
+                #print("{},{},{},{}".format(value))
         print("{} domains, {} documents, {} characters ".format(len(domains),total_count,total_size))
     def index_follow_links(self,parser,links,connection):
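Together, the two hunks turn the summary from a dump of every domain into a top-20 report: domains with fewer than two documents are skipped, each surviving row is collected into `out` as `(domain, characters, good_ratio, documents)`, and the list is printed from the highest document count down (`reversed(sorted(...))` is equivalent to sorting with `reverse=True`). A self-contained sketch of that logic on made-up sample data:

```python
# Self-contained sketch of the new report logic with made-up sample data.
# Tuple layout matches the diff: (domain, characters, good_ratio, documents).
domains = [
    ("example.org", 40),  # (domain_name, document_count) -- illustrative
    ("example.net", 1),   # skipped: fewer than 2 documents
    ("example.com", 25),
]
out = []
for domain, count in domains:
    if count < 2:
        continue
    # Stand-ins for the per-domain Cassandra lookup in the real code.
    characters, good_ratio = 1000 * count, 0.8
    out.append((domain, characters, good_ratio, count))

print("Domain, characters, good ratio, documents")
for i, value in enumerate(sorted(out, key=lambda x: x[3], reverse=True)):
    if i < 20:  # only the 20 domains with the most documents
        print(value)
```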