zz
This commit is contained in:
parent
ae760c8e7c
commit
52118b38d3
32
README.md
32
README.md
@ -1 +1,33 @@
|
||||
# Websucker
|
||||
|
||||
Agent for Sucking of the Web
|
||||
|
||||
## Features
|
||||
|
||||
- Crawling of best domains
|
||||
- Crawling of unvisited domains
|
||||
- Text mining
|
||||
- Evaluation of domains
|
||||
- Daily report
|
||||
- Database Summary
|
||||
|
||||
## Requirements
|
||||
|
||||
- Python 3
|
||||
- A running Cassandra 3.11 instance
|
||||
- Beanstalkd for the work queue (optional)
|
||||
|
||||
## Installation
|
||||
|
||||
Create and activate a virtual environment:
|
||||
|
||||
python -m virtualenv ./venv
|
||||
source ./venv/bin/activate
|
||||
|
||||
Install package:
|
||||
|
||||
pip install https://git.kemt.fei.tuke.sk/dano/websucker-pip/archive/master.zip
|
||||
|
||||
## Usage
|
||||
|
||||
websuck --help
|
||||
|
@ -226,7 +226,8 @@ class Connection:
|
||||
# 7 Connection refused
|
||||
link_status = "bad_connection"
|
||||
else:
|
||||
raise e
|
||||
link_status = "bad_connection"
|
||||
#raise e
|
||||
except UnicodeDecodeError as e:
|
||||
content = None
|
||||
link_status = "bad_unicode"
|
||||
|
@ -135,6 +135,7 @@ def report(ctx):
|
||||
stats = q.stats_tube(ctx.obj["beanstalkd_tube"])
|
||||
buried = stats["current-jobs-buried"]
|
||||
ready = stats["current-jobs-ready"]
|
||||
print("queue {} at {}:{}".format(ctx.obj["beanstalkd_tube"],ctx.obj["beanstalkd_host"],ctx.obj["beanstalkd_port"]))
|
||||
print("{} ready jobs, {} burried jobs".format(ready,buried))
|
||||
except Error as err:
|
||||
print(err)
|
||||
|
@ -205,7 +205,10 @@ INSERT INTO content(
|
||||
domains.append(list(row))
|
||||
total_count = 0
|
||||
total_size = 0
|
||||
for domain,count in sorted(domains,key=lambda x:x[1]):
|
||||
out = []
|
||||
for domain,count in domains:
|
||||
if count < 2:
|
||||
continue
|
||||
total_count += count
|
||||
rows = self.session.execute("SELECT link_status,count(link_status),sum(body_size) FROM daily_links WHERE day=toDate(now()) AND domain_name=%s GROUP BY day,domain_name,link_status",(domain,))
|
||||
gc = 0
|
||||
@ -215,7 +218,13 @@ INSERT INTO content(
|
||||
gc = row[1]
|
||||
bs = row[2]
|
||||
total_size += bs
|
||||
print(domain,gc/count,bs,count)
|
||||
out.append((domain,bs,gc/count,count))
|
||||
print("Domain, characters,good ratio,documents")
|
||||
for i,value in enumerate(reversed(sorted(out,key=lambda x: x[3]))):
|
||||
if i < 20:
|
||||
print(value)
|
||||
#print("{},{},{},{}".format(value))
|
||||
|
||||
print("{} domains, {} documents, {} characters ".format(len(domains),total_count,total_size))
|
||||
|
||||
def index_follow_links(self,parser,links,connection):
|
||||
|
Loading…
Reference in New Issue
Block a user