zz
This commit is contained in:
parent
ae760c8e7c
commit
52118b38d3
32
README.md
32
README.md
@ -1 +1,33 @@
|
|||||||
# Websucker
|
# Websucker
|
||||||
|
|
||||||
|
Agent for Sucking of the Web
|
||||||
|
|
||||||
|
## Features
|
||||||
|
|
||||||
|
- Crawling of best domains
|
||||||
|
- Crawling of unvisited domains
|
||||||
|
- Text mining
|
||||||
|
- Evaluation of domains
|
||||||
|
- Daily report
|
||||||
|
- Database Summary
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
- Python 3
|
||||||
|
- running Cassandra 3.11
|
||||||
|
- Beanstalkd (optional) for the work queue
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
Create and activate a virtual environment:
|
||||||
|
|
||||||
|
python -m virtualenv ./venv
|
||||||
|
source ./venv/bin/activate
|
||||||
|
|
||||||
|
Install package:
|
||||||
|
|
||||||
|
pip install https://git.kemt.fei.tuke.sk/dano/websucker-pip/archive/master.zip
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
websuck --help
|
||||||
|
@ -226,7 +226,8 @@ class Connection:
|
|||||||
# 7 Connection refused
|
# 7 Connection refused
|
||||||
link_status = "bad_connection"
|
link_status = "bad_connection"
|
||||||
else:
|
else:
|
||||||
raise e
|
link_status = "bad_connection"
|
||||||
|
#raise e
|
||||||
except UnicodeDecodeError as e:
|
except UnicodeDecodeError as e:
|
||||||
content = None
|
content = None
|
||||||
link_status = "bad_unicode"
|
link_status = "bad_unicode"
|
||||||
|
@ -135,6 +135,7 @@ def report(ctx):
|
|||||||
stats = q.stats_tube(ctx.obj["beanstalkd_tube"])
|
stats = q.stats_tube(ctx.obj["beanstalkd_tube"])
|
||||||
buried = stats["current-jobs-buried"]
|
buried = stats["current-jobs-buried"]
|
||||||
ready = stats["current-jobs-ready"]
|
ready = stats["current-jobs-ready"]
|
||||||
|
print("queue {} at {}:{}".format(ctx.obj["beanstalkd_tube"],ctx.obj["beanstalkd_host"],ctx.obj["beanstalkd_port"]))
|
||||||
print("{} ready jobs, {} burried jobs".format(ready,buried))
|
print("{} ready jobs, {} burried jobs".format(ready,buried))
|
||||||
except Error as err:
|
except Error as err:
|
||||||
print(err)
|
print(err)
|
||||||
|
@ -205,7 +205,10 @@ INSERT INTO content(
|
|||||||
domains.append(list(row))
|
domains.append(list(row))
|
||||||
total_count = 0
|
total_count = 0
|
||||||
total_size = 0
|
total_size = 0
|
||||||
for domain,count in sorted(domains,key=lambda x:x[1]):
|
out = []
|
||||||
|
for domain,count in domains:
|
||||||
|
if count < 2:
|
||||||
|
continue
|
||||||
total_count += count
|
total_count += count
|
||||||
rows = self.session.execute("SELECT link_status,count(link_status),sum(body_size) FROM daily_links WHERE day=toDate(now()) AND domain_name=%s GROUP BY day,domain_name,link_status",(domain,))
|
rows = self.session.execute("SELECT link_status,count(link_status),sum(body_size) FROM daily_links WHERE day=toDate(now()) AND domain_name=%s GROUP BY day,domain_name,link_status",(domain,))
|
||||||
gc = 0
|
gc = 0
|
||||||
@ -215,7 +218,13 @@ INSERT INTO content(
|
|||||||
gc = row[1]
|
gc = row[1]
|
||||||
bs = row[2]
|
bs = row[2]
|
||||||
total_size += bs
|
total_size += bs
|
||||||
print(domain,gc/count,bs,count)
|
out.append((domain,bs,gc/count,count))
|
||||||
|
print("Domain, characters,good ratio,documents")
|
||||||
|
for i,value in enumerate(reversed(sorted(out,key=lambda x: x[3]))):
|
||||||
|
if i < 20:
|
||||||
|
print(value)
|
||||||
|
#print("{},{},{},{}".format(value))
|
||||||
|
|
||||||
print("{} domains, {} documents, {} characters ".format(len(domains),total_count,total_size))
|
print("{} domains, {} documents, {} characters ".format(len(domains),total_count,total_size))
|
||||||
|
|
||||||
def index_follow_links(self,parser,links,connection):
|
def index_follow_links(self,parser,links,connection):
|
||||||
|
Loading…
Reference in New Issue
Block a user