zz

2020-05-13 15:20:20 +02:00 · 2020-05-13 15:20:20 +02:00 · 52118b38d3
commit 52118b38d3
parent ae760c8e7c
4 changed files with 46 additions and 3 deletions
--- a/README.md
+++ b/README.md
@ -1 +1,33 @@
 # Websucker
+
+Agent for Sucking of the Web
+
+## Features
+
+- Crawling of best domains
+- Crawling of unvisited domains
+- Text mining
+- Evaluation of domains
+- Daily report
+- Database Summary
+
+## Requirements
+
+- Python 3
+- A running Cassandra 3.11 instance
+- Optionally, Beanstalkd for the work queue
+
+## Installation
+
+Create and activate a virtual environment:
+
+    python -m virtualenv ./venv
+    source ./venv/bin/activate
+
+Install package:
+
+    pip install https://git.kemt.fei.tuke.sk/dano/websucker-pip/archive/master.zip
+
+## Usage
+
+    websuck --help
--- a/websucker/agent.py
+++ b/websucker/agent.py
@ -226,7 +226,8 @@ class Connection:
                # 7 Connection refused
                link_status = "bad_connection"
            else:
-                raise e
+                link_status = "bad_connection"
+                #raise e
        except UnicodeDecodeError as e:
            content = None
            link_status = "bad_unicode"
--- a/websucker/cli.py
+++ b/websucker/cli.py
@ -135,6 +135,7 @@ def report(ctx):
        stats = q.stats_tube(ctx.obj["beanstalkd_tube"])
        buried = stats["current-jobs-buried"]
        ready = stats["current-jobs-ready"]
+        print("queue {} at {}:{}".format(ctx.obj["beanstalkd_tube"],ctx.obj["beanstalkd_host"],ctx.obj["beanstalkd_port"]))
        print("{} ready jobs, {} burried jobs".format(ready,buried))
    except Error as err:
        print(err)
--- a/websucker/db.py
+++ b/websucker/db.py
@ -205,7 +205,10 @@ INSERT INTO content(
            domains.append(list(row))
        total_count = 0
        total_size = 0
-        for domain,count in sorted(domains,key=lambda x:x[1]):
+        out = []
+        for domain,count in domains:
+            if count < 2:
+                continue
            total_count += count
            rows = self.session.execute("SELECT link_status,count(link_status),sum(body_size) FROM daily_links WHERE day=toDate(now()) AND domain_name=%s GROUP BY day,domain_name,link_status",(domain,))
            gc = 0
@ -215,7 +218,13 @@ INSERT INTO content(
                    gc = row[1]
                    bs = row[2]
                    total_size += bs
-            print(domain,gc/count,bs,count)
+            out.append((domain,bs,gc/count,count))
+        print("Domain, characters,good ratio,documents")
+        for i,value in enumerate(reversed(sorted(out,key=lambda x: x[3]))):
+            if i < 20:
+                print(value)
+                #print("{},{},{},{}".format(value))
+                
        print("{} domains, {} documents, {} characters ".format(len(domains),total_count,total_size))

    def index_follow_links(self,parser,links,connection):