diff --git a/Dockerfile b/Dockerfile index 12232fc..f789c39 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,16 +1,16 @@ -FROM python:3.8 +FROM debian:11 -RUN apt-get update && apt-get install -y git curl libcurl4-openssl-dev build-essential vim libssl-dev python-pip - -RUN pip2 install cqlsh +RUN apt-get update && apt-get install -y git curl libcurl4-openssl-dev build-essential vim libssl-dev python2 python-dev wget python3 python3-pip +RUN wget https://bootstrap.pypa.io/pip/2.7/get-pip.py && python2 get-pip.py +RUN python2 -m pip install cqlsh RUN addgroup appgroup && \ adduser appuser && adduser appuser appgroup RUN mkdir /app /src ADD requirements.txt /src/ -RUN CASS_DRIVER_BUILD_CONCURRENCY=4 pip install -r /src/requirements.txt +RUN CASS_DRIVER_BUILD_CONCURRENCY=4 pip3 install -r /src/requirements.txt WORKDIR /src ADD . /src -RUN python /src/setup.py install +RUN pip3 install /src/ WORKDIR /app diff --git a/build-docker.sh b/build-docker.sh new file mode 100755 index 0000000..371932a --- /dev/null +++ b/build-docker.sh @@ -0,0 +1 @@ +docker build . -t dr.kemt.fei.tuke.sk/websucker:dev diff --git a/websucker/cli.py b/websucker/cli.py index db149c4..ff33dd0 100644 --- a/websucker/cli.py +++ b/websucker/cli.py @@ -154,6 +154,13 @@ def check(ctx,domain): res = db.check_domain(domain) print(res) +@cli.command(help="Export domain as JSON doc per line") +@click.pass_context +@click.argument("domain") +def tojson(ctx,domain): + db = create_database_from_context(ctx) + db.export_domain(domain) + @cli.command(help="Print daily report") @click.pass_context def report(ctx): diff --git a/websucker/db.py b/websucker/db.py index 03f51c2..aba0a20 100644 --- a/websucker/db.py +++ b/websucker/db.py @@ -1,5 +1,6 @@ import cassandra import cassandra.cluster +import cassandra.query import random import os import pkg_resources @@ -7,6 +8,7 @@ import datetime from websucker.parser import normalize_link,urlunparse import collections import math +import json VERSION = "sucker6" @@ -21,7 +23,7 @@ class Data: Database of text documents """ def __init__(self,keyspace="websucker",cassandra_host="127.0.0.1",cassandra_port=9042): - print("Database {}@{}:{}".format(keyspace,cassandra_host, cassandra_port)) + #print("Database {}@{}:{}".format(keyspace,cassandra_host, cassandra_port)) # execution profile ep = cassandra.cluster.ExecutionProfile(request_timeout=240.0) profiles = {cassandra.cluster.EXEC_PROFILE_DEFAULT:ep} @@ -538,6 +540,11 @@ INSERT INTO content( if r < link_weight: result.append((d,0)) return result + + def export_domain(self,domain): + rows = self.session.execute("SELECT JSON * from content WHERE domain_name=%s",(domain,)) + for row in rows: + print(row[0]) def get_visit_links(self,domain,recent_count,old_count,random_count): dblinks = []