This commit is contained in:
Daniel Hládek 2023-02-26 14:10:58 +01:00
parent a22fa87537
commit 5610dc0f87
4 changed files with 22 additions and 7 deletions

View File

@ -1,16 +1,16 @@
FROM python:3.8 FROM debian:11
RUN apt-get update && apt-get install -y git curl libcurl4-openssl-dev build-essential vim libssl-dev python-pip RUN apt-get update && apt-get install -y git curl libcurl4-openssl-dev build-essential vim libssl-dev python2 python-dev wget python3 python3-pip
RUN wget https://bootstrap.pypa.io/pip/2.7/get-pip.py && python2 get-pip.py
RUN pip2 install cqlsh RUN python2 -m pip install cqlsh
RUN addgroup appgroup && \ RUN addgroup appgroup && \
adduser appuser && adduser appuser appgroup adduser appuser && adduser appuser appgroup
RUN mkdir /app /src RUN mkdir /app /src
ADD requirements.txt /src/ ADD requirements.txt /src/
RUN CASS_DRIVER_BUILD_CONCURRENCY=4 pip install -r /src/requirements.txt RUN CASS_DRIVER_BUILD_CONCURRENCY=4 pip3 install -r /src/requirements.txt
WORKDIR /src WORKDIR /src
ADD . /src ADD . /src
RUN python /src/setup.py install RUN pip3 install /src/
WORKDIR /app WORKDIR /app

1
build-docker.sh Executable file
View File

@ -0,0 +1 @@
docker build . -t dr.kemt.fei.tuke.sk/websucker:dev

View File

@ -154,6 +154,13 @@ def check(ctx,domain):
res = db.check_domain(domain) res = db.check_domain(domain)
print(res) print(res)
@cli.command(help="Export domain as JSON doc per line")
@click.pass_context
@click.argument("domain")
def tojson(ctx,domain):
    """
    CLI entry point: dump the stored content of one domain.

    Builds the database object from the click context and delegates to
    its export_domain method, which does the actual printing.
    """
    # No intermediate variable needed: the database object is used once.
    create_database_from_context(ctx).export_domain(domain)
@cli.command(help="Print daily report") @cli.command(help="Print daily report")
@click.pass_context @click.pass_context
def report(ctx): def report(ctx):

View File

@ -1,5 +1,6 @@
import cassandra import cassandra
import cassandra.cluster import cassandra.cluster
import cassandra.query
import random import random
import os import os
import pkg_resources import pkg_resources
@ -7,6 +8,7 @@ import datetime
from websucker.parser import normalize_link,urlunparse from websucker.parser import normalize_link,urlunparse
import collections import collections
import math import math
import json
VERSION = "sucker6" VERSION = "sucker6"
@ -21,7 +23,7 @@ class Data:
Database of text documents Database of text documents
""" """
def __init__(self,keyspace="websucker",cassandra_host="127.0.0.1",cassandra_port=9042): def __init__(self,keyspace="websucker",cassandra_host="127.0.0.1",cassandra_port=9042):
print("Database {}@{}:{}".format(keyspace,cassandra_host, cassandra_port)) #print("Database {}@{}:{}".format(keyspace,cassandra_host, cassandra_port))
# execution profile # execution profile
ep = cassandra.cluster.ExecutionProfile(request_timeout=240.0) ep = cassandra.cluster.ExecutionProfile(request_timeout=240.0)
profiles = {cassandra.cluster.EXEC_PROFILE_DEFAULT:ep} profiles = {cassandra.cluster.EXEC_PROFILE_DEFAULT:ep}
@ -539,6 +541,11 @@ INSERT INTO content(
result.append((d,0)) result.append((d,0))
return result return result
def export_domain(self,domain):
    """
    Print every row of the content table for the given domain.

    The query uses SELECT JSON, so the first column of each returned row
    is what gets printed, one row per output line. Nothing is returned.
    """
    query = "SELECT JSON * from content WHERE domain_name=%s"
    for record in self.session.execute(query,(domain,)):
        # Only the first column of the row is emitted.
        print(record[0])
def get_visit_links(self,domain,recent_count,old_count,random_count): def get_visit_links(self,domain,recent_count,old_count,random_count):
dblinks = [] dblinks = []
rows = self.session.execute("SELECT url_schema,url_path,url_query,update_time FROM links Where domain_name=%s AND link_status='seen'",(domain,)) rows = self.session.execute("SELECT url_schema,url_path,url_query,update_time FROM links Where domain_name=%s AND link_status='seen'",(domain,))