zz
This commit is contained in:
parent
a22fa87537
commit
5610dc0f87
12
Dockerfile
12
Dockerfile
@ -1,16 +1,16 @@
|
|||||||
FROM python:3.8
|
FROM debian:11
|
||||||
|
|
||||||
RUN apt-get update && apt-get install -y git curl libcurl4-openssl-dev build-essential vim libssl-dev python-pip
|
RUN apt-get update && apt-get install -y git curl libcurl4-openssl-dev build-essential vim libssl-dev python2 python-dev wget python3 python3-pip
|
||||||
|
RUN wget https://bootstrap.pypa.io/pip/2.7/get-pip.py && python2 get-pip.py
|
||||||
RUN pip2 install cqlsh
|
RUN python2 -m pip install cqlsh
|
||||||
|
|
||||||
RUN addgroup appgroup && \
|
RUN addgroup appgroup && \
|
||||||
adduser appuser && adduser appuser appgroup
|
adduser appuser && adduser appuser appgroup
|
||||||
|
|
||||||
RUN mkdir /app /src
|
RUN mkdir /app /src
|
||||||
ADD requirements.txt /src/
|
ADD requirements.txt /src/
|
||||||
RUN CASS_DRIVER_BUILD_CONCURRENCY=4 pip install -r /src/requirements.txt
|
RUN CASS_DRIVER_BUILD_CONCURRENCY=4 pip3 install -r /src/requirements.txt
|
||||||
WORKDIR /src
|
WORKDIR /src
|
||||||
ADD . /src
|
ADD . /src
|
||||||
RUN python /src/setup.py install
|
RUN pip3 install /src/
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
1
build-docker.sh
Executable file
1
build-docker.sh
Executable file
@ -0,0 +1 @@
|
|||||||
|
docker build . -t dr.kemt.fei.tuke.sk/websucker:dev
|
@ -154,6 +154,13 @@ def check(ctx,domain):
|
|||||||
res = db.check_domain(domain)
|
res = db.check_domain(domain)
|
||||||
print(res)
|
print(res)
|
||||||
|
|
||||||
|
@cli.command(help="Export domain as JSON doc per line")
|
||||||
|
@click.pass_context
|
||||||
|
@click.argument("domain")
|
||||||
|
def tojson(ctx,domain):
|
||||||
|
db = create_database_from_context(ctx)
|
||||||
|
db.export_domain(domain)
|
||||||
|
|
||||||
@cli.command(help="Print daily report")
|
@cli.command(help="Print daily report")
|
||||||
@click.pass_context
|
@click.pass_context
|
||||||
def report(ctx):
|
def report(ctx):
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
import cassandra
|
import cassandra
|
||||||
import cassandra.cluster
|
import cassandra.cluster
|
||||||
|
import cassandra.query
|
||||||
import random
|
import random
|
||||||
import os
|
import os
|
||||||
import pkg_resources
|
import pkg_resources
|
||||||
@ -7,6 +8,7 @@ import datetime
|
|||||||
from websucker.parser import normalize_link,urlunparse
|
from websucker.parser import normalize_link,urlunparse
|
||||||
import collections
|
import collections
|
||||||
import math
|
import math
|
||||||
|
import json
|
||||||
|
|
||||||
VERSION = "sucker6"
|
VERSION = "sucker6"
|
||||||
|
|
||||||
@ -21,7 +23,7 @@ class Data:
|
|||||||
Database of text documents
|
Database of text documents
|
||||||
"""
|
"""
|
||||||
def __init__(self,keyspace="websucker",cassandra_host="127.0.0.1",cassandra_port=9042):
|
def __init__(self,keyspace="websucker",cassandra_host="127.0.0.1",cassandra_port=9042):
|
||||||
print("Database {}@{}:{}".format(keyspace,cassandra_host, cassandra_port))
|
#print("Database {}@{}:{}".format(keyspace,cassandra_host, cassandra_port))
|
||||||
# execution profile
|
# execution profile
|
||||||
ep = cassandra.cluster.ExecutionProfile(request_timeout=240.0)
|
ep = cassandra.cluster.ExecutionProfile(request_timeout=240.0)
|
||||||
profiles = {cassandra.cluster.EXEC_PROFILE_DEFAULT:ep}
|
profiles = {cassandra.cluster.EXEC_PROFILE_DEFAULT:ep}
|
||||||
@ -538,6 +540,11 @@ INSERT INTO content(
|
|||||||
if r < link_weight:
|
if r < link_weight:
|
||||||
result.append((d,0))
|
result.append((d,0))
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
def export_domain(self,domain):
|
||||||
|
rows = self.session.execute("SELECT JSON * from content WHERE domain_name=%s",(domain,))
|
||||||
|
for row in rows:
|
||||||
|
print(row[0])
|
||||||
|
|
||||||
def get_visit_links(self,domain,recent_count,old_count,random_count):
|
def get_visit_links(self,domain,recent_count,old_count,random_count):
|
||||||
dblinks = []
|
dblinks = []
|
||||||
|
Loading…
Reference in New Issue
Block a user