2020-05-09 09:50:50 +00:00
from websucker . agent import Connection , visit_links , visit_domain , process_domains , work_domains
2020-05-07 14:09:45 +00:00
from websucker . parser import BaseParser
2023-03-06 15:29:30 +00:00
from websucker . parser import SoupParser
from websucker . parser import TrafilaturaParser
2020-05-08 05:53:50 +00:00
from websucker . parser import load_parser
2020-05-07 14:09:45 +00:00
from websucker . db import Data
from websucker . db import get_schema
2023-02-28 07:56:35 +00:00
import websucker . db
2020-05-07 14:09:45 +00:00
import click
import pprint
2020-05-09 09:50:50 +00:00
import greenstalk
import os
2020-05-07 14:09:45 +00:00
2023-02-28 11:55:44 +00:00
import websucker . schema
2023-02-28 07:56:35 +00:00
2020-05-07 14:09:45 +00:00
def create_database_from_context(ctx):
    """Build the database handle from the settings stored on the click context.

    Reads the Cassandra keyspace, host, port, username and password from
    ``ctx.obj`` and passes them, in that order, to ``Data``.
    """
    setting_keys = (
        " cassandra_keyspace ",
        " cassandra_host ",
        " cassandra_port ",
        " cassandra_username ",
        " cassandra_password ",
    )
    # Positional order matters: it mirrors the Data(...) call signature.
    return Data(*[ctx.obj[key] for key in setting_keys])
2020-05-07 14:09:45 +00:00
2020-05-09 09:50:50 +00:00
def create_queue_from_context(ctx):
    """Open a beanstalkd client from the settings stored on the click context.

    The configured tube is passed both as the tube to ``use`` and as the
    tube to ``watch``.
    """
    settings = ctx.obj
    address = (settings[" beanstalkd_host "], settings[" beanstalkd_port "])
    tube = settings[" beanstalkd_tube "]
    return greenstalk.Client(address, use=tube, watch=tube, encoding=" utf8 ")
2020-05-09 09:50:50 +00:00
2020-05-07 14:09:45 +00:00
@click.group()
@click.pass_context
@click.option(
    " --cassandra-keyspace ",
    metavar=" CASSANDRA_KEYSPACE ",
    help=" cassandra keyspace (if defined, value read from CASSANDRA_KEYSPACE env variable) ",
    envvar=" CASSANDRA_KEYSPACE ",
    default=" websucker ",
    show_default=True,
)
@click.option(
    " --cassandra-host ",
    metavar=" CASSANDRA_HOST ",
    help=" cassandra host (if defined, value read from CASSANDRA_HOST env variable) ",
    envvar=" CASSANDRA_HOST ",
    default=" 127.0.0.1 ",
    show_default=True,
)
@click.option(
    " --cassandra-port ",
    metavar=" CASSANDRA_PORT ",
    help=" cassandra port (if defined, value read from CASSANDRA_PORT env variable) ",
    envvar=" CASSANDRA_PORT ",
    default=9042,
    show_default=True,
)
@click.option(
    " --cassandra-username ",
    metavar=" CASSANDRA_USERNAME ",
    help=" cassandra username (if defined, value read from CASSANDRA_USERNAME env variable) ",
    envvar=" CASSANDRA_USERNAME ",
    default=" cassandra ",
    show_default=True,
)
@click.option(
    " --cassandra-password ",
    metavar=" CASSANDRA_PASSWORD ",
    help=" cassandra password (if defined, value read from CASSANDRA_PASSWORD env variable) ",
    envvar=" CASSANDRA_PASSWORD ",
    default=" cassandra ",
    show_default=True,
)
@click.option(
    " --beanstalkd-tube ",
    metavar=" BEANSTALKD_TUBE ",
    help=" beanstalkd tube (if defined, value read from BEANSTALKD_TUBE env variable) ",
    envvar=" BEANSTALKD_TUBE ",
    default=" websucker ",
    show_default=True,
)
@click.option(
    " --beanstalkd-host ",
    metavar=" BEANSTALKD_HOST ",
    help=" beanstalkd host (if defined, value read from BEANSTALKD_HOST env variable) ",
    envvar=" BEANSTALKD_HOST ",
    default=" 127.0.0.1 ",
    show_default=True,
)
@click.option(
    " --beanstalkd-port ",
    metavar=" BEANSTALKD_PORT ",
    help=" beanstalkd port (if defined, value read from BEANSTALKD_PORT env variable) ",
    envvar=" BEANSTALKD_PORT ",
    default=11300,
    show_default=True,
)
@click.option(
    " --parser ",
    metavar=" file_name ",
    help=" parser definition file (default: Suckerfile.py in the current directory, if present) ",
)
@click.option(" --visit ", is_flag=True)
@click.option(" --queue ", is_flag=True)
def cli(ctx, cassandra_keyspace, cassandra_host, cassandra_port, cassandra_username, cassandra_password, beanstalkd_tube, beanstalkd_host, beanstalkd_port, parser, visit, queue):
    """Websucker command line interface."""
    # NOTE: click shows the docstring above as the group's --help text,
    # because @click.group() is given no explicit help=.
    #
    # This callback runs before every subcommand. It resolves the parser and
    # copies all connection settings and flags into ctx.obj, where the
    # subcommands read them.
    #
    # Raises:
    #   click.BadParameter: --parser names something that is not a file.
    #   click.ClickException: the parser file was loaded but gave no parser.
    ctx.ensure_object(dict)
    # Default parser, used when no parser file is given or discovered.
    p = TrafilaturaParser()
    if parser is not None:
        # Report a bad path as a usage error instead of an AssertionError
        # (asserts are stripped under -O and give the user a raw traceback).
        if not os.path.isfile(parser):
            raise click.BadParameter(
                "parser file {!r} does not exist".format(parser),
                param_hint="--parser",
            )
    else:
        # No explicit parser: fall back to a Suckerfile in the working directory.
        suckerfile = os.getcwd() + " /Suckerfile.py "
        if os.path.isfile(suckerfile):
            parser = suckerfile
    if parser is not None:
        p = load_parser(parser)
        if p is None:
            raise click.ClickException(
                "no parser could be loaded from {!r}".format(parser)
            )
    ctx.obj[" parser "] = p
    ctx.obj[" cassandra_host "] = cassandra_host
    ctx.obj[" cassandra_port "] = cassandra_port
    ctx.obj[" cassandra_username "] = cassandra_username
    ctx.obj[" cassandra_password "] = cassandra_password
    ctx.obj[" cassandra_keyspace "] = cassandra_keyspace
    ctx.obj[" beanstalkd_host "] = beanstalkd_host
    ctx.obj[" beanstalkd_port "] = beanstalkd_port
    ctx.obj[" beanstalkd_tube "] = beanstalkd_tube
    ctx.obj[" visit "] = visit
    ctx.obj[" queue "] = queue
2020-05-07 14:09:45 +00:00
2021-01-21 09:47:29 +00:00
@cli.command(help=" Get visited domains from db ")
@click.pass_context
@click.argument(" count ", type=int, default=20)
def all(ctx, count):
    """Fetch up to ``count`` domains via ``db.all_domains`` and process them.

    A beanstalkd queue is opened only when the ``queue`` flag is set on the
    context; otherwise ``None`` is passed to ``process_domains``.
    """
    database = create_database_from_context(ctx)
    domains = database.all_domains(count)
    queue = create_queue_from_context(ctx) if ctx.obj[" queue "] else None
    process_domains(domains, ctx.obj[" visit "], ctx.obj[" parser "], database, queue)
2020-05-07 14:09:45 +00:00
2021-01-21 09:47:29 +00:00
@cli.command(help=" Get random domains ")
@click.pass_context
@click.argument(" count ", type=int, default=20)
def blind(ctx, count):
    """Select ``count`` domains via ``db.get_random_domains`` and process them.

    A beanstalkd queue is opened only when the ``queue`` flag is set on the
    context; otherwise ``None`` is passed to ``process_domains``.
    """
    database = create_database_from_context(ctx)
    parser = ctx.obj[" parser "]
    selected = database.get_random_domains(count, parser)
    queue = create_queue_from_context(ctx) if ctx.obj[" queue "] else None
    process_domains(selected, ctx.obj[" visit "], parser, database, queue)
2020-05-07 14:09:45 +00:00
2021-01-21 09:47:29 +00:00
@cli.command(help=" Visit domains from queue ")
@click.pass_context
def work(ctx):
    """Hand the parser, database and queue to ``work_domains``.

    The database handle is created before the queue client.
    """
    database = create_database_from_context(ctx)
    queue = create_queue_from_context(ctx)
    parser = ctx.obj[" parser "]
    work_domains(parser, database, queue)
@cli.command(help=" Get best domains from db ")
@click.pass_context
@click.argument(" count ", type=int, default=20)
def best(ctx, count):
    """Select ``count`` domains via ``db.get_best_domains`` and process them.

    A beanstalkd queue is opened only when the ``queue`` flag is set on the
    context; otherwise ``None`` is passed to ``process_domains``.
    """
    database = create_database_from_context(ctx)
    parser = ctx.obj[" parser "]
    selected = database.get_best_domains(count, parser)
    queue = create_queue_from_context(ctx) if ctx.obj[" queue "] else None
    process_domains(selected, ctx.obj[" visit "], parser, database, queue)
2020-05-07 14:09:45 +00:00
2021-01-21 09:47:29 +00:00
@cli.command(help=" Get unvisited domains ")
@click.pass_context
@click.argument(" count ", type=int, default=20)
def unvisited(ctx, count):
    """Select ``count`` domains via ``db.get_unvisited_domains`` and process them.

    A beanstalkd queue is opened only when the ``queue`` flag is set on the
    context; otherwise ``None`` is passed to ``process_domains``.
    """
    database = create_database_from_context(ctx)
    parser = ctx.obj[" parser "]
    selected = database.get_unvisited_domains(count, parser)
    queue = create_queue_from_context(ctx) if ctx.obj[" queue "] else None
    process_domains(selected, ctx.obj[" visit "], parser, database, queue)
2020-05-09 09:50:50 +00:00
2020-06-08 14:09:47 +00:00
@cli.command(help=" Visit domains from file ")
@click.pass_context
@click.argument(" name ")
def file(ctx, name):
    """Read domains from the text file ``name`` (one per line) and process them.

    Each non-empty line is stripped of surrounding whitespace and paired
    with ``0`` before being handed to ``process_domains``. Blank lines are
    skipped so that an empty string is never submitted as a domain.

    A beanstalkd queue is opened only when the ``queue`` flag is set on the
    context; otherwise ``None`` is passed to ``process_domains``.
    """
    db = create_database_from_context(ctx)
    p = ctx.obj[" parser "]
    domains = []
    with open(name) as f:
        for line in f:
            domain = line.strip()
            # Skip blank / whitespace-only lines (e.g. a trailing newline).
            if domain:
                domains.append((domain, 0))
    q = None
    if ctx.obj[" queue "]:
        q = create_queue_from_context(ctx)
    process_domains(domains, ctx.obj[" visit "], p, db, q)
2021-01-21 09:47:29 +00:00
@cli.command(help=" Visit one url and get links. Start here ")
@click.pass_context
@click.argument(" link ")
def start(ctx, link):
    """Visit the single URL ``link`` through ``visit_links``.

    The link is wrapped in a one-element list and visited with a fresh
    ``Connection``, the configured parser and the database handle.
    """
    database = create_database_from_context(ctx)
    parser = ctx.obj[" parser "]
    connection = Connection()
    seed_links = [link]
    visit_links(seed_links, connection, parser, database)
2021-01-20 12:54:47 +00:00
#db.check_domain(domain)
2020-05-09 09:50:50 +00:00
@cli.command(help=" Continue crawling of seen links from a domain ")
@click.pass_context
@click.argument(" domain ")
def crawl(ctx, domain):
    """Visit stored links of ``domain`` and then run ``db.check_domain`` on it.

    The links come from ``db.get_visit_links``, called with the parser's
    ``recent_links``, ``old_links`` and ``random_links`` attributes.
    """
    database = create_database_from_context(ctx)
    parser = ctx.obj[" parser "]
    connection = Connection()
    pending = database.get_visit_links(
        domain,
        parser.recent_links,
        parser.old_links,
        parser.random_links,
    )
    visit_links(pending, connection, parser, database)
    database.check_domain(domain)
2020-05-07 14:09:45 +00:00
@cli.command(help=" Update domain statistics ")
@click.pass_context
@click.argument(" domain ")
def check(ctx, domain):
    """Run ``db.check_domain`` for ``domain`` and print its return value."""
    database = create_database_from_context(ctx)
    print(database.check_domain(domain))
2023-02-26 13:10:58 +00:00
@cli.command(help=" Export domain as JSON doc per line ")
@click.pass_context
@click.argument(" domain ")
def tojson(ctx, domain):
    """Delegate the export of ``domain`` to ``db.export_domain``."""
    create_database_from_context(ctx).export_domain(domain)
2020-05-07 14:09:45 +00:00
@cli.command(help=" Print daily report ")
@click.pass_context
def report(ctx):
    """Print the database daily report followed by beanstalkd tube statistics.

    The queue part is best-effort: any error while connecting to beanstalkd
    or reading the statistics is printed instead of aborting the command.
    The queue client is closed once the statistics have been read.
    """
    db = create_database_from_context(ctx)
    db.daily_report()
    try:
        # Context manager guarantees the beanstalkd socket is closed,
        # including when stats_tube raises.
        with create_queue_from_context(ctx) as q:
            stats = q.stats_tube(ctx.obj[" beanstalkd_tube "])
        buried = stats[" current-jobs-buried "]
        ready = stats[" current-jobs-ready "]
        print(" queue {} at {} : {} ".format(ctx.obj[" beanstalkd_tube "], ctx.obj[" beanstalkd_host "], ctx.obj[" beanstalkd_port "]))
        print(" {} ready jobs, {} burried jobs ".format(ready, buried))
    except Exception as err:
        # Deliberately broad: the report must still finish when the queue
        # is unreachable or returns unexpected statistics.
        print(err)
@cli.command(help=" Database summary ")
@click.pass_context
def summary(ctx):
    """Call ``db.summary`` with the configured parser."""
    database = create_database_from_context(ctx)
    database.summary(ctx.obj[" parser "])
2023-02-28 07:56:35 +00:00
@cli.command(help=" Create database ")
@click.pass_context
@click.argument(" replication ", default=1)
def create_database(ctx, replication):
    """Drop and recreate the configured keyspace, then create the schema.

    Connects with the Cassandra settings from the context, drops the
    keyspace, recreates it through ``create_keyspace_simple`` with
    ``replication`` as its second argument, and finally calls
    ``websucker.schema.create_database``. Any existing keyspace of that
    name is dropped first.
    """
    settings = ctx.obj
    cluster = websucker.db.connect_cluster(
        settings[" cassandra_host "],
        settings[" cassandra_port "],
        settings[" cassandra_username "],
        settings[" cassandra_password "],
    )
    with cluster.connect() as session:
        from cassandra.cqlengine import connection as cqlengine_connection
        from cassandra.cqlengine import management

        # NOTE(review): the keyspace is selected before it is dropped and
        # recreated; presumably this fails on a cluster where the keyspace
        # does not exist yet - confirm before reordering.
        session.set_keyspace(settings[" cassandra_keyspace "])
        cqlengine_connection.set_session(session)
        target = settings[" cassandra_keyspace "]
        management.drop_keyspace(target)
        management.create_keyspace_simple(target, replication)
        websucker.schema.create_database(settings[" cassandra_keyspace "], session)
2023-02-28 07:56:35 +00:00
2020-05-07 14:09:45 +00:00
@cli.command(help=" Print keyspace schema ")
def schema():
    """Print whatever ``get_schema`` returns."""
    print(get_schema())
@cli.command(help=" Fetch given url (just for debug) ")
@click.pass_context
@click.argument(" urls ")
def fetch(ctx, urls):
    """Download ``urls`` and print the parser's extraction for each response.

    Each response from ``Connection.html_download2`` is passed to the
    parser's ``full_extract`` with its content, its ``bs`` attribute and
    its canonical link.
    """
    extractor = ctx.obj[" parser "]
    downloader = Connection()
    for response in downloader.html_download2(urls):
        # Resolve the canonical link first, before content/bs are read.
        canonical = response.get_canonical()
        extracted = extractor.full_extract(response.content, response.bs, canonical)
        print(extracted)
# Script entry point: invoke the click group when the module-name guard matches.
if __name__ == " __main__ ":
    cli()