2020-05-09 09:50:50 +00:00
from websucker . agent import Connection , visit_links , visit_domain , process_domains , work_domains
2020-05-07 14:09:45 +00:00
from websucker . agent import ParsedDocument
from websucker . parser import BaseParser
from websucker . parser import normalize_link , urlunparse
2020-05-08 05:53:50 +00:00
from websucker . parser import load_parser
2020-05-07 14:09:45 +00:00
from websucker . db import Data
from websucker . db import get_schema
import click
import pprint
2020-05-09 09:50:50 +00:00
import greenstalk
import os
2020-05-07 14:09:45 +00:00
def create_database_from_context(ctx):
    """Build a ``Data`` instance from the Cassandra settings stored on ``ctx.obj``.

    The keyspace, host and port are read from the context dictionary and
    passed to ``Data`` positionally, in that order.
    """
    settings = ctx.obj
    keyspace = settings[" cassandra_keyspace "]
    host = settings[" cassandra_host "]
    port = settings[" cassandra_port "]
    return Data(keyspace, host, port)
2020-05-09 09:50:50 +00:00
def create_queue_from_context(ctx):
    """Build a ``greenstalk.Client`` from the beanstalkd settings on ``ctx.obj``.

    The configured tube is passed as both the ``use`` and the ``watch``
    argument of the client.
    """
    settings = ctx.obj
    host = settings[" beanstalkd_host "]
    port = settings[" beanstalkd_port "]
    tube = settings[" beanstalkd_tube "]
    return greenstalk.Client(host, port, use=tube, watch=tube, encoding=" utf8 ")
2020-05-07 14:09:45 +00:00
@click.group()
@click.pass_context
@click.option(
    " --cassandra-keyspace ",
    metavar=" CASSANDRA_KEYSPACE ",
    help=" cassandra keyspace (if defined, value read from CASSANDRA_KEYSPACE env variable) ",
    envvar=" CASSANDRA_KEYSPACE ",
    default=" websucker ",
    show_default=True,
)
@click.option(
    " --cassandra-host ",
    metavar=" CASSANDRA_HOST ",
    help=" cassandra host (if defined, value read from CASSANDRA_HOST env variable) ",
    envvar=" CASSANDRA_HOST ",
    default=" 127.0.0.1 ",
    show_default=True,
)
@click.option(
    " --cassandra-port ",
    metavar=" CASSANDRA_PORT ",
    help=" cassandra port (if defined, value read from CASSANDRA_PORT env variable) ",
    envvar=" CASSANDRA_PORT ",
    default=9042,
    show_default=True,
)
@click.option(
    " --beanstalkd-tube ",
    metavar=" BEANSTALKD_TUBE ",
    help=" beanstalkd keyspace (if defined, value read from BEANSTALKD_TUBE env variable) ",
    envvar=" BEANSTALKD_TUBE ",
    default=" websucker ",
    show_default=True,
)
@click.option(
    " --beanstalkd-host ",
    metavar=" BEANSTALKD_HOST ",
    help=" beanstalkd host (if defined, value read from beanstalkd_HOST env variable) ",
    envvar=" BEANSTALKD_HOST ",
    default=" 127.0.0.1 ",
    show_default=True,
)
@click.option(
    " --beanstalkd-port ",
    metavar=" BEANSTALKD_PORT ",
    help=" beanstalkd port (if defined, value read from BEANSTALKD_PORT env variable) ",
    envvar=" BEANSTALKD_PORT ",
    default=11300,
    show_default=True,
)
@click.option(
    " --justext-language ",
    metavar=" JUSTEXT_LANGUAGE ",
    help=" Target language (if defined, value read from JUSTEXT_LANGUAGE env variable) ",
    envvar=" JUSTEXT_LANGUAGE ",
    default=" English ",
    show_default=True,
)
@click.option(" --parser ", metavar=" file_name ", help=" zzz ")
@click.option(" --visit ", is_flag=True)
@click.option(" --queue ", is_flag=True)
def cli(
    ctx,
    cassandra_keyspace,
    cassandra_host,
    cassandra_port,
    beanstalkd_tube,
    beanstalkd_host,
    beanstalkd_port,
    justext_language,
    parser,
    visit,
    queue,
):
    # Root command group: resolves the parser to use and stores every shared
    # setting on ctx.obj for the subcommands.
    # NOTE: deliberately documented with comments rather than a docstring;
    # the group has no explicit help= and click would otherwise show the
    # docstring as the group's help text.
    ctx.ensure_object(dict)
    settings = ctx.obj

    # Default parser, configured with the requested target language.
    active_parser = BaseParser()
    active_parser.justext_language = justext_language

    # A Suckerfile in the current working directory replaces whatever was
    # given through the parser option.
    local_suckerfile = os.getcwd() + " /Suckerfile.py "
    if os.path.isfile(local_suckerfile):
        parser = local_suckerfile

    # A parser loaded from a file replaces the default one entirely.
    if parser is not None:
        active_parser = load_parser(parser)
        assert active_parser is not None

    settings.update(
        {
            " parser ": active_parser,
            " cassandra_host ": cassandra_host,
            " cassandra_port ": cassandra_port,
            " cassandra_keyspace ": cassandra_keyspace,
            " beanstalkd_host ": beanstalkd_host,
            " beanstalkd_port ": beanstalkd_port,
            " beanstalkd_tube ": beanstalkd_tube,
            " visit ": visit,
            " queue ": queue,
        }
    )
2020-05-07 14:09:45 +00:00
2020-05-09 09:50:50 +00:00
@cli.command(help=" All domains ")
@click.pass_context
@click.argument(" count ", type=int, default=20)
def all(ctx, count):
    """Hand the result of ``db.all_domains(count)`` to ``process_domains``.

    A queue client is created only when the queue flag is set on the
    context; otherwise ``None`` is passed as the queue.
    """
    database = create_database_from_context(ctx)
    domains = database.all_domains(count)
    job_queue = create_queue_from_context(ctx) if ctx.obj[" queue "] else None
    process_domains(
        domains, ctx.obj[" visit "], ctx.obj[" parser "], database, job_queue
    )
2020-05-07 14:09:45 +00:00
2020-05-09 09:50:50 +00:00
@cli.command(help=" Work queue ")
@click.pass_context
def work(ctx):
    """Run ``work_domains`` with the configured parser, database and queue."""
    database = create_database_from_context(ctx)
    job_queue = create_queue_from_context(ctx)
    active_parser = ctx.obj[" parser "]
    work_domains(active_parser, database, job_queue)
2020-05-07 14:09:45 +00:00
@cli.command(help=" find best domains ")
@click.pass_context
@click.argument(" count ", type=int, default=20)
def best(ctx, count):
    """Hand the result of ``db.get_best_domains(count)`` to ``process_domains``.

    A queue client is created only when the queue flag is set on the
    context; otherwise ``None`` is passed as the queue.
    """
    database = create_database_from_context(ctx)
    best_domains = database.get_best_domains(count)
    job_queue = create_queue_from_context(ctx) if ctx.obj[" queue "] else None
    process_domains(
        best_domains, ctx.obj[" visit "], ctx.obj[" parser "], database, job_queue
    )
2020-05-07 14:09:45 +00:00
@cli.command(help=" Find unvisited domains, Visit a site, get links and crawl ")
@click.pass_context
@click.argument(" count ", type=int, default=20)
def unvisited(ctx, count):
    """Hand the result of ``db.get_unvisited_domains(count)`` to ``process_domains``.

    A queue client is created only when the queue flag is set on the
    context; otherwise ``None`` is passed as the queue.
    """
    database = create_database_from_context(ctx)
    unvisited_domains = database.get_unvisited_domains(count)
    job_queue = create_queue_from_context(ctx) if ctx.obj[" queue "] else None
    process_domains(
        unvisited_domains,
        ctx.obj[" visit "],
        ctx.obj[" parser "],
        database,
        job_queue,
    )
@cli.command(help=" Visit url and get links. Start here ")
@click.pass_context
@click.argument(" link ")
def start(ctx, link):
    """Visit a single link and then update the statistics of its domain.

    Args:
        ctx: click context whose ``obj`` dictionary holds the parser and the
            database settings.
        link: URL to visit.
    """
    # Imported locally so the command does not depend on a file-level import.
    from urllib.parse import urlparse

    db = create_database_from_context(ctx)
    p = ctx.obj[" parser "]
    c = Connection()
    visit_links([link], c, p, db)
    # The original code passed an undefined name ``domain`` here, which
    # raised NameError after every visit; derive the domain from the link.
    # NOTE(review): assumes the database identifies a domain by the URL's
    # network location (host[:port]) - confirm against Data.check_domain.
    domain = urlparse(link).netloc
    if domain:
        # A link without a scheme has no network location; in that case
        # there is no domain to check.
        db.check_domain(domain)
@cli.command(help=" Continue crawling of seen links from a domain ")
@click.pass_context
@click.argument(" domain ")
def crawl(ctx, domain):
    """Visit links selected by the database for ``domain``, then re-check it.

    The links come from ``db.get_visit_links``, which is given the parser's
    ``recent_links``, ``old_links`` and ``random_links`` attributes.
    """
    database = create_database_from_context(ctx)
    active_parser = ctx.obj[" parser "]
    connection = Connection()
    links_to_visit = database.get_visit_links(
        domain,
        active_parser.recent_links,
        active_parser.old_links,
        active_parser.random_links,
    )
    visit_links(links_to_visit, connection, active_parser, database)
    database.check_domain(domain)
2020-05-07 14:09:45 +00:00
@cli.command(help=" Update domain statistics ")
@click.pass_context
@click.argument(" domain ")
def check(ctx, domain):
    """Call ``db.check_domain`` for ``domain`` and print what it returns."""
    database = create_database_from_context(ctx)
    print(database.check_domain(domain))
@cli.command(help=" Print daily report ")
@click.pass_context
def report(ctx):
    """Run the database's daily report and, if queueing is on, print job counts.

    Args:
        ctx: click context whose ``obj`` dictionary holds the database and
            beanstalkd settings and the queue flag.
    """
    db = create_database_from_context(ctx)
    db.daily_report()
    if ctx.obj[" queue "]:
        q = create_queue_from_context(ctx)
        stats = q.stats_tube(ctx.obj[" beanstalkd_tube "])
        buried = stats[" current-jobs-buried "]
        # The ready count was previously read from the buried counter
        # (copy-paste error), so the report showed the buried count twice.
        ready = stats[" current-jobs-ready "]
        print(" {} ready jobs, {} burried jobs ".format(ready, buried))
2020-05-07 14:09:45 +00:00
@cli.command(help=" Print keyspace schema ")
def schema():
    """Print the value returned by ``get_schema()``."""
    print(get_schema())
@cli.command(help=" Fetch given url (just for debug) ")
@click.pass_context
@click.argument(" urls ")
def fetch(ctx, urls):
    """Download ``urls`` and print a ``ParsedDocument`` for each response.

    Every response's canonical link is used as the document's target link,
    and the document is extracted from the response's ``content`` and ``bs``
    attributes.
    """
    active_parser = ctx.obj[" parser "]
    downloader = Connection()
    # NOTE(review): ``urls`` arrives as a single command-line argument and is
    # passed through unchanged - confirm what html_download2 expects.
    for response in downloader.html_download2(urls):
        canonical_link = response.get_canonical()
        document = ParsedDocument(active_parser, canonical_link)
        document.extract(response.content, response.bs)
        print(document)
# Invoke the click command group when this name check succeeds.
if __name__ == " __main__ ":
    cli()