This commit is contained in:
Daniel Hladek 2020-05-08 07:53:50 +02:00
parent 0c9ea2b4e3
commit abeef76afb
2 changed files with 48 additions and 3 deletions

View File

@ -2,13 +2,13 @@ from websucker.agent import Connection,visit_links,visit_domain
from websucker.agent import ParsedDocument
from websucker.parser import BaseParser
from websucker.parser import normalize_link,urlunparse
from websucker.parser import load_parser
from websucker.db import Data
from websucker.db import get_schema
import click
import pprint
def create_database_from_context(ctx):
    """Build a ``Data`` object from the Cassandra settings kept in ``ctx.obj``.

    Reads the ``cassandra_keyspace``, ``cassandra_host`` and
    ``cassandra_port`` entries of ``ctx.obj`` and passes them to ``Data``
    in that order.
    """
    keyspace = ctx.obj["cassandra_keyspace"]
    host = ctx.obj["cassandra_host"]
    port = ctx.obj["cassandra_port"]
    return Data(keyspace, host, port)
@ -17,13 +17,16 @@ def create_database_from_context(ctx):
@click.option("--cassandra-keyspace",metavar="CASSANDRA_KEYSPACE",help="cassandra keyspace (if defined, value read from CASSANDRA_KEYSPACE env variable)",envvar="CASSANDRA_KEYSPACE",default="websucker",show_default=True)
@click.option("--cassandra-host",metavar="CASSANDRA_HOST",help="cassandra host (if defined, value read from CASSANDRA_HOST env variable)",envvar="CASSANDRA_HOST",default="127.0.0.1",show_default=True)
@click.option("--cassandra-port",metavar="CASSANDRA_PORT",help="cassandra port (if defined, value read from CASSANDRA_PORT env variable)",envvar="CASSANDRA_PORT",default=9042,show_default=True)
@click.option("--justext-language",metavar="JUSTEXT_LANGUAGE",help="Target language (if defined, value read from JUSTEXT_LANGUAGE env variable)",envvar="JUSTEXT_LANGUAGE",default="English",show_default=True)
@click.option("--parser",metavar="file_name",help="zzz")
@click.option("--visit",is_flag=True)
def cli(ctx,cassandra_keyspace,cassandra_host,cassandra_port,justext_language,visit):
def cli(ctx,cassandra_keyspace,cassandra_host,cassandra_port,justext_language,parser,visit):
ctx.ensure_object(dict)
p = BaseParser()
p.justext_language = justext_language
if parser is not None:
p = load_parser(parser)
assert p is not None
ctx.obj["parser"] = p
ctx.obj["cassandra_host"] = cassandra_host
ctx.obj["cassandra_port"] = cassandra_port

View File

@ -8,6 +8,11 @@ import lxml.etree
import urllib.parse
import os.path
import importlib
import sys
import os
import inspect
# One or two digits, a dot, one or two digits, a dot, then a four-digit
# number starting with 1 or 2 (optional whitespace after each dot),
# e.g. "8. 5. 2020". Raw string so the backslashes reach the regex engine
# without triggering invalid-escape warnings.
datere = re.compile(r"\d{1,2}\.\s*\d{1,2}\.\s*[12]\d{3}")
# Exactly four digits with a whitespace character on each side.
yearre = re.compile(r"\s\d{4}\s")
@ -333,3 +338,40 @@ class EnglishParser(BaseParser):
self.justext_language = "English"
self.allowdomains = set(["com","org","io"])
# https://github.com/scrapy/scrapy/blob/master/scrapy/commands/runspider.py
def _import_file(filepath):
    """Import the Python source file at *filepath* and return its module.

    The directory containing the file is placed at the front of ``sys.path``
    while the import runs, so the file can be imported by its bare module
    name, and is removed again afterwards.

    Args:
        filepath: Path to a ``.py`` file; it is made absolute with
            ``os.path.abspath`` before use.

    Returns:
        The module object returned by ``importlib.import_module``.

    Raises:
        ValueError: If the file extension is not ``.py``.
        Exception: Anything raised while importing the module propagates
            unchanged; ``sys.path`` is still cleaned up in that case.
    """
    abspath = os.path.abspath(filepath)
    dirname, basename = os.path.split(abspath)
    modname, ext = os.path.splitext(basename)
    if ext != '.py':
        raise ValueError("Not a Python source file: %s" % abspath)
    if dirname:
        sys.path.insert(0, dirname)
    try:
        module = importlib.import_module(modname)
    finally:
        if dirname:
            # Remove our entry by value rather than by position: the imported
            # module may itself have prepended to sys.path, in which case
            # pop(0) would drop the wrong entry and leave dirname behind.
            try:
                sys.path.remove(dirname)
            except ValueError:
                # The imported code already removed it; nothing to undo.
                pass
    return module
def iter_parser(module):
    """Yield the parser classes that are defined in *module* itself.

    An attribute of *module* is yielded when it is a class, its
    ``__module__`` equals ``module.__name__`` (so classes that were only
    imported into the module are skipped), and it is a subclass of
    ``BaseParser``. Classes come out in the order of ``vars(module)``.
    """
    for candidate in vars(module).values():
        if not inspect.isclass(candidate):
            continue
        if candidate.__module__ != module.__name__:
            continue
        if issubclass(candidate, BaseParser):
            yield candidate
def load_parser(file_name):
    """Import *file_name* and instantiate a parser class found in it.

    The file is imported with ``_import_file`` and scanned with
    ``iter_parser``. When several classes qualify, the last one yielded is
    instantiated with no arguments.

    Returns:
        A new instance of the selected class, or ``None`` when the module
        contains no qualifying class.
    """
    module = _import_file(file_name)
    candidates = list(iter_parser(module))
    if not candidates:
        return None
    parser_class = candidates[-1]
    return parser_class()