diff --git a/websucker/cli.py b/websucker/cli.py
index ce63fa1..862cd66 100644
--- a/websucker/cli.py
+++ b/websucker/cli.py
@@ -2,13 +2,13 @@ from websucker.agent import Connection,visit_links,visit_domain
 from websucker.agent import ParsedDocument
 from websucker.parser import BaseParser
 from websucker.parser import normalize_link,urlunparse
+from websucker.parser import load_parser
 from websucker.db import Data
 from websucker.db import get_schema
 import click
 import pprint
-
 
 
 def create_database_from_context(ctx):
     return Data(ctx.obj["cassandra_keyspace"],ctx.obj["cassandra_host"],ctx.obj["cassandra_port"])
 
@@ -17,13 +17,18 @@ def create_database_from_context(ctx):
 @click.option("--cassandra-keyspace",metavar="CASSANDRA_KEYSPACE",help="cassandra keyspace (if defined, value read from CASSANDRA_KEYSPACE env variable)",envvar="CASSANDRA_KEYSPACE",default="websucker",show_default=True)
 @click.option("--cassandra-host",metavar="CASSANDRA_HOST",help="cassandra host (if defined, value read from CASSANDRA_HOST env variable)",envvar="CASSANDRA_HOST",default="127.0.0.1",show_default=True)
 @click.option("--cassandra-port",metavar="CASSANDRA_PORT",help="cassandra port (if defined, value read from CASSANDRA_PORT env variable)",envvar="CASSANDRA_PORT",default=9042,show_default=True)
-
 @click.option("--justext-language",metavar="JUSTEXT_LANGUAGE",help="Target language (if defined, value read from JUSTEXT_LANGUAGE env variable)",envvar="JUSTEXT_LANGUAGE",default="English",show_default=True)
+@click.option("--parser",metavar="file_name",help="Path to a Python source file defining a custom parser (a BaseParser subclass); overrides the default parser")
 @click.option("--visit",is_flag=True)
-def cli(ctx,cassandra_keyspace,cassandra_host,cassandra_port,justext_language,visit):
+def cli(ctx,cassandra_keyspace,cassandra_host,cassandra_port,justext_language,parser,visit):
     ctx.ensure_object(dict)
     p = BaseParser()
     p.justext_language = justext_language
+    if parser is not None:
+        # A user-supplied parser module replaces the default BaseParser.
+        p = load_parser(parser)
+        if p is None:
+            raise click.UsageError("No BaseParser subclass found in %s" % parser)
     ctx.obj["parser"] = p
     ctx.obj["cassandra_host"] = cassandra_host
ctx.obj["cassandra_port"] = cassandra_port diff --git a/websucker/parser.py b/websucker/parser.py index 86bb9f7..05b6428 100644 --- a/websucker/parser.py +++ b/websucker/parser.py @@ -8,6 +8,11 @@ import lxml.etree import urllib.parse import os.path +import importlib +import sys +import os +import inspect + datere = re.compile("\d{1,2}\.\s*\d{1,2}\.\s*[12]\d{3}") yearre = re.compile(r"\s\d{4}\s") @@ -333,3 +338,40 @@ class EnglishParser(BaseParser): self.justext_language = "English" self.allowdomains = set(["com","org","io"]) +# https://github.com/scrapy/scrapy/blob/master/scrapy/commands/runspider.py +def _import_file(filepath): + abspath = os.path.abspath(filepath) + dirname, file = os.path.split(abspath) + fname, fext = os.path.splitext(file) + if fext != '.py': + raise ValueError("Not a Python source file: %s" % abspath) + if dirname: + sys.path = [dirname] + sys.path + try: + module = importlib.import_module(fname) + finally: + if dirname: + sys.path.pop(0) + return module + +def iter_parser(module): + """Return an iterator over all spider classes defined in the given module + that can be instantiated (i.e. which have name) + """ + # this needs to be imported here until get rid of the spider manager + # singleton in scrapy.spider.spiders + for obj in vars(module).values(): + + if inspect.isclass(obj) and \ + obj.__module__ == module.__name__ and \ + issubclass(obj, BaseParser): + yield obj + +def load_parser(file_name): + pmodule = _import_file(file_name) + parsers = [m for m in iter_parser(pmodule)] + p = None + if len(parsers)> 0: + pc = parsers[-1] + p = pc() + return p