zz
This commit is contained in:
parent
0c9ea2b4e3
commit
abeef76afb
@ -2,13 +2,13 @@ from websucker.agent import Connection,visit_links,visit_domain
|
||||
from websucker.agent import ParsedDocument
|
||||
from websucker.parser import BaseParser
|
||||
from websucker.parser import normalize_link,urlunparse
|
||||
from websucker.parser import load_parser
|
||||
from websucker.db import Data
|
||||
from websucker.db import get_schema
|
||||
import click
|
||||
import pprint
|
||||
|
||||
|
||||
|
||||
def create_database_from_context(ctx):
    """Build a ``Data`` object from the Cassandra settings stored on *ctx*.

    Reads the ``cassandra_keyspace``, ``cassandra_host`` and
    ``cassandra_port`` entries of ``ctx.obj`` and passes them positionally,
    in that order, to ``Data``.

    Raises:
        KeyError: If any of the three entries is missing from ``ctx.obj``.
    """
    settings = ctx.obj
    keyspace = settings["cassandra_keyspace"]
    host = settings["cassandra_host"]
    port = settings["cassandra_port"]
    return Data(keyspace, host, port)
|
||||
|
||||
@ -17,13 +17,16 @@ def create_database_from_context(ctx):
|
||||
@click.option("--cassandra-keyspace",metavar="CASSANDRA_KEYSPACE",help="cassandra keyspace (if defined, value read from CASSANDRA_KEYSPACE env variable)",envvar="CASSANDRA_KEYSPACE",default="websucker",show_default=True)
|
||||
@click.option("--cassandra-host",metavar="CASSANDRA_HOST",help="cassandra host (if defined, value read from CASSANDRA_HOST env variable)",envvar="CASSANDRA_HOST",default="127.0.0.1",show_default=True)
|
||||
@click.option("--cassandra-port",metavar="CASSANDRA_PORT",help="cassandra port (if defined, value read from CASSANDRA_PORT env variable)",envvar="CASSANDRA_PORT",default=9042,show_default=True)
|
||||
|
||||
@click.option("--justext-language",metavar="JUSTEXT_LANGUAGE",help="Target language (if defined, value read from JUSTEXT_LANGUAGE env variable)",envvar="JUSTEXT_LANGUAGE",default="English",show_default=True)
|
||||
@click.option("--parser",metavar="file_name",help="zzz")
|
||||
@click.option("--visit",is_flag=True)
|
||||
def cli(ctx,cassandra_keyspace,cassandra_host,cassandra_port,justext_language,visit):
|
||||
def cli(ctx,cassandra_keyspace,cassandra_host,cassandra_port,justext_language,parser,visit):
|
||||
ctx.ensure_object(dict)
|
||||
p = BaseParser()
|
||||
p.justext_language = justext_language
|
||||
if parser is not None:
|
||||
p = load_parser(parser)
|
||||
assert p is not None
|
||||
ctx.obj["parser"] = p
|
||||
ctx.obj["cassandra_host"] = cassandra_host
|
||||
ctx.obj["cassandra_port"] = cassandra_port
|
||||
|
@ -8,6 +8,11 @@ import lxml.etree
|
||||
import urllib.parse
|
||||
import os.path
|
||||
|
||||
import importlib
|
||||
import sys
|
||||
import os
|
||||
import inspect
|
||||
|
||||
|
||||
# Matches a numeric "D. M. YYYY" style date: one or two digits, a dot,
# optional whitespace, one or two digits, a dot, optional whitespace, and a
# four-digit year starting with 1 or 2.
# Raw string: "\d", "\." and "\s" are invalid escape sequences in a normal
# string literal and trigger a SyntaxWarning on recent Python versions.
datere = re.compile(r"\d{1,2}\.\s*\d{1,2}\.\s*[12]\d{3}")
# Matches any four-digit number surrounded by whitespace on both sides.
yearre = re.compile(r"\s\d{4}\s")
|
||||
@ -333,3 +338,40 @@ class EnglishParser(BaseParser):
|
||||
self.justext_language = "English"
|
||||
self.allowdomains = set(["com","org","io"])
|
||||
|
||||
# https://github.com/scrapy/scrapy/blob/master/scrapy/commands/runspider.py
|
||||
def _import_file(filepath):
|
||||
abspath = os.path.abspath(filepath)
|
||||
dirname, file = os.path.split(abspath)
|
||||
fname, fext = os.path.splitext(file)
|
||||
if fext != '.py':
|
||||
raise ValueError("Not a Python source file: %s" % abspath)
|
||||
if dirname:
|
||||
sys.path = [dirname] + sys.path
|
||||
try:
|
||||
module = importlib.import_module(fname)
|
||||
finally:
|
||||
if dirname:
|
||||
sys.path.pop(0)
|
||||
return module
|
||||
|
||||
def iter_parser(module):
    """Yield every parser class that *module* itself defines.

    A value from the module namespace is yielded when all of the following
    hold: it is a class, its ``__module__`` equals ``module.__name__`` (so
    classes merely imported into the module are skipped), and it is a
    subclass of ``BaseParser``.

    Classes are yielded in the iteration order of ``vars(module)``.
    """
    for candidate in vars(module).values():
        if not inspect.isclass(candidate):
            continue
        if candidate.__module__ != module.__name__:
            continue
        if issubclass(candidate, BaseParser):
            yield candidate
|
||||
|
||||
def load_parser(file_name):
    """Instantiate a parser class found in the Python file *file_name*.

    The file is imported with ``_import_file`` and scanned with
    ``iter_parser``. When several matching classes are found, the last one
    yielded is instantiated with no arguments.

    Returns:
        A new parser instance, or ``None`` when the file defines no
        matching class.
    """
    loaded_module = _import_file(file_name)
    candidates = list(iter_parser(loaded_module))
    if not candidates:
        return None
    parser_class = candidates[-1]
    return parser_class()
|
||||
|
Loading…
Reference in New Issue
Block a user