From 3dc4aa62908c52f92b52f3a2bb3679c1257906db Mon Sep 17 00:00:00 2001 From: Daniel Hladek Date: Sun, 9 Apr 2023 09:13:15 +0200 Subject: [PATCH] Move click CLI out of mongocrawler.py into new cli.py; add Redis and sucker ENV defaults to Dockerfile --- mongo/Dockerfile | 9 +++++++++ mongo/cli.py | 33 +++++++++++++++++++++++++++++++++ mongo/mongocrawler.py | 14 -------------- 3 files changed, 42 insertions(+), 14 deletions(-) create mode 100644 mongo/cli.py diff --git a/mongo/Dockerfile b/mongo/Dockerfile index 1a2ab22..8997a47 100644 --- a/mongo/Dockerfile +++ b/mongo/Dockerfile @@ -4,4 +4,13 @@ COPY requirements.txt /app RUN pip install -r /app/requirements.txt COPY *.py /app WORKDIR /app +# redis config +ENV REDIS_URL="redis://localhost:6379/" +ENV QUEUES="high,default,low" +# sucker config +ENV SUCKER_LANGUAGE="sk" +ENV SUCKER_DOMAIN="sk" +ENV SUCKER_BATCHSIZE="10" +ENV SUCKER_CONNECTION="mongodb://root:example@localhost:27017/" +ENV SUCKER_DBNAME="crawler" ENTRYPOINT ["rq", "worker"] diff --git a/mongo/cli.py b/mongo/cli.py new file mode 100644 index 0000000..bb508d4 --- /dev/null +++ b/mongo/cli.py @@ -0,0 +1,33 @@ +import click +import mongocrawler + +@click.group() +def cli(): + pass + +@cli.command() +def createdb(): + mongocrawler.createdb() + +@cli.command() +@click.argument("link") +def parseurl(link): + mongocrawler.parseurl(link) + +@cli.command() +@click.argument("link") +def externaldomains(link): + mongocrawler.externaldomains(link) + +@cli.command() +@click.argument("start_link") +def classify(start_link): + mongocrawler.classify(start_link) + +@cli.command() +@click.argument("start_link") +def visit(start_link): + mongocrawler.visit(start_link) + +if __name__ == "__main__": + cli() diff --git a/mongo/mongocrawler.py b/mongo/mongocrawler.py index b67e37a..a95cfbf 100644 --- a/mongo/mongocrawler.py +++ b/mongo/mongocrawler.py @@ -591,11 +591,7 @@ def domain_summary(db,hostname): for item in res: print(item) -@click.group() -def cli(): - pass -@cli.command() def createdb(): myclient = pymongo.MongoClient(CONNECTION) db=myclient[DBNAME] @@ -614,8 
+610,6 @@ def createdb(): domaincol.create_index("host",unique=True) domaincol.create_index("average_fetch_characters",unique=True) -@cli.command() -@click.argument("link") def parseurl(link): link,hostname = courlan.check_url(link) rawrules = trafilatura.fetch_url("https://"+ hostname + "/robots.txt") @@ -631,8 +625,6 @@ def parseurl(link): import pprint pprint.pprint(doc) -@cli.command() -@click.argument("link") def externaldomains(link): html = trafilatura.fetch_url(link,decode=True) external_links = courlan.extract_links(html,link,external_bool=True,language=LANGUAGE) @@ -646,8 +638,6 @@ def externaldomains(link): for d in domains: print(d) -@cli.command() -@click.argument("start_link") def classify(start_link): myclient = pymongo.MongoClient(CONNECTION) db=myclient[DBNAME] @@ -660,8 +650,6 @@ def classify(start_link): cl.train(trainset) cl.test(testset) -@cli.command() -@click.argument("start_link") def visit(start_link): myclient = pymongo.MongoClient(CONNECTION) db=myclient[DBNAME] @@ -686,5 +674,3 @@ def visit(start_link): index_pages(db,hostname,extracted_pages) link_summary(db,hostname) -if __name__ == "__main__": - cli()