commit 3dc4aa6290
parent 9d06223012
Daniel Hládek, 2023-04-09 09:13:15 +02:00
3 changed files with 42 additions and 14 deletions

Dockerfile

@@ -4,4 +4,13 @@ COPY requirements.txt /app
 RUN pip install -r /app/requirements.txt
 COPY *.py /app
 WORKDIR /app
+# redis config
+ENV REDIS_URL="redis://localhost:6379/"
+ENV QUEUES="high,default,low"
+# sucker config
+ENV SUCKER_LANGUAGE="sk"
+ENV SUCKER_DOMAIN="sk"
+ENV SUCKER_BATCHSIZE="10"
+ENV SUCKER_CONNECTION="mongodb://root:example@localhost:27017/"
+ENV SUCKER_DBNAME="crawler"
 ENTRYPOINT ["rq", "worker"]
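Note: the image's ENTRYPOINT starts an rq worker that reads REDIS_URL and listens on the QUEUES listed above. A minimal sketch of a producer feeding this worker, assuming the default env values are kept and that mongocrawler.visit is the job function; the start link is a placeholder:

import os

from redis import Redis
from rq import Queue

# Connect with the same default the image sets in REDIS_URL.
redis_conn = Redis.from_url(os.environ.get("REDIS_URL", "redis://localhost:6379/"))

# "default" is one of the QUEUES the worker listens on.
queue = Queue("default", connection=redis_conn)

# rq resolves dotted-path strings at the worker, so the producer does not
# need to import mongocrawler itself; the worker image has it installed.
job = queue.enqueue("mongocrawler.visit", "https://example.sk")
print(job.id)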

mongo/cli.py (new file)

@@ -0,0 +1,33 @@
+import click
+import mongocrawler
+
+@click.group()
+def cli():
+    pass
+
+@cli.command()
+def createdb():
+    mongocrawler.createdb()
+
+@cli.command()
+@click.argument("link")
+def parseurl(link):
+    mongocrawler.parseurl(link)
+
+@cli.command()
+@click.argument("link")
+def externaldomains(link):
+    mongocrawler.externaldomains(link)
+
+@cli.command()
+@click.argument("start_link")
+def classify(start_link):
+    mongocrawler.classify(start_link)
+
+@cli.command()
+@click.argument("start_link")
+def visit(start_link):
+    mongocrawler.visit(start_link)
+
+if __name__ == "__main__":
+    cli()
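Note: the group can also be driven programmatically, which is handy for a quick sanity check. A sketch using click's built-in test runner; it assumes you run it next to mongo/cli.py, and the URL is a placeholder:

from click.testing import CliRunner

from cli import cli  # assumes mongo/cli.py is importable as "cli"

runner = CliRunner()
result = runner.invoke(cli, ["parseurl", "https://example.sk"])
print(result.exit_code)
print(result.output)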

mongocrawler.py

@@ -591,11 +591,7 @@ def domain_summary(db,hostname):
     for item in res:
         print(item)
 
-@click.group()
-def cli():
-    pass
 
-@cli.command()
 def createdb():
     myclient = pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
@@ -614,8 +610,6 @@ def createdb():
     domaincol.create_index("host",unique=True)
     domaincol.create_index("average_fetch_characters",unique=True)
 
-@cli.command()
-@click.argument("link")
 def parseurl(link):
     link,hostname = courlan.check_url(link)
     rawrules = trafilatura.fetch_url("https://"+ hostname + "/robots.txt")
@@ -631,8 +625,6 @@ def parseurl(link):
     import pprint
     pprint.pprint(doc)
 
-@cli.command()
-@click.argument("link")
 def externaldomains(link):
     html = trafilatura.fetch_url(link,decode=True)
     external_links = courlan.extract_links(html,link,external_bool=True,language=LANGUAGE)
@@ -646,8 +638,6 @@ def externaldomains(link):
     for d in domains:
         print(d)
 
-@cli.command()
-@click.argument("start_link")
 def classify(start_link):
     myclient = pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
@@ -660,8 +650,6 @@ def classify(start_link):
     cl.train(trainset)
     cl.test(testset)
 
-@cli.command()
-@click.argument("start_link")
 def visit(start_link):
     myclient = pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
@@ -686,5 +674,3 @@ def visit(start_link):
     index_pages(db,hostname,extracted_pages)
     link_summary(db,hostname)
 
-if __name__ == "__main__":
-    cli()
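Note: moving the click wiring into mongo/cli.py leaves these functions as plain callables, which is likely the motivation for the split: @cli.command() rebinds a name to a click.Command object, which cannot be called like an ordinary function, so neither the new CLI module nor an rq worker could invoke it directly with plain arguments. After this change a direct call works again; a minimal sketch, with the start link as a placeholder:

import mongocrawler

mongocrawler.createdb()                   # create collections and indexes
mongocrawler.visit("https://example.sk")  # fetch, extract and index pages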