zz
This commit is contained in:
parent
9d06223012
commit
3dc4aa6290
@ -4,4 +4,13 @@ COPY requirements.txt /app
|
||||
RUN pip install -r /app/requirements.txt
|
||||
COPY *.py /app
|
||||
WORKDIR /app
|
||||
# redis config
|
||||
ENV REDIS_URL="redis://localhost:6379/"
|
||||
ENV QUEUES="high,default,low"
|
||||
# sucker config
|
||||
ENV SUCKER_LANGUAGE="sk"
|
||||
ENV SUCKER_DOMAIN="sk"
|
||||
ENV SUCKER_BATCHSIZE="10"
|
||||
ENV SUCKER_CONNECTION="mongodb://root:example@localhost:27017/"
|
||||
ENV SUCKER_DBNAME="crawler"
|
||||
ENTRYPOINT ["rq", "worker"]
|
||||
|
33
mongo/cli.py
Normal file
33
mongo/cli.py
Normal file
@ -0,0 +1,33 @@
|
||||
import click
|
||||
import mongocrawler
|
||||
|
||||
@click.group()
def cli():
    """Command-line interface for the mongo crawler.

    Click renders this docstring as the description shown by ``--help``;
    without it the root command has no help text at all.
    """
    # No shared setup is performed here; each subcommand delegates
    # directly to the corresponding mongocrawler function.
|
||||
|
||||
@cli.command()
def createdb():
    """Create the crawler database (collections and indexes).

    Thin wrapper: all work is done by ``mongocrawler.createdb()``.
    """
    mongocrawler.createdb()
|
||||
|
||||
@cli.command()
@click.argument("link")
def parseurl(link):
    """Fetch and parse a single page.

    LINK is the URL to fetch; delegates to ``mongocrawler.parseurl``.
    """
    mongocrawler.parseurl(link)
|
||||
|
||||
@cli.command()
@click.argument("link")
def externaldomains(link):
    """List the external domains linked from a page.

    LINK is the URL to inspect; delegates to
    ``mongocrawler.externaldomains``.
    """
    mongocrawler.externaldomains(link)
|
||||
|
||||
@cli.command()
@click.argument("start_link")
def classify(start_link):
    """Run the classifier for a site.

    START_LINK is the seed URL; delegates to ``mongocrawler.classify``.
    """
    mongocrawler.classify(start_link)
|
||||
|
||||
@cli.command()
@click.argument("start_link")
def visit(start_link):
    """Crawl a site starting from a seed URL.

    START_LINK is the entry-point URL; delegates to ``mongocrawler.visit``.
    """
    mongocrawler.visit(start_link)
|
||||
|
||||
# Script entry point: invoke the click group when run directly
# (e.g. `python cli.py visit <url>`).
if __name__ == "__main__":
    cli()
|
@ -591,11 +591,7 @@ def domain_summary(db,hostname):
|
||||
for item in res:
|
||||
print(item)
|
||||
|
||||
@click.group()
|
||||
def cli():
|
||||
pass
|
||||
|
||||
@cli.command()
|
||||
def createdb():
|
||||
myclient = pymongo.MongoClient(CONNECTION)
|
||||
db=myclient[DBNAME]
|
||||
@ -614,8 +610,6 @@ def createdb():
|
||||
domaincol.create_index("host",unique=True)
|
||||
domaincol.create_index("average_fetch_characters",unique=True)
|
||||
|
||||
@cli.command()
|
||||
@click.argument("link")
|
||||
def parseurl(link):
|
||||
link,hostname = courlan.check_url(link)
|
||||
rawrules = trafilatura.fetch_url("https://"+ hostname + "/robots.txt")
|
||||
@ -631,8 +625,6 @@ def parseurl(link):
|
||||
import pprint
|
||||
pprint.pprint(doc)
|
||||
|
||||
@cli.command()
|
||||
@click.argument("link")
|
||||
def externaldomains(link):
|
||||
html = trafilatura.fetch_url(link,decode=True)
|
||||
external_links = courlan.extract_links(html,link,external_bool=True,language=LANGUAGE)
|
||||
@ -646,8 +638,6 @@ def externaldomains(link):
|
||||
for d in domains:
|
||||
print(d)
|
||||
|
||||
@cli.command()
|
||||
@click.argument("start_link")
|
||||
def classify(start_link):
|
||||
myclient = pymongo.MongoClient(CONNECTION)
|
||||
db=myclient[DBNAME]
|
||||
@ -660,8 +650,6 @@ def classify(start_link):
|
||||
cl.train(trainset)
|
||||
cl.test(testset)
|
||||
|
||||
@cli.command()
|
||||
@click.argument("start_link")
|
||||
def visit(start_link):
|
||||
myclient = pymongo.MongoClient(CONNECTION)
|
||||
db=myclient[DBNAME]
|
||||
@ -686,5 +674,3 @@ def visit(start_link):
|
||||
index_pages(db,hostname,extracted_pages)
|
||||
link_summary(db,hostname)
|
||||
|
||||
if __name__ == "__main__":
|
||||
cli()
|
||||
|
Loading…
Reference in New Issue
Block a user