commit 3dc4aa6290 (parent 9d06223012): zz
@@ -4,4 +4,13 @@ COPY requirements.txt /app
 RUN pip install -r /app/requirements.txt
 COPY *.py /app
 WORKDIR /app
+# redis config
+ENV REDIS_URL="redis://localhost:6379/"
+ENV QUEUES="high,default,low"
+# sucker config
+ENV SUCKER_LANGUAGE="sk"
+ENV SUCKER_DOMAIN="sk"
+ENV SUCKER_BATCHSIZE="10"
+ENV SUCKER_CONNECTION="mongodb://root:example@localhost:27017/"
+ENV SUCKER_DBNAME="crawler"
 ENTRYPOINT ["rq", "worker"]
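The new SUCKER_* settings are plain environment variables; the commit does not show how the crawler reads them, but they presumably map onto the module-level constants (CONNECTION, DBNAME, LANGUAGE, and so on) used in the hunks further down. A minimal sketch of that mapping, assuming os.environ lookups with the defaults from this Dockerfile:

# Sketch only (assumption): how mongocrawler might load its configuration
# from the SUCKER_* environment variables defined above.
import os

CONNECTION = os.environ.get("SUCKER_CONNECTION", "mongodb://localhost:27017/")
DBNAME = os.environ.get("SUCKER_DBNAME", "crawler")
LANGUAGE = os.environ.get("SUCKER_LANGUAGE", "sk")
DOMAIN = os.environ.get("SUCKER_DOMAIN", "sk")
BATCHSIZE = int(os.environ.get("SUCKER_BATCHSIZE", "10"))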
mongo/cli.py (new file, 33 lines)
@@ -0,0 +1,33 @@
+import click
+import mongocrawler
+
+@click.group()
+def cli():
+    pass
+
+@cli.command()
+def createdb():
+    mongocrawler.createdb()
+
+@cli.command()
+@click.argument("link")
+def parseurl(link):
+    mongocrawler.parseurl(link)
+
+@cli.command()
+@click.argument("link")
+def externaldomains(link):
+    mongocrawler.externaldomains(link)
+
+@cli.command()
+@click.argument("start_link")
+def classify(start_link):
+    mongocrawler.classify(start_link)
+
+@cli.command()
+@click.argument("start_link")
+def visit(start_link):
+    mongocrawler.visit(start_link)
+
+if __name__ == "__main__":
+    cli()
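Note: the hunks that follow strip the @click decorators, the cli group, and the __main__ entry point out of the crawler module itself; the command-line interface now lives in mongo/cli.py above, while the underlying functions (createdb, parseurl, externaldomains, classify, visit) stay in place so the new CLI can call them. Assuming the image's WORKDIR /app and the COPY *.py /app step, the commands would presumably be invoked as python cli.py createdb, python cli.py visit <start_link>, and so on.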
@@ -591,11 +591,7 @@ def domain_summary(db,hostname):
     for item in res:
         print(item)
 
-@click.group()
-def cli():
-    pass
 
-@cli.command()
 def createdb():
     myclient = pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
@@ -614,8 +610,6 @@ def createdb():
     domaincol.create_index("host",unique=True)
     domaincol.create_index("average_fetch_characters",unique=True)
 
-@cli.command()
-@click.argument("link")
 def parseurl(link):
     link,hostname = courlan.check_url(link)
     rawrules = trafilatura.fetch_url("https://"+ hostname + "/robots.txt")
@@ -631,8 +625,6 @@ def parseurl(link):
     import pprint
     pprint.pprint(doc)
 
-@cli.command()
-@click.argument("link")
 def externaldomains(link):
     html = trafilatura.fetch_url(link,decode=True)
     external_links = courlan.extract_links(html,link,external_bool=True,language=LANGUAGE)
@@ -646,8 +638,6 @@ def externaldomains(link):
     for d in domains:
         print(d)
 
-@cli.command()
-@click.argument("start_link")
 def classify(start_link):
     myclient = pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
@@ -660,8 +650,6 @@ def classify(start_link):
     cl.train(trainset)
     cl.test(testset)
 
-@cli.command()
-@click.argument("start_link")
 def visit(start_link):
     myclient = pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
@@ -686,5 +674,3 @@ def visit(start_link):
     index_pages(db,hostname,extracted_pages)
     link_summary(db,hostname)
 
-if __name__ == "__main__":
-    cli()
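The Dockerfile's ENTRYPOINT starts an rq worker, so the functions left in the crawler module are evidently meant to run as background jobs as well as through the new CLI. The commit does not show the producer side; what follows is only a sketch of how a crawl job might be enqueued for such a worker, assuming the REDIS_URL value from the Dockerfile and the standard rq and redis-py APIs.

# Sketch only (assumption): enqueueing a crawl job for the rq worker started
# by the Dockerfile ENTRYPOINT. "default" is one of the names listed in QUEUES.
import os

from redis import Redis
from rq import Queue

import mongocrawler

redis_conn = Redis.from_url(os.environ.get("REDIS_URL", "redis://localhost:6379/"))
queue = Queue("default", connection=redis_conn)

# The worker imports mongocrawler and calls visit(start_link) with this argument.
job = queue.enqueue(mongocrawler.visit, "https://example.org/")
print(job.id)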