zz
This commit is contained in:
parent
b838a9bbd6
commit
c4733776e5
@ -47,6 +47,14 @@ def visit(hostname,filter_content=True):
|
||||
""" Hostname to crawl """
|
||||
mongocrawler.visit(hostname,filter_content=filter_content)
|
||||
|
||||
@cli.command()
@click.argument("hostname")
def linksummary(hostname):
    """ Run mongocrawler.link_summary for HOSTNAME """
    # MongoClient is a context manager: leaving the block closes the
    # connection instead of leaking it until interpreter exit.
    with pymongo.MongoClient(CONNECTION) as myclient:
        db = myclient[DBNAME]
        mongocrawler.link_summary(db, hostname)
|
||||
|
||||
|
||||
@cli.command()
def summary():
    """ Run mongocrawler.crawl_summary """
    # The docstring above doubles as the help text click shows for this command.
    mongocrawler.crawl_summary()
|
||||
|
@ -671,8 +671,22 @@ def index_pages(db,hostname,extracted_pages,filter_content):
|
||||
final_states.append(status)
|
||||
docs.append(doc)
|
||||
save_batch_info(db,hostname,final_states,docs)
|
||||
|
||||
|
||||
def fetch_and_extract(links,rules):
    """Fetch every link, extract its page and collect the links found.

    Args:
        links: links to fetch. The collection is traversed more than once
            (fetching, pairing with responses, and again by extract_links),
            so it must be re-iterable, e.g. a list, not a one-shot iterator.
        rules: passed through unchanged to extract_links.

    Returns:
        A tuple ``(extracted_pages, extracted_links)``. ``extracted_pages``
        is a list of ``(original_link, html, doc)`` tuples, one per link;
        ``extracted_links`` is whatever extract_links returns.
    """
    print("Processing links")
    # Fetch everything first; each response is unpacked below as a
    # (final_link, html) pair.
    responses = [fetch_page(link) for link in links]
    extracted_pages = []
    for original_link, (final_link, html) in zip(links, responses):
        # Internal invariant: the input must not contain empty entries.
        assert original_link is not None
        # Extraction uses the final link, but the page is stored under the
        # link that was originally requested.
        doc = extract_page(final_link, html)
        extracted_pages.append((original_link, html, doc))
    extracted_links = extract_links(links, responses, rules, "frontlink")
    return extracted_pages, extracted_links
|
||||
|
||||
|
||||
def visit(hostname,filter_content=True):
|
||||
myclient = pymongo.MongoClient(CONNECTION)
|
||||
db=myclient[DBNAME]
|
||||
@ -700,9 +714,9 @@ def visit(hostname,filter_content=True):
|
||||
assert original_link is not None
|
||||
doc = extract_page(final_link,html)
|
||||
extracted_pages.append((original_link,html,doc))
|
||||
extracted_links = extract_links(links,responses,rules,"frontlink")
|
||||
|
||||
index_pages(db,hostname,extracted_pages,filter_content)
|
||||
extracted_links = extract_links(links,responses,rules,"frontlink")
|
||||
index_links(db, extracted_links)
|
||||
link_summary(db,hostname)
|
||||
|
||||
|
5
mongo/start-docker-devstack.sh
Executable file
5
mongo/start-docker-devstack.sh
Executable file
@ -0,0 +1,5 @@
|
||||
#!/bin/sh
# Pull the images used by the development stack, then deploy it as the
# "websucker" Docker stack from the compose file in the current directory.
#
# /bin/sh is used because /usr/bin/sh does not exist on systems without a
# merged /usr.

# Pulls are best-effort: a failed pull does not stop the deployment.
for image in redis mongo mongo-express; do
    docker pull "$image"
done

docker stack deploy -c ./docker-compose.yaml websucker
|
Loading…
Reference in New Issue
Block a user