zz
This commit is contained in:
parent
b838a9bbd6
commit
c4733776e5
@ -47,6 +47,14 @@ def visit(hostname,filter_content=True):
|
|||||||
""" Hostname to crawl """
|
""" Hostname to crawl """
|
||||||
mongocrawler.visit(hostname,filter_content=filter_content)
|
mongocrawler.visit(hostname,filter_content=filter_content)
|
||||||
|
|
||||||
|
@cli.command()
|
||||||
|
@click.argument("hostname")
|
||||||
|
def linksummary(hostname):
|
||||||
|
myclient = pymongo.MongoClient(CONNECTION)
|
||||||
|
db=myclient[DBNAME]
|
||||||
|
mongocrawler.link_summary(db,hostname)
|
||||||
|
|
||||||
|
|
||||||
@cli.command()
|
@cli.command()
|
||||||
def summary():
|
def summary():
|
||||||
mongocrawler.crawl_summary()
|
mongocrawler.crawl_summary()
|
||||||
|
@ -671,8 +671,22 @@ def index_pages(db,hostname,extracted_pages,filter_content):
|
|||||||
final_states.append(status)
|
final_states.append(status)
|
||||||
docs.append(doc)
|
docs.append(doc)
|
||||||
save_batch_info(db,hostname,final_states,docs)
|
save_batch_info(db,hostname,final_states,docs)
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_and_extract(links,rules):
|
||||||
|
print("Processing links")
|
||||||
|
responses = []
|
||||||
|
for link in links:
|
||||||
|
responses.append(fetch_page(link))
|
||||||
|
extracted_pages = []
|
||||||
|
for original_link,(final_link,html) in zip(links,responses):
|
||||||
|
doc = None
|
||||||
|
assert original_link is not None
|
||||||
|
doc = extract_page(final_link,html)
|
||||||
|
extracted_pages.append((original_link,html,doc))
|
||||||
|
extracted_links = extract_links(links,responses,rules,"frontlink")
|
||||||
|
return extracted_pages, extracted_links
|
||||||
|
|
||||||
|
|
||||||
def visit(hostname,filter_content=True):
|
def visit(hostname,filter_content=True):
|
||||||
myclient = pymongo.MongoClient(CONNECTION)
|
myclient = pymongo.MongoClient(CONNECTION)
|
||||||
db=myclient[DBNAME]
|
db=myclient[DBNAME]
|
||||||
@ -700,9 +714,9 @@ def visit(hostname,filter_content=True):
|
|||||||
assert original_link is not None
|
assert original_link is not None
|
||||||
doc = extract_page(final_link,html)
|
doc = extract_page(final_link,html)
|
||||||
extracted_pages.append((original_link,html,doc))
|
extracted_pages.append((original_link,html,doc))
|
||||||
|
extracted_links = extract_links(links,responses,rules,"frontlink")
|
||||||
|
|
||||||
index_pages(db,hostname,extracted_pages,filter_content)
|
index_pages(db,hostname,extracted_pages,filter_content)
|
||||||
extracted_links = extract_links(links,responses,rules,"frontlink")
|
|
||||||
index_links(db, extracted_links)
|
index_links(db, extracted_links)
|
||||||
link_summary(db,hostname)
|
link_summary(db,hostname)
|
||||||
|
|
||||||
|
5
mongo/start-docker-devstack.sh
Executable file
5
mongo/start-docker-devstack.sh
Executable file
@ -0,0 +1,5 @@
|
|||||||
|
#!/usr/bin/sh
|
||||||
|
docker pull redis
|
||||||
|
docker pull mongo
|
||||||
|
docker pull mongo-express
|
||||||
|
docker stack deploy -c ./docker-compose.yaml websucker
|
Loading…
Reference in New Issue
Block a user