This commit is contained in:
Daniel Hládek 2024-03-21 12:58:42 +01:00
parent b838a9bbd6
commit c4733776e5
3 changed files with 29 additions and 2 deletions

View File

@ -47,6 +47,14 @@ def visit(hostname,filter_content=True):
""" Hostname to crawl """
mongocrawler.visit(hostname,filter_content=filter_content)
@cli.command()
@click.argument("hostname")
def linksummary(hostname):
    """ Hostname to run the link summary for """
    # Use the client as a context manager so the connection is closed
    # once the summary has run instead of being left open until exit.
    with pymongo.MongoClient(CONNECTION) as myclient:
        db = myclient[DBNAME]
        mongocrawler.link_summary(db, hostname)
@cli.command()
def summary():
mongocrawler.crawl_summary()

View File

@ -671,8 +671,22 @@ def index_pages(db,hostname,extracted_pages,filter_content):
final_states.append(status)
docs.append(doc)
save_batch_info(db,hostname,final_states,docs)
def fetch_and_extract(links, rules):
    """Fetch all links, extract a document from each response, collect links.

    Every entry of ``links`` is passed to ``fetch_page``; each response is
    unpacked as a ``(final_link, html)`` pair and handed to ``extract_page``.

    Returns a tuple ``(extracted_pages, extracted_links)`` where
    ``extracted_pages`` is a list of ``(original_link, html, doc)`` triples
    and ``extracted_links`` is the result of ``extract_links`` called with
    the original links, the raw responses, ``rules`` and ``"frontlink"``.
    """
    print("Processing links")
    # All pages are fetched before any extraction starts.
    fetched = [fetch_page(url) for url in links]
    pages = []
    for requested_url, (resolved_url, body) in zip(links, fetched):
        assert requested_url is not None
        pages.append((requested_url, body, extract_page(resolved_url, body)))
    found_links = extract_links(links, fetched, rules, "frontlink")
    return pages, found_links
def visit(hostname,filter_content=True):
myclient = pymongo.MongoClient(CONNECTION)
db=myclient[DBNAME]
@ -700,9 +714,9 @@ def visit(hostname,filter_content=True):
assert original_link is not None
doc = extract_page(final_link,html)
extracted_pages.append((original_link,html,doc))
extracted_links = extract_links(links,responses,rules,"frontlink")
index_pages(db,hostname,extracted_pages,filter_content)
extracted_links = extract_links(links,responses,rules,"frontlink")
index_links(db, extracted_links)
link_summary(db,hostname)

5
mongo/start-docker-devstack.sh Executable file
View File

@ -0,0 +1,5 @@
#!/bin/sh
# Pull the service images and deploy the "websucker" Docker stack.
# Must be run from the directory that contains docker-compose.yaml,
# because the compose file is referenced by a relative path.

# Refresh the images (presumably the ones referenced by the compose file).
docker pull redis
docker pull mongo
docker pull mongo-express

# Deploy (or update) the stack; the script's exit status is that of this command.
docker stack deploy -c ./docker-compose.yaml websucker