zz
This commit is contained in:
		
							parent
							
								
									b838a9bbd6
								
							
						
					
					
						commit
						c4733776e5
					
				@ -47,6 +47,14 @@ def visit(hostname,filter_content=True):
 | 
			
		||||
    """ Hostname to crawl """
 | 
			
		||||
    mongocrawler.visit(hostname,filter_content=filter_content)
 | 
			
		||||
 | 
			
		||||
@cli.command()
@click.argument("hostname")
def linksummary(hostname):
    """ Run the crawler link summary for HOSTNAME """
    # Use the client as a context manager so its connections are closed
    # when the command finishes, even if link_summary raises.
    with pymongo.MongoClient(CONNECTION) as myclient:
        db = myclient[DBNAME]
        mongocrawler.link_summary(db, hostname)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@cli.command()
 | 
			
		||||
def summary():
 | 
			
		||||
    mongocrawler.crawl_summary()
 | 
			
		||||
 | 
			
		||||
@ -672,6 +672,20 @@ def index_pages(db,hostname,extracted_pages,filter_content):
 | 
			
		||||
        docs.append(doc)
 | 
			
		||||
    save_batch_info(db,hostname,final_states,docs)
 | 
			
		||||
 | 
			
		||||
def fetch_and_extract(links,rules):
    """
    Fetch every link, extract a document from each response and
    collect the links found in the responses.

    links -- links to process; iterated more than once, so it must be
             re-iterable (presumably a list -- confirm against callers)
    rules -- passed through unchanged to extract_links

    Returns a tuple (extracted_pages, extracted_links).
    extracted_pages is a list of (original_link, html, doc) tuples in
    the order of links; extracted_links is whatever extract_links
    returns for the "frontlink" link type.
    """
    print("Processing links")
    # Every page is fetched before any extraction starts.
    responses = [fetch_page(link) for link in links]
    pages = []
    for source, (landed, body) in zip(links, responses):
        assert source is not None
        # The document is extracted from the final link, but the page
        # is recorded under the link it was requested with.
        pages.append((source, body, extract_page(landed, body)))
    return pages, extract_links(links, responses, rules, "frontlink")
 | 
			
		||||
 | 
			
		||||
    
 | 
			
		||||
def visit(hostname,filter_content=True):
 | 
			
		||||
    myclient = pymongo.MongoClient(CONNECTION)
 | 
			
		||||
@ -700,9 +714,9 @@ def visit(hostname,filter_content=True):
 | 
			
		||||
        assert original_link is not None
 | 
			
		||||
        doc = extract_page(final_link,html)
 | 
			
		||||
        extracted_pages.append((original_link,html,doc))
 | 
			
		||||
    extracted_links = extract_links(links,responses,rules,"frontlink")
 | 
			
		||||
 | 
			
		||||
    index_pages(db,hostname,extracted_pages,filter_content)
 | 
			
		||||
    extracted_links = extract_links(links,responses,rules,"frontlink")
 | 
			
		||||
    index_links(db, extracted_links)
 | 
			
		||||
    link_summary(db,hostname)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										5
									
								
								mongo/start-docker-devstack.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										5
									
								
								mongo/start-docker-devstack.sh
									
									
									
									
									
										Executable file
									
								
							@ -0,0 +1,5 @@
 | 
			
		||||
#!/bin/sh
# Start the websucker development stack.
#
# Pulls the redis, mongo and mongo-express images, then deploys the
# stack described by docker-compose.yaml under the name "websucker".
# The compose file is looked up in the current working directory, so
# run this script from the directory that contains it.
# NOTE(review): `docker stack deploy` needs a Docker engine in swarm
# mode -- confirm `docker swarm init` has been run on the host.
docker pull redis
docker pull mongo
docker pull mongo-express
docker stack deploy -c ./docker-compose.yaml websucker
 | 
			
		||||
		Loading…
	
		Reference in New Issue
	
	Block a user