zz
This commit is contained in:
		
							parent
							
								
									b838a9bbd6
								
							
						
					
					
						commit
						c4733776e5
					
				@ -47,6 +47,14 @@ def visit(hostname,filter_content=True):
 | 
				
			|||||||
    """ Hostname to crawl """
 | 
					    """ Hostname to crawl """
 | 
				
			||||||
    mongocrawler.visit(hostname,filter_content=filter_content)
 | 
					    mongocrawler.visit(hostname,filter_content=filter_content)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@cli.command()
 | 
				
			||||||
 | 
					@click.argument("hostname")
 | 
				
			||||||
 | 
					def linksummary(hostname):
 | 
				
			||||||
 | 
					    myclient = pymongo.MongoClient(CONNECTION)
 | 
				
			||||||
 | 
					    db=myclient[DBNAME]
 | 
				
			||||||
 | 
					    mongocrawler.link_summary(db,hostname)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@cli.command()
 | 
					@cli.command()
 | 
				
			||||||
def summary():
 | 
					def summary():
 | 
				
			||||||
    mongocrawler.crawl_summary()
 | 
					    mongocrawler.crawl_summary()
 | 
				
			||||||
 | 
				
			|||||||
@ -671,8 +671,22 @@ def index_pages(db,hostname,extracted_pages,filter_content):
 | 
				
			|||||||
        final_states.append(status)
 | 
					        final_states.append(status)
 | 
				
			||||||
        docs.append(doc)
 | 
					        docs.append(doc)
 | 
				
			||||||
    save_batch_info(db,hostname,final_states,docs)
 | 
					    save_batch_info(db,hostname,final_states,docs)
 | 
				
			||||||
 
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def fetch_and_extract(links,rules):
 | 
				
			||||||
 | 
					    print("Processing links")
 | 
				
			||||||
 | 
					    responses = []
 | 
				
			||||||
 | 
					    for link in links:
 | 
				
			||||||
 | 
					        responses.append(fetch_page(link))
 | 
				
			||||||
 | 
					    extracted_pages = []
 | 
				
			||||||
 | 
					    for original_link,(final_link,html) in zip(links,responses):
 | 
				
			||||||
 | 
					        doc = None
 | 
				
			||||||
 | 
					        assert original_link is not None
 | 
				
			||||||
 | 
					        doc = extract_page(final_link,html)
 | 
				
			||||||
 | 
					        extracted_pages.append((original_link,html,doc))
 | 
				
			||||||
 | 
					    extracted_links = extract_links(links,responses,rules,"frontlink")
 | 
				
			||||||
 | 
					    return extracted_pages, extracted_links
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
def visit(hostname,filter_content=True):
 | 
					def visit(hostname,filter_content=True):
 | 
				
			||||||
    myclient = pymongo.MongoClient(CONNECTION)
 | 
					    myclient = pymongo.MongoClient(CONNECTION)
 | 
				
			||||||
    db=myclient[DBNAME]
 | 
					    db=myclient[DBNAME]
 | 
				
			||||||
@ -700,9 +714,9 @@ def visit(hostname,filter_content=True):
 | 
				
			|||||||
        assert original_link is not None
 | 
					        assert original_link is not None
 | 
				
			||||||
        doc = extract_page(final_link,html)
 | 
					        doc = extract_page(final_link,html)
 | 
				
			||||||
        extracted_pages.append((original_link,html,doc))
 | 
					        extracted_pages.append((original_link,html,doc))
 | 
				
			||||||
 | 
					    extracted_links = extract_links(links,responses,rules,"frontlink")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    index_pages(db,hostname,extracted_pages,filter_content)
 | 
					    index_pages(db,hostname,extracted_pages,filter_content)
 | 
				
			||||||
    extracted_links = extract_links(links,responses,rules,"frontlink")
 | 
					 | 
				
			||||||
    index_links(db, extracted_links)
 | 
					    index_links(db, extracted_links)
 | 
				
			||||||
    link_summary(db,hostname)
 | 
					    link_summary(db,hostname)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
							
								
								
									
										5
									
								
								mongo/start-docker-devstack.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										5
									
								
								mongo/start-docker-devstack.sh
									
									
									
									
									
										Executable file
									
								
							@ -0,0 +1,5 @@
 | 
				
			|||||||
 | 
					#!/usr/bin/sh
 | 
				
			||||||
 | 
					docker pull redis
 | 
				
			||||||
 | 
					docker pull mongo
 | 
				
			||||||
 | 
					docker pull mongo-express
 | 
				
			||||||
 | 
					docker stack deploy -c ./docker-compose.yaml websucker
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user