zz
This commit is contained in:
		
							parent
							
								
									ed1d4701b8
								
							
						
					
					
						commit
						5d45569651
					
				
							
								
								
									
										17
									
								
								mongo/cli.py
									
									
									
									
									
								
							
							
						
						
									
										17
									
								
								mongo/cli.py
									
									
									
									
									
								
							| @ -2,6 +2,7 @@ import click | ||||
| import mongocrawler | ||||
| import rq | ||||
| import redis | ||||
| import json | ||||
| import sys | ||||
| import os | ||||
| import pymongo | ||||
| @ -69,7 +70,8 @@ def sample(domain): | ||||
|     myclient = pymongo.MongoClient(CONNECTION) | ||||
|     db=myclient[DBNAME] | ||||
|     links = mongocrawler.sample_links(db,domain,"frontlink",BATCH_SIZE) | ||||
|     print(links) | ||||
|     for link in links: | ||||
|         print(link) | ||||
| 
 | ||||
| @cli.command() | ||||
| @click.argument("start_link") | ||||
| @ -86,17 +88,18 @@ def fetchlinks(start_link): | ||||
| 
 | ||||
| 
 | ||||
| @cli.command() | ||||
| @click.argument(hostname) | ||||
| def process_links(): | ||||
| @click.argument("hostname") | ||||
| def processlinks(hostname): | ||||
|     rules = mongocrawler.fetch_robot(hostname) | ||||
|     outfile = "data.jsonl" | ||||
|     links = [] | ||||
|     for line in sys.stdin: | ||||
|         links.append(line.rstrip()) | ||||
|     extracted_pages, extracted_links = fetch_and_extract(links,rules) | ||||
|     for page in extracted_pages: | ||||
|         print(page) | ||||
|     pass | ||||
|     extracted_pages, extracted_links = mongocrawler.fetch_and_extract(links,rules) | ||||
|     with open(outfile,"w") as of: | ||||
|         for page in extracted_pages: | ||||
|             doc = json.dumps(page) | ||||
|             print(page,file=of) | ||||
| 
 | ||||
| 
 | ||||
| @cli.command(help="Enqueue a list of links into redis queue for crawling") | ||||
|  | ||||
| @ -530,7 +530,6 @@ def link_summary(db,hostname): | ||||
|     print(res) | ||||
| 
 | ||||
| def sample_links(db,hostname,status,batch_size): | ||||
|     print("Sampling links") | ||||
|     linkcol = db["links"] | ||||
|     res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink"]}}}) | ||||
|     cl = LinkClassifier() | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user