zz
This commit is contained in:
		
							parent
							
								
									ed1d4701b8
								
							
						
					
					
						commit
						5d45569651
					
				
							
								
								
									
										15
									
								
								mongo/cli.py
									
									
									
									
									
								
							
							
						
						
									
										15
									
								
								mongo/cli.py
									
									
									
									
									
								
							@ -2,6 +2,7 @@ import click
 | 
			
		||||
import mongocrawler
 | 
			
		||||
import rq
 | 
			
		||||
import redis
 | 
			
		||||
import json
 | 
			
		||||
import sys
 | 
			
		||||
import os
 | 
			
		||||
import pymongo
 | 
			
		||||
@ -69,7 +70,8 @@ def sample(domain):
 | 
			
		||||
    myclient = pymongo.MongoClient(CONNECTION)
 | 
			
		||||
    db=myclient[DBNAME]
 | 
			
		||||
    links = mongocrawler.sample_links(db,domain,"frontlink",BATCH_SIZE)
 | 
			
		||||
    print(links)
 | 
			
		||||
    for link in links:
 | 
			
		||||
        print(link)
 | 
			
		||||
 | 
			
		||||
@cli.command()
 | 
			
		||||
@click.argument("start_link")
 | 
			
		||||
@ -86,17 +88,18 @@ def fetchlinks(start_link):
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@cli.command()
 | 
			
		||||
@click.argument(hostname)
 | 
			
		||||
def process_links():
 | 
			
		||||
@click.argument("hostname")
 | 
			
		||||
def processlinks(hostname):
 | 
			
		||||
    rules = mongocrawler.fetch_robot(hostname)
 | 
			
		||||
    outfile = "data.jsonl"
 | 
			
		||||
    links = []
 | 
			
		||||
    for line in sys.stdin:
 | 
			
		||||
        links.append(line.rstrip())
 | 
			
		||||
    extracted_pages, extracted_links = fetch_and_extract(links,rules)
 | 
			
		||||
    extracted_pages, extracted_links = mongocrawler.fetch_and_extract(links,rules)
 | 
			
		||||
    with open(outfile,"w") as of:
 | 
			
		||||
        for page in extracted_pages:
 | 
			
		||||
        print(page)
 | 
			
		||||
    pass
 | 
			
		||||
            doc = json.dumps(page)
 | 
			
		||||
            print(page,file=of)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@cli.command(help="Enqueue a list of links into redis queue for crawling")
 | 
			
		||||
 | 
			
		||||
@ -530,7 +530,6 @@ def link_summary(db,hostname):
 | 
			
		||||
    print(res)
 | 
			
		||||
 | 
			
		||||
def sample_links(db,hostname,status,batch_size):
 | 
			
		||||
    print("Sampling links")
 | 
			
		||||
    linkcol = db["links"]
 | 
			
		||||
    res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink"]}}})
 | 
			
		||||
    cl = LinkClassifier()
 | 
			
		||||
 | 
			
		||||
		Loading…
	
		Reference in New Issue
	
	Block a user