Merge branch 'master' of git.kemt.fei.tuke.sk:dano/websucker-pip
commit ed1d4701b8

mongo/cli.py (51 lines changed)
@@ -4,9 +4,9 @@ import rq
 import redis
 import sys
 import os
-REDIS_URL= os.getenv("REDIS_URL","redis://localhost:6379/")
-QUEUES=os.getenv("QUEUES","high,default,low")
+import pymongo
+import courlan
+from config import *
 
 @click.group()
 @click.option("--dbname",default=mongocrawler.DBNAME,help="database to use")
@@ -47,6 +47,14 @@ def visit(hostname,filter_content=True):
     """ Hostname to crawl """
     mongocrawler.visit(hostname,filter_content=filter_content)
 
+@cli.command()
+@click.argument("hostname")
+def linksummary(hostname):
+    myclient = pymongo.MongoClient(CONNECTION)
+    db=myclient[DBNAME]
+    mongocrawler.link_summary(db,hostname)
+
+
 @cli.command()
 def summary():
     mongocrawler.crawl_summary()
@@ -55,11 +63,46 @@ def summary():
 def sampledomains():
     mongocrawler.sample_domains()
 
+@cli.command()
+@click.argument("domain")
+def sample(domain):
+    myclient = pymongo.MongoClient(CONNECTION)
+    db=myclient[DBNAME]
+    links = mongocrawler.sample_links(db,domain,"frontlink",BATCH_SIZE)
+    print(links)
+
+@cli.command()
+@click.argument("start_link")
+def fetchlinks(start_link):
+    myclient = pymongo.MongoClient(CONNECTION)
+    db=myclient[DBNAME]
+    start_link,hostname = courlan.check_url(start_link)
+    rules = mongocrawler.fetch_robot(hostname)
+    links = mongocrawler.fetch_front_links(start_link,rules)
+    for link in links:
+        print(link[0])
+    #print(front_links)
+    mongocrawler.index_links(db,links)
+
+
+@cli.command()
+@click.argument(hostname)
+def process_links():
+    rules = mongocrawler.fetch_robot(hostname)
+    outfile = "data.jsonl"
+    links = []
+    for line in sys.stdin:
+        links.append(line.rstrip())
+    extracted_pages, extracted_links = fetch_and_extract(links,rules)
+    for page in extracted_pages:
+        print(page)
+    pass
+
+
 @cli.command(help="Enqueue a list of links into redis queue for crawling")
 def enqueue():
     # TODO: select queues
-    q = rq.Queue(connection=redis.from_url(REDIS_URL))
+    q = rq.Queue(connection=redis.from_url(CONNECTION))
     for l in sys.stdin:
         print(l.strip())
         r = q.enqueue(mongocrawler.visit, l.strip())
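The new subcommands are meant to chain through stdin/stdout: fetchlinks prints discovered front links one per line, while enqueue and process_links read links from standard input. Below is a minimal sketch, not part of the commit, of that flow using click's test runner; it assumes the file above is importable as cli from the mongo/ directory and that the MongoDB and Redis services from config.py are reachable.

# Sketch only: exercises the new CLI commands in-process via click's CliRunner.
# Assumes `cli` (the @click.group defined above) is importable and the backing
# services from config.py are running, e.g. via the dev stack script below.
from click.testing import CliRunner
from cli import cli

runner = CliRunner()

# Discover and index front links for a start URL; they are printed one per line.
fetched = runner.invoke(cli, ["fetchlinks", "https://www.example.sk/"])
print(fetched.output)

# Pipe the same links back in on stdin to enqueue them for crawling workers.
enqueued = runner.invoke(cli, ["enqueue"], input=fetched.output)
print(enqueued.exit_code)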
mongo/config.py (24 lines added, new file)

@@ -0,0 +1,24 @@
+import os
+# database options
+CONNECTION=os.getenv("SUCKER_CONNECTION","mongodb://root:example@localhost:27017/")
+
+QUEUES=os.getenv("QUEUES","high,default,low")
+DBNAME=os.getenv("SUCKER_DBNAME","crawler")
+# retrieving filter
+BATCH_SIZE = int(os.getenv("SUCKER_BATCH_SIZE","10"))
+MIN_FILE_SIZE=int(os.getenv("SUCKER_MIN_FILE_SIZE","300"))
+MAX_FILE_SIZE=int(os.getenv("SUCKER_MAX_FILE_SIZE","10000000"))
+# document originality filter
+MIN_TEXT_SIZE=int(os.getenv("SUCKER_MIN_TEXT_SIZE","200"))
+CHECK_PARAGRAPH_SIZE=int(os.getenv("SUCKER_CHECK_PARAGRAPH_SIZE","150"))
+TEXT_TRASH_RATIO=float(os.getenv("SUCKER_TEXT_TRASH_RATIO","0.6"))
+# link and domain sampling
+DISCOVER_LINK_RATIO = float(os.getenv("SUCKER_DISCOVER_LINK_RATIO","0.3"))
+SAMPLE_SET_SIZE = int(os.getenv("SUCKER_DISCOVER_LINK_RATIO","10000"))
+CLASSIFIER_SET_SIZE = int(os.getenv("SUCKER_DISCOVER_LINK_RATIO","200"))
+# link filter
+LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
+DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
+STOP_PATHS=os.getenv("SUCKER_STOP_PATHS","xml,rss,login,admin").split(",")
+
+
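Every setting in the new config.py is resolved with os.getenv() once, at import time, with the literal as the fallback. A minimal sketch, not part of the commit, of overriding values; it assumes mongo/ is on sys.path so config is importable:

# Sketch only: settings are read from the environment when config.py is imported,
# so any override must be set before the first import of the module.
import os
os.environ["SUCKER_BATCH_SIZE"] = "50"
os.environ["SUCKER_LANGUAGE"] = "en"

from config import BATCH_SIZE, LANGUAGE  # assumes mongo/ is on sys.path

print(BATCH_SIZE, LANGUAGE)  # 50 en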
mongo/mongocrawler.py

@@ -27,25 +27,7 @@ import os.path
 import binascii
 import json
-# database options
-CONNECTION=os.getenv("SUCKER_CONNECTION","mongodb://root:example@localhost:27017/")
-DBNAME=os.getenv("SUCKER_DBNAME","crawler")
-# retrieving filter
-BATCH_SIZE = int(os.getenv("SUCKER_BATCH_SIZE","10"))
-MIN_FILE_SIZE=int(os.getenv("SUCKER_MIN_FILE_SIZE","300"))
-MAX_FILE_SIZE=int(os.getenv("SUCKER_MAX_FILE_SIZE","10000000"))
-# document originality filter
-MIN_TEXT_SIZE=int(os.getenv("SUCKER_MIN_TEXT_SIZE","200"))
-CHECK_PARAGRAPH_SIZE=int(os.getenv("SUCKER_CHECK_PARAGRAPH_SIZE","150"))
-TEXT_TRASH_RATIO=float(os.getenv("SUCKER_TEXT_TRASH_RATIO","0.6"))
-# link and domain sampling
-DISCOVER_LINK_RATIO = float(os.getenv("SUCKER_DISCOVER_LINK_RATIO","0.3"))
-SAMPLE_SET_SIZE = int(os.getenv("SUCKER_DISCOVER_LINK_RATIO","10000"))
-CLASSIFIER_SET_SIZE = int(os.getenv("SUCKER_DISCOVER_LINK_RATIO","200"))
-# link filter
-LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
-DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
-STOP_PATHS=os.getenv("SUCKER_STOP_PATHS","xml,rss,login,admin").split(",")
+
+from config import *
 
 def get_bs_links(link,html):
     # Extrakcia linkov zo stranky
@@ -691,6 +673,20 @@ def index_pages(db,hostname,extracted_pages,filter_content):
         docs.append(doc)
     save_batch_info(db,hostname,final_states,docs)
 
+def fetch_and_extract(links,rules):
+    print("Processing links")
+    responses = []
+    for link in links:
+        responses.append(fetch_page(link))
+    extracted_pages = []
+    for original_link,(final_link,html) in zip(links,responses):
+        doc = None
+        assert original_link is not None
+        doc = extract_page(final_link,html)
+        extracted_pages.append((original_link,html,doc))
+    extracted_links = extract_links(links,responses,rules,"frontlink")
+    return extracted_pages, extracted_links
+
     
 def visit(hostname,filter_content=True):
     myclient = pymongo.MongoClient(CONNECTION)
@@ -710,15 +706,7 @@ def visit(hostname,filter_content=True):
     print(links)
     # index results
     print("Processing links")
-    responses = []
-    for link in links:
-        responses.append(fetch_page(link))
-    extracted_pages = []
-    for original_link,(final_link,html) in zip(links,responses):
-        doc = None
-        assert original_link is not None
-        doc = extract_page(final_link,html)
-        extracted_pages.append((original_link,html,doc))
+    extracted_pages, extracted_links = fetch_and_extract(links,rules)
 
     extracted_links = extract_links(links,responses,hostname,rules,"frontlink")
 
@@ -731,7 +719,6 @@ def visit(hostname,filter_content=True):
         docs.append(doc)
     save_batch_info(db,hostname,final_states,docs)
     index_pages(db,hostname,extracted_pages,filter_content)
-    extracted_links = extract_links(links,responses,rules,"frontlink")
     index_links(db, extracted_links)
     link_summary(db,hostname)
 
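The refactoring pulls the fetch-and-parse loop out of visit() into fetch_and_extract(), so the crawl step and the new process_links command share one code path. A rough usage sketch, not part of the commit, with hypothetical host and URLs, run from the mongo/ directory:

# Sketch only: fetch_and_extract() takes a list of URLs plus parsed robots.txt
# rules and returns (original_link, html, doc) triples and the outgoing links
# it found, ready for index_pages()/index_links().
import pymongo
import mongocrawler
from config import CONNECTION, DBNAME

db = pymongo.MongoClient(CONNECTION)[DBNAME]
rules = mongocrawler.fetch_robot("www.example.sk")            # hypothetical host
links = ["https://www.example.sk/", "https://www.example.sk/about"]

pages, out_links = mongocrawler.fetch_and_extract(links, rules)
for original_link, html, doc in pages:
    print(original_link, "extracted" if doc else "skipped")
mongocrawler.index_links(db, out_links)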
							
								
								
									
mongo/start-docker-devstack.sh (5 lines added, new executable file)

@@ -0,0 +1,5 @@
+#!/usr/bin/sh
+docker pull redis
+docker pull mongo
+docker pull mongo-express
+docker stack deploy -c ./docker-compose.yaml websucker
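The script assumes Docker swarm mode and a docker-compose.yaml next to it that exposes MongoDB and Redis on the default local URLs used elsewhere in this commit. A hypothetical smoke test, not part of the commit, once the stack is up:

# Sketch only: checks that the services deployed by start-docker-devstack.sh
# answer on the default URLs assumed by config.py and the old REDIS_URL default.
import pymongo
import redis
from config import CONNECTION  # mongodb://root:example@localhost:27017/ by default

pymongo.MongoClient(CONNECTION, serverSelectionTimeoutMS=2000).admin.command("ping")
redis.from_url("redis://localhost:6379/").ping()
print("dev stack reachable")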