Merge branch 'master' of git.kemt.fei.tuke.sk:dano/websucker-pip

commit ed1d4701b8
Author: Daniel Hládek
Date:   2024-03-21 17:01:42 +01:00

4 changed files with 93 additions and 34 deletions

mongo/cli.py

@@ -4,9 +4,9 @@ import rq
 import redis
 import sys
 import os
-REDIS_URL= os.getenv("REDIS_URL","redis://localhost:6379/")
-QUEUES=os.getenv("QUEUES","high,default,low")
+import pymongo
+import courlan
+from config import *
 @click.group()
 @click.option("--dbname",default=mongocrawler.DBNAME,help="database to use")
@@ -47,6 +47,14 @@ def visit(hostname,filter_content=True):
     """ Hostname to crawl """
     mongocrawler.visit(hostname,filter_content=filter_content)
+
+@cli.command()
+@click.argument("hostname")
+def linksummary(hostname):
+    myclient = pymongo.MongoClient(CONNECTION)
+    db=myclient[DBNAME]
+    mongocrawler.link_summary(db,hostname)
+
 @cli.command()
 def summary():
     mongocrawler.crawl_summary()
@@ -55,11 +63,46 @@ def summary():
 def sampledomains():
     mongocrawler.sample_domains()
+
+@cli.command()
+@click.argument("domain")
+def sample(domain):
+    myclient = pymongo.MongoClient(CONNECTION)
+    db=myclient[DBNAME]
+    links = mongocrawler.sample_links(db,domain,"frontlink",BATCH_SIZE)
+    print(links)
+
+@cli.command()
+@click.argument("start_link")
+def fetchlinks(start_link):
+    myclient = pymongo.MongoClient(CONNECTION)
+    db=myclient[DBNAME]
+    start_link,hostname = courlan.check_url(start_link)
+    rules = mongocrawler.fetch_robot(hostname)
+    links = mongocrawler.fetch_front_links(start_link,rules)
+    for link in links:
+        print(link[0])
+    #print(front_links)
+    mongocrawler.index_links(db,links)
+
+@cli.command()
+@click.argument("hostname")
+def process_links(hostname):
+    rules = mongocrawler.fetch_robot(hostname)
+    outfile = "data.jsonl"
+    links = []
+    for line in sys.stdin:
+        links.append(line.rstrip())
+    extracted_pages, extracted_links = mongocrawler.fetch_and_extract(links,rules)
+    for page in extracted_pages:
+        print(page)
+
 @cli.command(help="Enqueue a list of links into redis queue for crawling")
 def enqueue():
     # TODO: select queues
-    q = rq.Queue(connection=redis.from_url(REDIS_URL))
+    q = rq.Queue(connection=redis.from_url(CONNECTION))
     for l in sys.stdin:
         print(l.strip())
         r = q.enqueue(mongocrawler.visit, l.strip())
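
The new cli.py commands give direct shell access to the crawler's MongoDB data and link discovery. As a rough sketch, assuming cli.py exposes the Click group when run directly (the entry point is not part of this diff), they might be invoked like this:

# hypothetical invocations; hostnames and URLs are placeholders
python cli.py linksummary example.org            # summarise links stored for a host
python cli.py sample example.org                 # sample up to BATCH_SIZE frontlinks for a domain
python cli.py fetchlinks https://example.org/    # check robots.txt and index discovered front links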

mongo/config.py (new file)

@@ -0,0 +1,24 @@
+import os
+
+# database options
+CONNECTION=os.getenv("SUCKER_CONNECTION","mongodb://root:example@localhost:27017/")
+QUEUES=os.getenv("QUEUES","high,default,low")
+DBNAME=os.getenv("SUCKER_DBNAME","crawler")
+# retrieving filter
+BATCH_SIZE = int(os.getenv("SUCKER_BATCH_SIZE","10"))
+MIN_FILE_SIZE=int(os.getenv("SUCKER_MIN_FILE_SIZE","300"))
+MAX_FILE_SIZE=int(os.getenv("SUCKER_MAX_FILE_SIZE","10000000"))
+# document originality filter
+MIN_TEXT_SIZE=int(os.getenv("SUCKER_MIN_TEXT_SIZE","200"))
+CHECK_PARAGRAPH_SIZE=int(os.getenv("SUCKER_CHECK_PARAGRAPH_SIZE","150"))
+TEXT_TRASH_RATIO=float(os.getenv("SUCKER_TEXT_TRASH_RATIO","0.6"))
+# link and domain sampling
+DISCOVER_LINK_RATIO = float(os.getenv("SUCKER_DISCOVER_LINK_RATIO","0.3"))
+SAMPLE_SET_SIZE = int(os.getenv("SUCKER_SAMPLE_SET_SIZE","10000"))
+CLASSIFIER_SET_SIZE = int(os.getenv("SUCKER_CLASSIFIER_SET_SIZE","200"))
+# link filter
+LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
+DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
+STOP_PATHS=os.getenv("SUCKER_STOP_PATHS","xml,rss,login,admin").split(",")
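
Each setting above is read from the environment with a fallback default, so a deployment can tune the database connection and filters without editing the code. A minimal sketch with placeholder values:

# placeholder overrides picked up by config.py at import time
export SUCKER_CONNECTION="mongodb://user:secret@mongo.example.internal:27017/"
export SUCKER_DBNAME="crawler_test"
export SUCKER_BATCH_SIZE="50"
python cli.py summary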

mongo/mongocrawler.py

@@ -27,25 +27,7 @@ import os.path
 import binascii
 import json

-# database options
-CONNECTION=os.getenv("SUCKER_CONNECTION","mongodb://root:example@localhost:27017/")
-DBNAME=os.getenv("SUCKER_DBNAME","crawler")
-# retrieving filter
-BATCH_SIZE = int(os.getenv("SUCKER_BATCH_SIZE","10"))
-MIN_FILE_SIZE=int(os.getenv("SUCKER_MIN_FILE_SIZE","300"))
-MAX_FILE_SIZE=int(os.getenv("SUCKER_MAX_FILE_SIZE","10000000"))
-# document originality filter
-MIN_TEXT_SIZE=int(os.getenv("SUCKER_MIN_TEXT_SIZE","200"))
-CHECK_PARAGRAPH_SIZE=int(os.getenv("SUCKER_CHECK_PARAGRAPH_SIZE","150"))
-TEXT_TRASH_RATIO=float(os.getenv("SUCKER_TEXT_TRASH_RATIO","0.6"))
-# link and domain sampling
-DISCOVER_LINK_RATIO = float(os.getenv("SUCKER_DISCOVER_LINK_RATIO","0.3"))
-SAMPLE_SET_SIZE = int(os.getenv("SUCKER_DISCOVER_LINK_RATIO","10000"))
-CLASSIFIER_SET_SIZE = int(os.getenv("SUCKER_DISCOVER_LINK_RATIO","200"))
-# link filter
-LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
-DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
-STOP_PATHS=os.getenv("SUCKER_STOP_PATHS","xml,rss,login,admin").split(",")
+from config import *

 def get_bs_links(link,html):
     # Extract links from the page
@@ -690,8 +672,22 @@ def index_pages(db,hostname,extracted_pages,filter_content):
         final_states.append(status)
         docs.append(doc)
     save_batch_info(db,hostname,final_states,docs)

+def fetch_and_extract(links,rules):
+    print("Processing links")
+    responses = []
+    for link in links:
+        responses.append(fetch_page(link))
+    extracted_pages = []
+    for original_link,(final_link,html) in zip(links,responses):
+        doc = None
+        assert original_link is not None
+        doc = extract_page(final_link,html)
+        extracted_pages.append((original_link,html,doc))
+    extracted_links = extract_links(links,responses,rules,"frontlink")
+    return extracted_pages, extracted_links
+
 def visit(hostname,filter_content=True):
     myclient = pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
@@ -710,15 +706,7 @@ def visit(hostname,filter_content=True):
     print(links)
     # index results
     print("Processing links")
-    responses = []
-    for link in links:
-        responses.append(fetch_page(link))
-    extracted_pages = []
-    for original_link,(final_link,html) in zip(links,responses):
-        doc = None
-        assert original_link is not None
-        doc = extract_page(final_link,html)
-        extracted_pages.append((original_link,html,doc))
+    extracted_pages, extracted_links = fetch_and_extract(links,rules)
     extracted_links = extract_links(links,responses,hostname,rules,"frontlink")
@@ -731,7 +719,6 @@ def visit(hostname,filter_content=True):
         docs.append(doc)
     save_batch_info(db,hostname,final_states,docs)
     index_pages(db,hostname,extracted_pages,filter_content)
-    extracted_links = extract_links(links,responses,rules,"frontlink")
     index_links(db, extracted_links)
     link_summary(db,hostname)
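
The extracted fetch_and_extract helper lets the same download-and-parse loop serve both visit() and the new process_links command. A hedged pipeline sketch, assuming Click's default command naming (process_links becomes process-links) and the same cli.py entry point as above:

# discover front links for a site, then fetch and extract the pages they point to
python cli.py fetchlinks https://example.org/ > frontlinks.txt
python cli.py process-links example.org < frontlinks.txt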

mongo/start-docker-devstack.sh (new executable file)

@@ -0,0 +1,5 @@
+#!/usr/bin/sh
+docker pull redis
+docker pull mongo
+docker pull mongo-express
+docker stack deploy -c ./docker-compose.yaml websucker
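
docker stack deploy only works against a swarm manager and reads the docker-compose.yaml referenced above (the compose file itself is not part of this diff), so a first run of the dev stack might look like this sketch:

# one-time swarm setup, then deploy and inspect the stack
docker swarm init
./start-docker-devstack.sh
docker stack services websucker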