commit 34fc9f9124
parent 4a42078bef
Author: Daniel Hládek
Date: 2023-04-17 14:32:52 +02:00


@@ -25,22 +25,25 @@ from bs4 import BeautifulSoup
 import urllib.parse
 import os.path
 
-LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
-DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
-BATCHSIZE=int(os.getenv("SUCKER_BATCHSIZE","10"))
+# database options
 CONNECTION=os.getenv("SUCKER_CONNECTION","mongodb://root:example@localhost:27017/")
 DBNAME=os.getenv("SUCKER_DBNAME","crawler")
-MINFILESIZE=300
-MAXFILESIZE=10000000
-MINTEXTSIZE=200
-CHECK_PARAGRAPH_SIZE=150
-TEXT_TRASH_SIZE=200
-TEXT_TRASH_RATIO=0.6
-DISCOVER_LINK_RATIO = 0.3
-SAMPLE_SET_SIZE =10000
-CLASSIFIER_SET_SIZE = 200
-STOP_PATHS=["xml","rss","login","admin"]
+# retrieving filter
+BATCH_SIZE = int(os.getenv("SUCKER_BATCH_SIZE","10"))
+MIN_FILE_SIZE = int(os.getenv("SUCKER_MIN_FILE_SIZE","300"))
+MAX_FILE_SIZE = int(os.getenv("SUCKER_MAX_FILE_SIZE","10000000"))
+# document originality filter
+MIN_TEXT_SIZE = int(os.getenv("SUCKER_MIN_TEXT_SIZE","200"))
+CHECK_PARAGRAPH_SIZE = int(os.getenv("SUCKER_CHECK_PARAGRAPH_SIZE","150"))
+TEXT_TRASH_RATIO = float(os.getenv("SUCKER_TEXT_TRASH_RATIO","0.6"))
+# link and domain sampling
+DISCOVER_LINK_RATIO = float(os.getenv("SUCKER_DISCOVER_LINK_RATIO","0.3"))
+SAMPLE_SET_SIZE = int(os.getenv("SUCKER_SAMPLE_SET_SIZE","10000"))
+CLASSIFIER_SET_SIZE = int(os.getenv("SUCKER_CLASSIFIER_SET_SIZE","200"))
+# link filter
+LANGUAGE = os.getenv("SUCKER_LANGUAGE","sk")
+DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
+STOP_PATHS = os.getenv("SUCKER_STOP_PATHS","xml,rss,login,admin").split(",")
 
 def get_bs_links(link,html):
     # Extract links from the page
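
All tunables in this block are now read from environment variables, with the previously hard-coded values kept as defaults. A minimal sketch of overriding a few of them before the module is imported; the variable names come from the block above, the values here are only illustrative:

import os

# Illustrative overrides only; any variable left unset keeps its default.
os.environ["SUCKER_BATCH_SIZE"] = "50"
os.environ["SUCKER_MAX_FILE_SIZE"] = str(5 * 1024 * 1024)
os.environ["SUCKER_STOP_PATHS"] = "xml,rss,login,admin,logout"

# The module then resolves them with the same getenv pattern used above.
BATCH_SIZE = int(os.getenv("SUCKER_BATCH_SIZE", "10"))
MAX_FILE_SIZE = int(os.getenv("SUCKER_MAX_FILE_SIZE", "10000000"))
STOP_PATHS = os.getenv("SUCKER_STOP_PATHS", "xml,rss,login,admin").split(",")
print(BATCH_SIZE, MAX_FILE_SIZE, STOP_PATHS)
# -> 50 5242880 ['xml', 'rss', 'login', 'admin', 'logout']
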
@@ -166,11 +169,11 @@ def fetch_page(link:str)->(str,str):
     if response.status != 200:
         good = False
         LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
-    elif response.data is None or len(response.data) < MINFILESIZE:
+    elif response.data is None or len(response.data) < MIN_FILE_SIZE:
         LOGGER.error('too small/incorrect for URL %s', link)
         good = False
         # raise error instead?
-    elif len(response.data) > MAXFILESIZE:
+    elif len(response.data) > MAX_FILE_SIZE:
         good = False
         LOGGER.error('too large: length %s for URL %s', len(response.data), link)
     if good:
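
A self-contained sketch of the size gate above, assuming a urllib3-style response with `status` and `data`; the helper name is made up and the thresholds repeat the defaults from the configuration block, the logic mirrors the hunk:

MIN_FILE_SIZE = 300          # bytes, default from the configuration block
MAX_FILE_SIZE = 10_000_000

def fetch_is_good(status, data):
    """Hypothetical helper: replicate the status/size checks on a fetched body."""
    if status != 200:
        return False                      # not a 200 response
    if data is None or len(data) < MIN_FILE_SIZE:
        return False                      # too small or missing
    if len(data) > MAX_FILE_SIZE:
        return False                      # too large
    return True

assert fetch_is_good(200, b"x" * 1000)
assert not fetch_is_good(404, b"x" * 1000)
assert not fetch_is_good(200, b"tiny")
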
@@ -202,7 +205,7 @@ def extract_page(final_link,html):
     if html is not None:
         doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=False,target_language=LANGUAGE,favor_precision=True)
         if doc is not None:
-            if not "text" in doc or len(doc["text"]) < MINTEXTSIZE:
+            if not "text" in doc or len(doc["text"]) < MIN_TEXT_SIZE:
                 # text too small
                 doc = None
     return doc
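
extract_page keeps only documents whose extracted text reaches MIN_TEXT_SIZE. A hedged sketch of that gate as a standalone function; the trafilatura call and its arguments are taken from the hunk above, the wrapper name is made up, and the code assumes the dict return value the diff relies on:

import trafilatura

LANGUAGE = "sk"
MIN_TEXT_SIZE = 200

def extract_if_long_enough(final_link, html):
    """Hypothetical wrapper: extract with trafilatura, drop short texts."""
    if html is None:
        return None
    doc = trafilatura.bare_extraction(html, url=final_link, with_metadata=True,
                                      include_formatting=False,
                                      target_language=LANGUAGE,
                                      favor_precision=True)
    if doc is None or "text" not in doc or len(doc["text"]) < MIN_TEXT_SIZE:
        return None
    return doc
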
@@ -243,14 +246,13 @@ def index_page(db,original_link,final_link,html,doc):
     set_content_checksums(doc)
     tsz = doc["text_size"]
     psz = doc["paragraph_sizes_sum"]
-    if tsz < TEXT_TRASH_SIZE or psz/tsz < TEXT_TRASH_RATIO:
+    if tsz < MIN_TEXT_SIZE or psz/tsz < TEXT_TRASH_RATIO:
         state = "small"
     # check copy
     if state == "good":
         origsz = 0
         for chs,paragraph_size in zip(doc["paragraph_checksums"],doc["paragraph_sizes"]):
             # index paragraph checksums
-            print(checkcol)
             nd = checkcol.find_one({"_id":chs})
             if nd is None:
                 origsz += paragraph_size
@@ -258,7 +260,6 @@ def index_page(db,original_link,final_link,html,doc):
         if (1 - (origsz / tsz)) > TEXT_TRASH_RATIO:
             state = "copy"
-            print(origsz)
     if state == "good":
         htdoc = get_link_doc(link,state)
         htdoc["html"] = html
@@ -675,7 +676,7 @@ def classify(start_link):
 def visit(hostname):
     myclient = pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
-    batch_size = BATCHSIZE
+    batch_size = BATCH_SIZE
     rules = fetch_robot(hostname)
     start_link = "https://" + hostname
     # renew front links
@@ -730,7 +731,7 @@ def crawl_summary():
         {"$sort":{"original_text_size":-1}},
     ])
     print(">>>> Batches")
-    headers = ["_id","document_count","good_document_count","batch_count","text_size","original_text_size"]
+    headers = ["_id","document_count","good_document_count","batch_size","original_text_size"]
     print("\t".join(headers))
     for item in res:
         values = [str(item[x]) for x in headers]
@@ -761,7 +762,7 @@ def sample_domains():
     all_domains = []
     for domain in domains:
         all_domains.append(domain)
-    sample_size = min(int(DISCOVER_LINK_RATIO* BATCHSIZE), len(all_domains))
+    sample_size = min(int(DISCOVER_LINK_RATIO * BATCH_SIZE), len(all_domains))
     print(">>> Discover domains {}".format(sample_size))
     sample_domains = random.sample(all_domains,sample_size)
     domaincol = db["domains"]
@@ -770,7 +771,7 @@ def sample_domains():
     all_domains = []
     for item in res:
         all_domains.append(item["host"])
-    sample_size = min(int((1 - DISCOVER_LINK_RATIO) * BATCHSIZE),len(all_domains))
+    sample_size = min(int((1 - DISCOVER_LINK_RATIO) * BATCH_SIZE), len(all_domains))
     print(">>>> Best domains {}".format(sample_size))
     sample_domains += random.sample(all_domains,sample_size)
     for domain in sample_domains:
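
sample_domains fills a batch from two pools: roughly DISCOVER_LINK_RATIO of BATCH_SIZE comes from newly discovered domains and the remainder from the best-ranked ones. A minimal sketch of that split; the function and its arguments are illustrative, the size arithmetic matches the two hunks above:

import random

BATCH_SIZE = 10
DISCOVER_LINK_RATIO = 0.3

def pick_domain_batch(discovered, best):
    """Mix a batch: a discovery share plus the best-performing domains."""
    discover_n = min(int(DISCOVER_LINK_RATIO * BATCH_SIZE), len(discovered))
    best_n = min(int((1 - DISCOVER_LINK_RATIO) * BATCH_SIZE), len(best))
    return random.sample(discovered, discover_n) + random.sample(best, best_n)

print(pick_domain_batch([f"new{i}.sk" for i in range(20)],
                        [f"top{i}.sk" for i in range(20)]))
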