From 34fc9f91243dfade18783fe5e690208682672761 Mon Sep 17 00:00:00 2001
From: Daniel Hladek
Date: Mon, 17 Apr 2023 14:32:52 +0200
Subject: [PATCH] zz

---
 mongo/mongocrawler.py | 49 ++++++++++++++++++++++---------------------
 1 file changed, 25 insertions(+), 24 deletions(-)

diff --git a/mongo/mongocrawler.py b/mongo/mongocrawler.py
index e05bee8..6677429 100644
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@@ -25,22 +25,25 @@ from bs4 import BeautifulSoup
 import urllib.parse
 import os.path
 
-LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
-DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
-BATCHSIZE=int(os.getenv("SUCKER_BATCHSIZE","10"))
+# database options
 CONNECTION=os.getenv("SUCKER_CONNECTION","mongodb://root:example@localhost:27017/")
 DBNAME=os.getenv("SUCKER_DBNAME","crawler")
-MINFILESIZE=300
-MAXFILESIZE=10000000
-MINTEXTSIZE=200
-CHECK_PARAGRAPH_SIZE=150
-TEXT_TRASH_SIZE=200
-TEXT_TRASH_RATIO=0.6
-DISCOVER_LINK_RATIO = 0.3
-SAMPLE_SET_SIZE =10000
-CLASSIFIER_SET_SIZE = 200
-STOP_PATHS=["xml","rss","login","admin"]
-
+# retrieving filter
+BATCH_SIZE = int(os.getenv("SUCKER_BATCH_SIZE","10"))
+MIN_FILE_SIZE=int(os.getenv("SUCKER_MIN_FILE_SIZE","300"))
+MAX_FILE_SIZE=int(os.getenv("SUCKER_MAX_FILE_SIZE","10000000"))
+# document originality filter
+MIN_TEXT_SIZE=int(os.getenv("SUCKER_MIN_TEXT_SIZE","200"))
+CHECK_PARAGRAPH_SIZE=int(os.getenv("SUCKER_CHECK_PARAGRAPH_SIZE","150"))
+TEXT_TRASH_RATIO=float(os.getenv("SUCKER_TEXT_TRASH_RATIO","0.6"))
+# link and domain sampling
+DISCOVER_LINK_RATIO = float(os.getenv("SUCKER_DISCOVER_LINK_RATIO","0.3"))
+SAMPLE_SET_SIZE = int(os.getenv("SUCKER_DISCOVER_LINK_RATIO","10000"))
+CLASSIFIER_SET_SIZE = int(os.getenv("SUCKER_DISCOVER_LINK_RATIO","200"))
+# link filter
+LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
+DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
+STOP_PATHS=os.getenv("SUCKER_STOP_PATHS","xml,rss,login,admin").split(",")
 
 def get_bs_links(link,html):
     # Extrakcia linkov zo stranky
@@ -166,11 +169,11 @@ def fetch_page(link:str)->(str,str):
         if response.status != 200:
             good = False
             LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
-        elif response.data is None or len(response.data) < MINFILESIZE:
+        elif response.data is None or len(response.data) < MIN_FILE_SIZE:
             LOGGER.error('too small/incorrect for URL %s', link)
             good = False
             # raise error instead?
-        elif len(response.data) > MAXFILESIZE:
+        elif len(response.data) > MAX_FILE_SIZE:
             good = False
             LOGGER.error('too large: length %s for URL %s', len(response.data), link)
         if good:
@@ -202,7 +205,7 @@ def extract_page(final_link,html):
     if html is not None:
         doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=False,target_language=LANGUAGE,favor_precision=True)
         if doc is not None:
-            if not "text" in doc or len(doc["text"]) < MINTEXTSIZE:
+            if not "text" in doc or len(doc["text"]) < MIN_TEXT_SIZE:
                 # text too small
                 doc = None
     return doc
@@ -243,14 +246,13 @@ def index_page(db,original_link,final_link,html,doc):
         set_content_checksums(doc)
         tsz = doc["text_size"]
         psz = doc["paragraph_sizes_sum"]
-        if tsz < TEXT_TRASH_SIZE or psz/tsz < TEXT_TRASH_RATIO:
+        if tsz < MIN_TEXT_SIZE or psz/tsz < TEXT_TRASH_RATIO:
             state = "small"
     # check copy
     if state == "good":
         origsz = 0
         for chs,paragraph_size in zip(doc["paragraph_checksums"],doc["paragraph_sizes"]):
             # index paragraph checksums
-            print(checkcol)
             nd = checkcol.find_one({"_id":chs})
             if nd is None:
                 origsz += paragraph_size
@@ -258,7 +260,6 @@
 
         if (1 - (origsz / tsz)) > TEXT_TRASH_RATIO:
             state = "copy"
-        print(origsz)
     if state == "good":
         htdoc = get_link_doc(link,state)
         htdoc["html"] = html
@@ -675,7 +676,7 @@ def classify(start_link):
 def visit(hostname):
     myclient = pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
-    batch_size = BATCHSIZE
+    batch_size = BATCH_SIZE
     rules = fetch_robot(hostname)
     start_link = "https://" + hostname
     # renew front links
@@ -730,7 +731,7 @@ def crawl_summary():
         {"$sort":{"original_text_size":-1}},
     ])
     print(">>>> Batches")
-    headers = ["_id","document_count","good_document_count","batch_count","text_size","original_text_size"]
+    headers = ["_id","document_count","good_document_count","batch_size","original_text_size"]
     print("\t".join(headers))
     for item in res:
         values = [str(item[x]) for x in headers]
@@ -761,7 +762,7 @@ def sample_domains():
     all_domains = []
     for domain in domains:
         all_domains.append(domain)
-    sample_size = min(int(DISCOVER_LINK_RATIO* BATCHSIZE), len(all_domains))
+    sample_size = min(int(DISCOVER_LINK_RATIO* BATCH_SIZE), len(all_domains))
     print(">>> Discover domains {}".format(sample_size))
     sample_domains = random.sample(all_domains,sample_size)
     domaincol = db["domains"]
@@ -770,7 +771,7 @@ def sample_domains():
     all_domains = []
     for item in res:
         all_domains.append(item["host"])
-    sample_size = min(int((1 - DISCOVER_LINK_RATIO) * BATCHSIZE),len(all_domains))
+    sample_size = min(int((1 - DISCOVER_LINK_RATIO) * BATCH_SIZE),len(all_domains))
     print(">>>> Best domains {}".format(sample_size))
     sample_domains += random.sample(all_domains,sample_size)
     for domain in sample_domains:
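
Note on the new configuration block: SAMPLE_SET_SIZE and CLASSIFIER_SET_SIZE both read the SUCKER_DISCOVER_LINK_RATIO environment variable, which looks like a copy-paste slip from the DISCOVER_LINK_RATIO line above it, so neither value can be overridden independently. A minimal sketch of what was presumably intended, assuming the hypothetical variable names SUCKER_SAMPLE_SET_SIZE and SUCKER_CLASSIFIER_SET_SIZE (neither name appears in this patch):

    import os

    # Hypothetical env var names; the patch as committed reuses SUCKER_DISCOVER_LINK_RATIO for both values.
    SAMPLE_SET_SIZE = int(os.getenv("SUCKER_SAMPLE_SET_SIZE","10000"))
    CLASSIFIER_SET_SIZE = int(os.getenv("SUCKER_CLASSIFIER_SET_SIZE","200"))

The defaults (10000 and 200) stay the same; only the lookup keys change, so each limit would get its own override, consistent with the other SUCKER_* options introduced in this commit.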