From 34fc9f91243dfade18783fe5e690208682672761 Mon Sep 17 00:00:00 2001
From: Daniel Hladek
Date: Mon, 17 Apr 2023 14:32:52 +0200
Subject: [PATCH] zz

---
 mongo/mongocrawler.py | 49 ++++++++++++++++++++++---------------------
 1 file changed, 25 insertions(+), 24 deletions(-)

diff --git a/mongo/mongocrawler.py b/mongo/mongocrawler.py
index e05bee8..6677429 100644
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@@ -25,22 +25,25 @@ from bs4 import BeautifulSoup
 import urllib.parse
 import os.path
 
-LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
-DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
-BATCHSIZE=int(os.getenv("SUCKER_BATCHSIZE","10"))
+# database options
 CONNECTION=os.getenv("SUCKER_CONNECTION","mongodb://root:example@localhost:27017/")
 DBNAME=os.getenv("SUCKER_DBNAME","crawler")
-MINFILESIZE=300
-MAXFILESIZE=10000000
-MINTEXTSIZE=200
-CHECK_PARAGRAPH_SIZE=150
-TEXT_TRASH_SIZE=200
-TEXT_TRASH_RATIO=0.6
-DISCOVER_LINK_RATIO = 0.3
-SAMPLE_SET_SIZE =10000
-CLASSIFIER_SET_SIZE = 200
-STOP_PATHS=["xml","rss","login","admin"]
-
+# retrieving filter
+BATCH_SIZE = int(os.getenv("SUCKER_BATCH_SIZE","10"))
+MIN_FILE_SIZE=int(os.getenv("SUCKER_MIN_FILE_SIZE","300"))
+MAX_FILE_SIZE=int(os.getenv("SUCKER_MAX_FILE_SIZE","10000000"))
+# document originality filter
+MIN_TEXT_SIZE=int(os.getenv("SUCKER_MIN_TEXT_SIZE","200"))
+CHECK_PARAGRAPH_SIZE=int(os.getenv("SUCKER_CHECK_PARAGRAPH_SIZE","150"))
+TEXT_TRASH_RATIO=float(os.getenv("SUCKER_TEXT_TRASH_RATIO","0.6"))
+# link and domain sampling
+DISCOVER_LINK_RATIO = float(os.getenv("SUCKER_DISCOVER_LINK_RATIO","0.3"))
+SAMPLE_SET_SIZE = int(os.getenv("SUCKER_DISCOVER_LINK_RATIO","10000"))
+CLASSIFIER_SET_SIZE = int(os.getenv("SUCKER_DISCOVER_LINK_RATIO","200"))
+# link filter
+LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
+DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
+STOP_PATHS=os.getenv("SUCKER_STOP_PATHS","xml,rss,login,admin").split(",")
 
 def get_bs_links(link,html):
     # Extrakcia linkov zo stranky
@@ -166,11 +169,11 @@ def fetch_page(link:str)->(str,str):
         if response.status != 200:
             good = False
             LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
-        elif response.data is None or len(response.data) < MINFILESIZE:
+        elif response.data is None or len(response.data) < MIN_FILE_SIZE:
             LOGGER.error('too small/incorrect for URL %s', link)
             good = False
             # raise error instead?
-        elif len(response.data) > MAXFILESIZE:
+        elif len(response.data) > MAX_FILE_SIZE:
             good = False
             LOGGER.error('too large: length %s for URL %s', len(response.data), link)
         if good:
@@ -202,7 +205,7 @@ def extract_page(final_link,html):
     if html is not None:
         doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=False,target_language=LANGUAGE,favor_precision=True)
         if doc is not None:
-            if not "text" in doc or len(doc["text"]) < MINTEXTSIZE:
+            if not "text" in doc or len(doc["text"]) < MIN_TEXT_SIZE:
                 # text too small
                 doc = None
     return doc
@@ -243,14 +246,13 @@ def index_page(db,original_link,final_link,html,doc):
         set_content_checksums(doc)
         tsz = doc["text_size"]
         psz = doc["paragraph_sizes_sum"]
-        if tsz < TEXT_TRASH_SIZE or psz/tsz < TEXT_TRASH_RATIO:
+        if tsz < MIN_TEXT_SIZE or psz/tsz < TEXT_TRASH_RATIO:
             state = "small"
     # check copy
     if state == "good":
         origsz = 0
         for chs,paragraph_size in zip(doc["paragraph_checksums"],doc["paragraph_sizes"]):
             # index paragraph checksums
-            print(checkcol)
             nd = checkcol.find_one({"_id":chs})
             if nd is None:
                 origsz += paragraph_size
@@ -258,7 +260,6 @@
 
         if (1 - (origsz / tsz)) > TEXT_TRASH_RATIO:
             state = "copy"
-        print(origsz)
     if state == "good":
         htdoc = get_link_doc(link,state)
         htdoc["html"] = html
@@ -675,7 +676,7 @@ def classify(start_link):
 def visit(hostname):
     myclient = pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
-    batch_size = BATCHSIZE
+    batch_size = BATCH_SIZE
     rules = fetch_robot(hostname)
     start_link = "https://" + hostname
     # renew front links
@@ -730,7 +731,7 @@ def crawl_summary():
         {"$sort":{"original_text_size":-1}},
     ])
     print(">>>> Batches")
-    headers = ["_id","document_count","good_document_count","batch_count","text_size","original_text_size"]
+    headers = ["_id","document_count","good_document_count","batch_size","original_text_size"]
     print("\t".join(headers))
     for item in res:
         values = [str(item[x]) for x in headers]
@@ -761,7 +762,7 @@ def sample_domains():
     all_domains = []
     for domain in domains:
         all_domains.append(domain)
-    sample_size = min(int(DISCOVER_LINK_RATIO* BATCHSIZE), len(all_domains))
+    sample_size = min(int(DISCOVER_LINK_RATIO* BATCH_SIZE), len(all_domains))
     print(">>> Discover domains {}".format(sample_size))
     sample_domains = random.sample(all_domains,sample_size)
     domaincol = db["domains"]
@@ -770,7 +771,7 @@ def sample_domains():
     all_domains = []
     for item in res:
         all_domains.append(item["host"])
-    sample_size = min(int((1 - DISCOVER_LINK_RATIO) * BATCHSIZE),len(all_domains))
+    sample_size = min(int((1 - DISCOVER_LINK_RATIO) * BATCH_SIZE),len(all_domains))
     print(">>>> Best domains {}".format(sample_size))
     sample_domains += random.sample(all_domains,sample_size)
     for domain in sample_domains:
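
Note on the new configuration block: SAMPLE_SET_SIZE and CLASSIFIER_SET_SIZE both read the SUCKER_DISCOVER_LINK_RATIO environment variable, which looks like a copy-paste slip from the DISCOVER_LINK_RATIO line above it, so neither value can be overridden independently. A minimal sketch of what was presumably intended, assuming the hypothetical variable names SUCKER_SAMPLE_SET_SIZE and SUCKER_CLASSIFIER_SET_SIZE (neither name appears in this patch):

    import os

    # Hypothetical env var names; the patch as committed reuses SUCKER_DISCOVER_LINK_RATIO for both values.
    SAMPLE_SET_SIZE = int(os.getenv("SUCKER_SAMPLE_SET_SIZE","10000"))
    CLASSIFIER_SET_SIZE = int(os.getenv("SUCKER_CLASSIFIER_SET_SIZE","200"))

The defaults (10000 and 200) stay the same; only the lookup keys change, so each limit would get its own override, consistent with the other SUCKER_* options introduced in this commit.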