zz

2023-04-17 15:09:49 +02:00 · 2023-04-17 15:07:58 +02:00 · 2023-04-17 14:32:52 +02:00
2 changed files with 35 additions and 31 deletions
--- a/mongo/cli.py
+++ b/mongo/cli.py
@@ -32,9 +32,10 @@ def classify(start_link):
    mongocrawler.classify(start_link)
@cli.command()
-@click.argument("hostname")
+@click.argument("hostname",help="Hostname to crawl")
-def visit(hostname):
+@click.option("--filter_content",default=True,help="Filter content")
-    mongocrawler.visit(hostname)
+def visit(hostname,filter_content=True):
+    mongocrawler.visit(hostname,filter_content=filter_content)
@cli.command()
 def summary():
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@@ -24,24 +24,28 @@ import hashlib
 from bs4 import BeautifulSoup
 import urllib.parse
 import os.path
+import binascii
+import json
-LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
+# database options
-DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
-BATCHSIZE=int(os.getenv("SUCKER_BATCHSIZE","10"))
 CONNECTION=os.getenv("SUCKER_CONNECTION","mongodb://root:example@localhost:27017/")
 DBNAME=os.getenv("SUCKER_DBNAME","crawler")
-MINFILESIZE=300
+# retrieving filter
-MAXFILESIZE=10000000
+BATCH_SIZE = int(os.getenv("SUCKER_BATCH_SIZE","10"))
-MINTEXTSIZE=200
+MIN_FILE_SIZE=int(os.getenv("SUCKER_MIN_FILE_SIZE","300"))
-CHECK_PARAGRAPH_SIZE=200
+MAX_FILE_SIZE=int(os.getenv("SUCKER_MAX_FILE_SIZE","10000000"))
-TEXT_TRASH_SIZE=200
+# document originality filter
-TEXT_TRASH_RATIO=0.6
+MIN_TEXT_SIZE=int(os.getenv("SUCKER_MIN_TEXT_SIZE","200"))
-DISCOVER_LINK_RATIO = 0.3
+CHECK_PARAGRAPH_SIZE=int(os.getenv("SUCKER_CHECK_PARAGRAPH_SIZE","150"))
-DISCOVER_DOMAIN_RATIO = 0.5
+TEXT_TRASH_RATIO=float(os.getenv("SUCKER_TEXT_TRASH_RATIO","0.6"))
-SAMPLE_SET_SIZE =10000
+# link and domain sampling
-CLASSIFIER_SET_SIZE = 200
+DISCOVER_LINK_RATIO = float(os.getenv("SUCKER_DISCOVER_LINK_RATIO","0.3"))
-STOP_PATHS=["xml","rss","login","admin"]
+SAMPLE_SET_SIZE = int(os.getenv("SUCKER_DISCOVER_LINK_RATIO","10000"))
-
+CLASSIFIER_SET_SIZE = int(os.getenv("SUCKER_DISCOVER_LINK_RATIO","200"))
+# link filter
+LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
+DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
+STOP_PATHS=os.getenv("SUCKER_STOP_PATHS","xml,rss,login,admin").split(",")
 def get_bs_links(link,html):
    # Extrakcia linkov zo stranky
@@ -165,14 +169,15 @@ def fetch_page(link:str)->(str,str):
    html = None
    if response is not None :
        good = True
+        print(response)
        if response.status != 200:
            good = False
            LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
-        elif response.data is None or len(response.data) < MINFILESIZE:
+        elif response.data is None or len(response.data) < MIN_FILE_SIZE:
            LOGGER.error('too small/incorrect for URL %s', link)
            good = False
        # raise error instead?
-        elif len(response.data) > MAXFILESIZE:
+        elif len(response.data) > MAX_FILE_SIZE:
            good = False
            LOGGER.error('too large: length %s for URL %s', len(response.data), link)
        if good:
@@ -204,7 +209,7 @@ def extract_page(final_link,html):
    if html is not None:
        doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=False,target_language=LANGUAGE,favor_precision=True)
        if doc is not None:
-            if not "text" in doc or len(doc["text"]) < MINTEXTSIZE:
+            if not "text" in doc or len(doc["text"]) < MIN_TEXT_SIZE:
                # text too small
                doc = None
    return doc
@@ -226,7 +231,7 @@ def set_content_checksums(doc):
            sentences += 1
    doc["sentences_count"] = sentences
-def index_page(db,original_link,final_link,html,doc):
+def index_page(db,original_link,final_link,html,doc,filter_content=True):
    linkcol = db["links"]
    htmlcol = db["html"]
    contentcol = db["content"]
@@ -244,7 +249,7 @@ def index_page(db,original_link,final_link,html,doc):
        set_content_checksums(doc)
        tsz = doc["text_size"]
        psz = doc["paragraph_sizes_sum"]
-        if tsz < TEXT_TRASH_SIZE or psz/tsz < TEXT_TRASH_RATIO:
+        if filter_content and (tsz < MIN_TEXT_SIZE or psz/tsz < TEXT_TRASH_RATIO):
            state = "small"
    # check copy
    if state == "good":
@@ -256,7 +261,7 @@ def index_page(db,original_link,final_link,html,doc):
                origsz += paragraph_size
        doc["original_text_size"] = origsz
-        if (1 - (origsz / tsz)) > TEXT_TRASH_RATIO:
+        if filter_content and (1 - (origsz / tsz)) > TEXT_TRASH_RATIO:
            state = "copy"
    if state == "good":
        htdoc = get_link_doc(link,state)
@@ -662,10 +667,10 @@ def classify(start_link):
    cl.train(trainset)
    cl.test(testset)
-def visit(hostname):
+def visit(hostname,filter_content=True):
    myclient = pymongo.MongoClient(CONNECTION)
    db=myclient[DBNAME]
-    batch_size = BATCHSIZE
+    batch_size = BATCH_SIZE
    rules = fetch_robot(hostname)
    start_link = "https://" + hostname
    # renew front links
@@ -696,7 +701,7 @@ def visit(hostname):
    final_states = []
    docs = []
    for original_link,final_link,html,doc in extracted_pages:
-        status = index_page(db,original_link,final_link,html,doc)
+        status = index_page(db,original_link,final_link,html,doc,filter_content)
        final_states.append(status)
        docs.append(doc)
    save_batch_info(db,hostname,final_states,docs)
@@ -726,8 +731,6 @@ def crawl_summary():
        values = [str(item[x]) for x in headers]
        print("\t".join(values))
-import binascii
-import json
 def import_html():
    myclient= pymongo.MongoClient(CONNECTION)
@@ -751,7 +754,7 @@ def sample_domains():
    all_domains = []
    for domain in domains:
        all_domains.append(domain)
-    sample_size = min(int(DISCOVER_DOMAIN_RATIO* BATCHSIZE), len(all_domains))
+    sample_size = min(int(DISCOVER_LINK_RATIO* BATCH_SIZE), len(all_domains))
    print(">>> Discover domains {}".format(sample_size))
    sample_domains = random.sample(all_domains,sample_size)
    domaincol = db["domains"]
@@ -760,7 +763,7 @@ def sample_domains():
    all_domains = []
    for item in res:
        all_domains.append(item["host"])
-    sample_size = min(int((1 - DISCOVER_DOMAIN_RATIO) * BATCHSIZE),len(all_domains))
+    sample_size = min(int((1 - DISCOVER_LINK_RATIO) * BATCH_SIZE),len(all_domains))
    print(">>>> Best domains {}".format(sample_size))
    sample_domains += random.sample(all_domains,sample_size)
    for domain in sample_domains:
Author	SHA1	Message	Date
Daniel Hladek	01645b8862	zz	2023-04-17 15:09:49 +02:00
Daniel Hladek	1801f01a99	zz	2023-04-17 15:07:58 +02:00
Daniel Hladek	34fc9f9124	zz	2023-04-17 14:32:52 +02:00