zz

parent 4a42078bef
commit 34fc9f9124
@@ -25,22 +25,25 @@ from bs4 import BeautifulSoup
 import urllib.parse
 import os.path
 
-LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
-DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
-BATCHSIZE=int(os.getenv("SUCKER_BATCHSIZE","10"))
+# database options
 CONNECTION=os.getenv("SUCKER_CONNECTION","mongodb://root:example@localhost:27017/")
 DBNAME=os.getenv("SUCKER_DBNAME","crawler")
-MINFILESIZE=300
-MAXFILESIZE=10000000
-MINTEXTSIZE=200
-CHECK_PARAGRAPH_SIZE=150
-TEXT_TRASH_SIZE=200
-TEXT_TRASH_RATIO=0.6
-DISCOVER_LINK_RATIO = 0.3
-SAMPLE_SET_SIZE =10000
-CLASSIFIER_SET_SIZE = 200
-STOP_PATHS=["xml","rss","login","admin"]
+# retrieving filter
+BATCH_SIZE = int(os.getenv("SUCKER_BATCH_SIZE","10"))
+MIN_FILE_SIZE=int(os.getenv("SUCKER_MIN_FILE_SIZE","300"))
+MAX_FILE_SIZE=int(os.getenv("SUCKER_MAX_FILE_SIZE","10000000"))
+# document originality filter
+MIN_TEXT_SIZE=int(os.getenv("SUCKER_MIN_TEXT_SIZE","200"))
+CHECK_PARAGRAPH_SIZE=int(os.getenv("SUCKER_CHECK_PARAGRAPH_SIZE","150"))
+TEXT_TRASH_RATIO=float(os.getenv("SUCKER_TEXT_TRASH_RATIO","0.6"))
+# link and domain sampling
+DISCOVER_LINK_RATIO = float(os.getenv("SUCKER_DISCOVER_LINK_RATIO","0.3"))
+SAMPLE_SET_SIZE = int(os.getenv("SUCKER_SAMPLE_SET_SIZE","10000"))
+CLASSIFIER_SET_SIZE = int(os.getenv("SUCKER_CLASSIFIER_SET_SIZE","200"))
+# link filter
+LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
+DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
+STOP_PATHS=os.getenv("SUCKER_STOP_PATHS","xml,rss,login,admin").split(",")
 
 def get_bs_links(link,html):
     # Extract links from the page
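Every tunable now comes from the environment with a typed default, so a deployment can be reconfigured without touching the source. A minimal sketch of overriding the defaults for a local run (the values are only examples; the names follow the SUCKER_* pattern above):

    import os

    # Overrides must be set before the module reads them.
    os.environ["SUCKER_BATCH_SIZE"] = "50"
    os.environ["SUCKER_TEXT_TRASH_RATIO"] = "0.5"

    BATCH_SIZE = int(os.getenv("SUCKER_BATCH_SIZE", "10"))                 # -> 50
    TEXT_TRASH_RATIO = float(os.getenv("SUCKER_TEXT_TRASH_RATIO", "0.6"))  # -> 0.5
    STOP_PATHS = os.getenv("SUCKER_STOP_PATHS", "xml,rss,login,admin").split(",")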
@@ -166,11 +169,11 @@ def fetch_page(link:str)->(str,str):
     if response.status != 200:
         good = False
         LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
-    elif response.data is None or len(response.data) < MINFILESIZE:
+    elif response.data is None or len(response.data) < MIN_FILE_SIZE:
         LOGGER.error('too small/incorrect for URL %s', link)
         good = False
         # raise error instead?
-    elif len(response.data) > MAXFILESIZE:
+    elif len(response.data) > MAX_FILE_SIZE:
         good = False
         LOGGER.error('too large: length %s for URL %s', len(response.data), link)
     if good:
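The renamed limits gate a fetched response on status and body size before any parsing happens. The same check pulled out as a standalone predicate (a sketch; response_is_usable is a hypothetical helper, assuming a response object with .status and .data as above):

    def response_is_usable(response, min_size=MIN_FILE_SIZE, max_size=MAX_FILE_SIZE):
        # Reject non-200 responses and bodies outside the configured size window.
        if response.status != 200:
            return False
        if response.data is None or len(response.data) < min_size:
            return False
        return len(response.data) <= max_size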
@@ -202,7 +205,7 @@ def extract_page(final_link,html):
     if html is not None:
         doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=False,target_language=LANGUAGE,favor_precision=True)
         if doc is not None:
-            if not "text" in doc or len(doc["text"]) < MINTEXTSIZE:
+            if not "text" in doc or len(doc["text"]) < MIN_TEXT_SIZE:
                 # text too small
                 doc = None
     return doc
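extract_page keeps a document only when trafilatura finds text of at least MIN_TEXT_SIZE characters. A usage sketch with the same bare_extraction call (the HTML string and URL are made up for illustration):

    import trafilatura

    html = "<html><body><p>" + "Sample paragraph text. " * 20 + "</p></body></html>"
    doc = trafilatura.bare_extraction(html, url="https://example.sk/page",
                                      with_metadata=True, include_formatting=False,
                                      target_language=LANGUAGE, favor_precision=True)
    if doc is not None and ("text" not in doc or len(doc["text"]) < MIN_TEXT_SIZE):
        doc = None  # text too small, same rule as extract_page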
@@ -243,14 +246,13 @@ def index_page(db,original_link,final_link,html,doc):
         set_content_checksums(doc)
         tsz = doc["text_size"]
         psz = doc["paragraph_sizes_sum"]
-        if tsz < TEXT_TRASH_SIZE or psz/tsz < TEXT_TRASH_RATIO:
+        if tsz < MIN_TEXT_SIZE or psz/tsz < TEXT_TRASH_RATIO:
             state = "small"
         # check copy
         if state == "good":
             origsz = 0
             for chs,paragraph_size in zip(doc["paragraph_checksums"],doc["paragraph_sizes"]):
                 # index paragraph checksums
-                print(checkcol)
                 nd = checkcol.find_one({"_id":chs})
                 if nd is None:
                     origsz += paragraph_size
@@ -258,7 +260,6 @@ def index_page(db,original_link,final_link,html,doc):
 
             if (1 - (origsz / tsz)) > TEXT_TRASH_RATIO:
                 state = "copy"
-            print(origsz)
     if state == "good":
         htdoc = get_link_doc(link,state)
         htdoc["html"] = html
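The originality check sums the sizes of paragraphs whose checksum has not been indexed yet and flags the document as a copy when the previously seen share exceeds TEXT_TRASH_RATIO. The same arithmetic as a self-contained sketch, with an in-memory set standing in for the checkcol collection:

    seen = set()  # stands in for the checkcol MongoDB collection

    def is_copy(paragraph_checksums, paragraph_sizes, trash_ratio=0.6):
        tsz = sum(paragraph_sizes)
        origsz = 0
        for chs, size in zip(paragraph_checksums, paragraph_sizes):
            if chs not in seen:
                origsz += size  # an unseen paragraph counts as original
            seen.add(chs)
        return (1 - (origsz / tsz)) > trash_ratio

    # Two of three equal-sized paragraphs already seen: ratio 2/3 > 0.6 -> copy.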
@@ -675,7 +676,7 @@ def classify(start_link):
 def visit(hostname):
     myclient = pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
-    batch_size = BATCHSIZE
+    batch_size = BATCH_SIZE
     rules = fetch_robot(hostname)
     start_link = "https://" + hostname
     # renew front links
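visit() opens its own client from the shared connection settings, so each worker can run independently. A minimal connection sketch, assuming a MongoDB instance reachable with the defaults above:

    import pymongo

    myclient = pymongo.MongoClient("mongodb://root:example@localhost:27017/")  # CONNECTION default
    db = myclient["crawler"]           # DBNAME default
    print(db.list_collection_names())  # quick smoke test of the connection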
@@ -730,7 +731,7 @@ def crawl_summary():
         {"$sort":{"original_text_size":-1}},
     ])
     print(">>>> Batches")
-    headers = ["_id","document_count","good_document_count","batch_count","text_size","original_text_size"]
+    headers = ["_id","document_count","good_document_count","batch_size","original_text_size"]
     print("\t".join(headers))
     for item in res:
         values = [str(item[x]) for x in headers]
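With text_size dropped from headers, every remaining key must be present in each aggregated batch document, or the row comprehension raises KeyError. A tolerant variant of the printing loop (a sketch; the committed code indexes the keys directly):

    headers = ["_id","document_count","good_document_count","batch_size","original_text_size"]
    print("\t".join(headers))
    for item in res:
        # Fall back to an empty string for any field a batch is missing.
        values = [str(item.get(x, "")) for x in headers]
        print("\t".join(values))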
@@ -761,7 +762,7 @@ def sample_domains():
     all_domains = []
     for domain in domains:
         all_domains.append(domain)
-    sample_size = min(int(DISCOVER_LINK_RATIO* BATCHSIZE), len(all_domains))
+    sample_size = min(int(DISCOVER_LINK_RATIO* BATCH_SIZE), len(all_domains))
     print(">>> Discover domains {}".format(sample_size))
     sample_domains = random.sample(all_domains,sample_size)
     domaincol = db["domains"]
@@ -770,7 +771,7 @@ def sample_domains():
     all_domains = []
     for item in res:
         all_domains.append(item["host"])
-    sample_size = min(int((1 - DISCOVER_LINK_RATIO) * BATCHSIZE),len(all_domains))
+    sample_size = min(int((1 - DISCOVER_LINK_RATIO) * BATCH_SIZE),len(all_domains))
     print(">>>> Best domains {}".format(sample_size))
     sample_domains += random.sample(all_domains,sample_size)
     for domain in sample_domains:
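With the defaults above (BATCH_SIZE=10, DISCOVER_LINK_RATIO=0.3) a sampling round takes at most 3 newly discovered domains and 7 best-performing ones; int() truncates, so the two shares never exceed the batch. A sketch of the split, assuming the two candidate lists are already fetched:

    import random

    def split_batch(discovered, best, batch_size=10, discover_ratio=0.3):
        n_discover = min(int(discover_ratio * batch_size), len(discovered))  # 3
        n_best = min(int((1 - discover_ratio) * batch_size), len(best))      # 7
        return random.sample(discovered, n_discover) + random.sample(best, n_best)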