Compare commits
No commits in common. "01645b8862eec6b588af4159b268fcfca644f979" and "1546a63b75f9a605a5f4b6a9217adebd4ade12ee" have entirely different histories.
01645b8862 ... 1546a63b75
@@ -32,10 +32,9 @@ def classify(start_link):
     mongocrawler.classify(start_link)
 
 @cli.command()
-@click.argument("hostname",help="Hostname to crawl")
-@click.option("--filter_content",default=True,help="Filter content")
-def visit(hostname,filter_content=True):
-    mongocrawler.visit(hostname,filter_content=filter_content)
+@click.argument("hostname")
+def visit(hostname):
+    mongocrawler.visit(hostname)
 
 @cli.command()
 def summary():
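For reference, a minimal sketch of how the removed --filter_content switch is wired through click on the removed side; the cli group, the mongocrawler module and the __main__ guard are assumed from the surrounding file and are illustrative only:

import click

import mongocrawler  # assumed: the module the CLI delegates to

@click.group()
def cli():
    pass

@cli.command()
@click.argument("hostname")
@click.option("--filter_content", default=True, type=bool, help="Filter content")
def visit(hostname, filter_content=True):
    # forwards the CLI switch to the crawler, as on the removed side of the hunk
    mongocrawler.visit(hostname, filter_content=filter_content)

if __name__ == "__main__":
    cli()

With this wiring the command would be invoked as something like "visit example.org --filter_content false"; the added side keeps only the positional hostname.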
@@ -24,28 +24,24 @@ import hashlib
 from bs4 import BeautifulSoup
 import urllib.parse
 import os.path
-import binascii
-import json
 
-# database options
-CONNECTION=os.getenv("SUCKER_CONNECTION","mongodb://root:example@localhost:27017/")
-DBNAME=os.getenv("SUCKER_DBNAME","crawler")
-# retrieving filter
-BATCH_SIZE = int(os.getenv("SUCKER_BATCH_SIZE","10"))
-MIN_FILE_SIZE=int(os.getenv("SUCKER_MIN_FILE_SIZE","300"))
-MAX_FILE_SIZE=int(os.getenv("SUCKER_MAX_FILE_SIZE","10000000"))
-# document originality filter
-MIN_TEXT_SIZE=int(os.getenv("SUCKER_MIN_TEXT_SIZE","200"))
-CHECK_PARAGRAPH_SIZE=int(os.getenv("SUCKER_CHECK_PARAGRAPH_SIZE","150"))
-TEXT_TRASH_RATIO=float(os.getenv("SUCKER_TEXT_TRASH_RATIO","0.6"))
-# link and domain sampling
-DISCOVER_LINK_RATIO = float(os.getenv("SUCKER_DISCOVER_LINK_RATIO","0.3"))
-SAMPLE_SET_SIZE = int(os.getenv("SUCKER_DISCOVER_LINK_RATIO","10000"))
-CLASSIFIER_SET_SIZE = int(os.getenv("SUCKER_DISCOVER_LINK_RATIO","200"))
-# link filter
 LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
 DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
-STOP_PATHS=os.getenv("SUCKER_STOP_PATHS","xml,rss,login,admin").split(",")
+BATCHSIZE=int(os.getenv("SUCKER_BATCHSIZE","10"))
+CONNECTION=os.getenv("SUCKER_CONNECTION","mongodb://root:example@localhost:27017/")
+DBNAME=os.getenv("SUCKER_DBNAME","crawler")
+MINFILESIZE=300
+MAXFILESIZE=10000000
+MINTEXTSIZE=200
+CHECK_PARAGRAPH_SIZE=200
+TEXT_TRASH_SIZE=200
+TEXT_TRASH_RATIO=0.6
+DISCOVER_LINK_RATIO = 0.3
+DISCOVER_DOMAIN_RATIO = 0.5
+SAMPLE_SET_SIZE =10000
+CLASSIFIER_SET_SIZE = 200
+STOP_PATHS=["xml","rss","login","admin"]
+
 
 def get_bs_links(link,html):
     # Extract links from the page
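The removed side reads every tunable from an environment variable with a typed fallback, while the added side hard-codes most values; only the connection string, database name, language, domain and batch size keep the env lookup. A minimal sketch of the env-override pattern used on the removed side (the variable names are taken from the removed lines):

import os

# string setting with a fallback
CONNECTION = os.getenv("SUCKER_CONNECTION", "mongodb://root:example@localhost:27017/")
# numeric settings: os.getenv always returns a string, so convert explicitly
BATCH_SIZE = int(os.getenv("SUCKER_BATCH_SIZE", "10"))
TEXT_TRASH_RATIO = float(os.getenv("SUCKER_TEXT_TRASH_RATIO", "0.6"))
# list setting: comma-separated string, split into a list
STOP_PATHS = os.getenv("SUCKER_STOP_PATHS", "xml,rss,login,admin").split(",")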
@@ -169,15 +165,14 @@ def fetch_page(link:str)->(str,str):
     html = None
     if response is not None :
         good = True
-        print(response)
         if response.status != 200:
             good = False
             LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
-        elif response.data is None or len(response.data) < MIN_FILE_SIZE:
+        elif response.data is None or len(response.data) < MINFILESIZE:
             LOGGER.error('too small/incorrect for URL %s', link)
             good = False
             # raise error instead?
-        elif len(response.data) > MAX_FILE_SIZE:
+        elif len(response.data) > MAXFILESIZE:
             good = False
             LOGGER.error('too large: length %s for URL %s', len(response.data), link)
         if good:
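This hunk renames the size constants and apparently drops a debug print; the gate itself is unchanged. A self-contained sketch of that gate, assuming a urllib3-style response object with .status and .data (the helper name is illustrative):

MINFILESIZE = 300          # lower bound on the raw response size
MAXFILESIZE = 10000000     # upper bound on the raw response size

def response_is_usable(response) -> bool:
    # assumes a urllib3-style response exposing .status and .data
    if response is None:
        return False
    if response.status != 200:
        return False                      # not a 200 response
    if response.data is None or len(response.data) < MINFILESIZE:
        return False                      # too small / incorrect
    if len(response.data) > MAXFILESIZE:
        return False                      # too large
    return True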
@@ -209,7 +204,7 @@ def extract_page(final_link,html):
     if html is not None:
         doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=False,target_language=LANGUAGE,favor_precision=True)
         if doc is not None:
-            if not "text" in doc or len(doc["text"]) < MIN_TEXT_SIZE:
+            if not "text" in doc or len(doc["text"]) < MINTEXTSIZE:
                 # text too small
                 doc = None
     return doc
@@ -231,7 +226,7 @@ def set_content_checksums(doc):
             sentences += 1
     doc["sentences_count"] = sentences
 
-def index_page(db,original_link,final_link,html,doc,filter_content=True):
+def index_page(db,original_link,final_link,html,doc):
     linkcol = db["links"]
     htmlcol = db["html"]
     contentcol = db["content"]
@@ -249,7 +244,7 @@ def index_page(db,original_link,final_link,html,doc,filter_content=True):
         set_content_checksums(doc)
         tsz = doc["text_size"]
         psz = doc["paragraph_sizes_sum"]
-        if filter_content and (tsz < MIN_TEXT_SIZE or psz/tsz < TEXT_TRASH_RATIO):
+        if tsz < TEXT_TRASH_SIZE or psz/tsz < TEXT_TRASH_RATIO:
             state = "small"
     # check copy
     if state == "good":
@@ -261,7 +256,7 @@ def index_page(db,original_link,final_link,html,doc,filter_content=True):
                 origsz += paragraph_size
         doc["original_text_size"] = origsz
 
-        if filter_content and (1 - (origsz / tsz)) > TEXT_TRASH_RATIO:
+        if (1 - (origsz / tsz)) > TEXT_TRASH_RATIO:
             state = "copy"
     if state == "good":
         htdoc = get_link_doc(link,state)
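Both index_page hunks gate on the same two checks: a page is marked "small" when its text is short or its paragraphs cover less than TEXT_TRASH_RATIO of it, and "copy" when the non-original share of the text exceeds TEXT_TRASH_RATIO. A rough sketch of the combined check, keeping the removed side's filter_content switch; the helper name and the assumption that set_content_checksums() has already filled in the size fields are mine:

MINTEXTSIZE = 200        # MIN_TEXT_SIZE / TEXT_TRASH_SIZE on the two sides of the diff
TEXT_TRASH_RATIO = 0.6

def content_state(doc, filter_content=True):
    # assumes the size fields below are already set and text_size is non-zero
    tsz = doc["text_size"]
    psz = doc["paragraph_sizes_sum"]
    origsz = doc["original_text_size"]
    if filter_content and (tsz < MINTEXTSIZE or psz / tsz < TEXT_TRASH_RATIO):
        return "small"
    if filter_content and (1 - origsz / tsz) > TEXT_TRASH_RATIO:
        return "copy"
    return "good"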
@@ -667,10 +662,10 @@ def classify(start_link):
     cl.train(trainset)
     cl.test(testset)
 
-def visit(hostname,filter_content=True):
+def visit(hostname):
     myclient = pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
-    batch_size = BATCH_SIZE
+    batch_size = BATCHSIZE
     rules = fetch_robot(hostname)
     start_link = "https://" + hostname
     # renew front links
@@ -701,7 +696,7 @@ def visit(hostname,filter_content=True):
     final_states = []
     docs = []
     for original_link,final_link,html,doc in extracted_pages:
-        status = index_page(db,original_link,final_link,html,doc,filter_content)
+        status = index_page(db,original_link,final_link,html,doc)
         final_states.append(status)
         docs.append(doc)
     save_batch_info(db,hostname,final_states,docs)
@@ -731,6 +726,8 @@ def crawl_summary():
         values = [str(item[x]) for x in headers]
         print("\t".join(values))
 
+import binascii
+import json
 
 def import_html():
     myclient= pymongo.MongoClient(CONNECTION)
@@ -754,7 +751,7 @@ def sample_domains():
     all_domains = []
     for domain in domains:
         all_domains.append(domain)
-    sample_size = min(int(DISCOVER_LINK_RATIO* BATCH_SIZE), len(all_domains))
+    sample_size = min(int(DISCOVER_DOMAIN_RATIO* BATCHSIZE), len(all_domains))
     print(">>> Discover domains {}".format(sample_size))
     sample_domains = random.sample(all_domains,sample_size)
     domaincol = db["domains"]
@@ -763,7 +760,7 @@ def sample_domains():
     all_domains = []
     for item in res:
         all_domains.append(item["host"])
-    sample_size = min(int((1 - DISCOVER_LINK_RATIO) * BATCH_SIZE),len(all_domains))
+    sample_size = min(int((1 - DISCOVER_DOMAIN_RATIO) * BATCHSIZE),len(all_domains))
     print(">>>> Best domains {}".format(sample_size))
     sample_domains += random.sample(all_domains,sample_size)
     for domain in sample_domains:
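The two sample_domains hunks only swap the constant names; the split itself stays the same: roughly a DISCOVER_DOMAIN_RATIO share of the batch is drawn from newly discovered domains and the remainder from the best-ranked ones. A minimal sketch of that split (the function name and argument lists are illustrative):

import random

DISCOVER_DOMAIN_RATIO = 0.5
BATCHSIZE = 10

def split_domain_sample(discovered, best):
    # up to DISCOVER_DOMAIN_RATIO of the batch comes from newly discovered domains
    n_discover = min(int(DISCOVER_DOMAIN_RATIO * BATCHSIZE), len(discovered))
    sample = random.sample(discovered, n_discover)
    # the rest of the batch comes from the best-ranked domains
    n_best = min(int((1 - DISCOVER_DOMAIN_RATIO) * BATCHSIZE), len(best))
    sample += random.sample(best, n_best)
    return sample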