Daniel Hládek 2023-04-17 15:07:58 +02:00
parent 34fc9f9124
commit 1801f01a99
2 changed files with 12 additions and 10 deletions

View File

@@ -32,9 +32,10 @@ def classify(start_link):
     mongocrawler.classify(start_link)
 
 @cli.command()
-@click.argument("hostname")
-def visit(hostname):
-    mongocrawler.visit(hostname)
+@click.argument("hostname",help="Hostname to crawl")
+@click.option("--filter_content",default=True,help="Filter content")
+def visit(hostname,filter_content=True):
+    mongocrawler.visit(hostname,filter_content=filter_content)
 
 @cli.command()
 def summary():
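
Below is a minimal, self-contained sketch of how the reworked command could be wired up with click. The group name, the mongocrawler import and the help texts are assumptions for illustration, not the project's actual cli module; note also that click arguments do not accept a help parameter, so the hostname description is kept in the docstring here.

import click
import mongocrawler  # assumed to expose visit(hostname, filter_content=...)

@click.group()
def cli():
    pass

@cli.command()
@click.argument("hostname")
@click.option("--filter_content", default=True, type=bool,
              help="Filter out pages with too little or too repetitive text")
def visit(hostname, filter_content):
    """Visit HOSTNAME and index a batch of its pages."""
    mongocrawler.visit(hostname, filter_content=filter_content)

if __name__ == "__main__":
    cli()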

View File

@@ -24,6 +24,8 @@ import hashlib
 from bs4 import BeautifulSoup
 import urllib.parse
 import os.path
+import binascii
+import json
 
 # database options
 CONNECTION=os.getenv("SUCKER_CONNECTION","mongodb://root:example@localhost:27017/")
@@ -166,6 +168,7 @@ def fetch_page(link:str)->(str,str):
     html = None
     if response is not None :
         good = True
+        print(response)
         if response.status != 200:
             good = False
             LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
@ -227,7 +230,7 @@ def set_content_checksums(doc):
sentences += 1 sentences += 1
doc["sentences_count"] = sentences doc["sentences_count"] = sentences
def index_page(db,original_link,final_link,html,doc): def index_page(db,original_link,final_link,html,doc,filter_content=True):
linkcol = db["links"] linkcol = db["links"]
htmlcol = db["html"] htmlcol = db["html"]
contentcol = db["content"] contentcol = db["content"]
@@ -246,7 +249,7 @@ def index_page(db,original_link,final_link,html,doc):
         set_content_checksums(doc)
         tsz = doc["text_size"]
         psz = doc["paragraph_sizes_sum"]
-        if tsz < MIN_TEXT_SIZE or psz/tsz < TEXT_TRASH_RATIO:
+        if filter_content and (tsz < MIN_TEXT_SIZE or psz/tsz < TEXT_TRASH_RATIO):
            state = "small"
        # check copy
        if state == "good":
@@ -258,7 +261,7 @@ def index_page(db,original_link,final_link,html,doc):
                 origsz += paragraph_size
             doc["original_text_size"] = origsz
 
-            if (1 - (origsz / tsz)) > TEXT_TRASH_RATIO:
+            if filter_content and (1 - (origsz / tsz)) > TEXT_TRASH_RATIO:
                 state = "copy"
         if state == "good":
             htdoc = get_link_doc(link,state)
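
As a rough standalone illustration of the quality gate these two hunks modify, the following sketch shows how the size and copy checks combine and how filter_content bypasses them. The threshold values and the helper name are assumptions for illustration, not values taken from the repository.

MIN_TEXT_SIZE = 200      # assumed threshold for documents considered too small
TEXT_TRASH_RATIO = 0.6   # assumed ratio threshold

def classify_content(text_size, paragraph_sizes_sum, original_text_size,
                      filter_content=True):
    """Return 'small', 'copy' or 'good' for an extracted document.

    Passing filter_content=False skips both checks, which is what the new
    flag threaded through index_page() makes possible.
    """
    state = "good"
    if filter_content and (text_size < MIN_TEXT_SIZE
                           or paragraph_sizes_sum / text_size < TEXT_TRASH_RATIO):
        state = "small"
    elif filter_content and (1 - original_text_size / text_size) > TEXT_TRASH_RATIO:
        state = "copy"
    return state
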
@@ -673,7 +676,7 @@ def classify(start_link):
     cl.train(trainset)
     cl.test(testset)
 
-def visit(hostname):
+def visit(hostname,filter_content=True):
     myclient = pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
     batch_size = BATCH_SIZE
@@ -707,7 +710,7 @@ def visit(hostname):
     final_states = []
     docs = []
     for original_link,final_link,html,doc in extracted_pages:
-        status = index_page(db,original_link,final_link,html,doc)
+        status = index_page(db,original_link,final_link,html,doc,filter_content)
         final_states.append(status)
         docs.append(doc)
     save_batch_info(db,hostname,final_states,docs)
@@ -737,8 +740,6 @@ def crawl_summary():
         values = [str(item[x]) for x in headers]
         print("\t".join(values))
 
-import binascii
-import json
 
 def import_html():
     myclient= pymongo.MongoClient(CONNECTION)