zz

commit 1801f01a99 (parent 34fc9f9124)
@@ -32,9 +32,10 @@ def classify(start_link):
     mongocrawler.classify(start_link)
 
 @cli.command()
-@click.argument("hostname")
-def visit(hostname):
-    mongocrawler.visit(hostname)
+@click.argument("hostname",help="Hostname to crawl")
+@click.option("--filter_content",default=True,help="Filter content")
+def visit(hostname,filter_content=True):
+    mongocrawler.visit(hostname,filter_content=filter_content)
 
 @cli.command()
 def summary():
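Worth noting about the new CLI wiring: click's @click.argument does not accept a help keyword (click's Parameter rejects it, so the decorator as committed raises a TypeError when the module loads), and a boolean option is usually declared in click's on/off flag form so it can be switched off from the command line. A minimal sketch of an equivalent wiring that click accepts, assuming the existing cli group and mongocrawler module from this repository:

import click
import mongocrawler

@cli.command()
@click.argument("hostname")  # arguments are documented in the docstring, not via help=
@click.option("--filter_content/--no-filter_content", default=True,
              help="Filter content")
def visit(hostname, filter_content):
    """Visit HOSTNAME and index a batch of its pages."""
    mongocrawler.visit(hostname, filter_content=filter_content)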
@@ -24,6 +24,8 @@ import hashlib
 from bs4 import BeautifulSoup
 import urllib.parse
 import os.path
+import binascii
+import json
 
 # database options
 CONNECTION=os.getenv("SUCKER_CONNECTION","mongodb://root:example@localhost:27017/")
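The connection string is read once, at import time, from the SUCKER_CONNECTION environment variable, falling back to the local Docker-style default above. A hypothetical usage sketch (the URI and credentials here are made up):

import os
# Must be set before the module is imported, since CONNECTION is evaluated
# at module load time; the URI below is a made-up example.
os.environ["SUCKER_CONNECTION"] = "mongodb://user:secret@db.example.org:27017/"
import mongocrawler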
@@ -166,6 +168,7 @@ def fetch_page(link:str)->(str,str):
     html = None
     if response is not None :
         good = True
+        print(response)
         if response.status != 200:
             good = False
             LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
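The added print(response) is a quick debugging aid; since the module already routes diagnostics through LOGGER (see the error call just below it), the same information could be kept under log-level control. A sketch of that alternative, not what the commit does:

LOGGER.debug('fetched response: %s', response)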
@@ -227,7 +230,7 @@ def set_content_checksums(doc):
         sentences += 1
     doc["sentences_count"] = sentences
 
-def index_page(db,original_link,final_link,html,doc):
+def index_page(db,original_link,final_link,html,doc,filter_content=True):
     linkcol = db["links"]
     htmlcol = db["html"]
     contentcol = db["content"]
@@ -246,7 +249,7 @@ def index_page(db,original_link,final_link,html,doc):
             set_content_checksums(doc)
             tsz = doc["text_size"]
             psz = doc["paragraph_sizes_sum"]
-            if tsz < MIN_TEXT_SIZE or psz/tsz < TEXT_TRASH_RATIO:
+            if filter_content and (tsz < MIN_TEXT_SIZE or psz/tsz < TEXT_TRASH_RATIO):
                 state = "small"
             # check copy
             if state == "good":
@@ -258,7 +261,7 @@ def index_page(db,original_link,final_link,html,doc):
                     origsz += paragraph_size
                 doc["original_text_size"] = origsz
 
-                if (1 - (origsz / tsz)) > TEXT_TRASH_RATIO:
+                if filter_content and (1 - (origsz / tsz)) > TEXT_TRASH_RATIO:
                     state = "copy"
     if state == "good":
         htdoc = get_link_doc(link,state)
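Taken together, the two guarded checks classify a page as "small" when it has too little text (tsz < MIN_TEXT_SIZE) or a low paragraph-to-text ratio, and as "copy" when most of its paragraphs were already seen elsewhere; because filter_content short-circuits each and, turning it off also skips the origsz / tsz division, so an empty page cannot divide by zero. A standalone sketch of the gate, with illustrative threshold values (the real MIN_TEXT_SIZE and TEXT_TRASH_RATIO are defined elsewhere in mongocrawler.py and are not shown in this diff):

MIN_TEXT_SIZE = 200     # assumed value, not taken from this diff
TEXT_TRASH_RATIO = 0.6  # assumed value, not taken from this diff

def page_state(doc, origsz, filter_content=True):
    # doc fields are the ones set_content_checksums() fills in.
    tsz = doc["text_size"]
    psz = doc["paragraph_sizes_sum"]
    if filter_content and (tsz < MIN_TEXT_SIZE or psz / tsz < TEXT_TRASH_RATIO):
        return "small"  # too little text, or mostly non-paragraph content
    if filter_content and (1 - (origsz / tsz)) > TEXT_TRASH_RATIO:
        return "copy"   # most of the text duplicates previously seen paragraphs
    return "good"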
@@ -673,7 +676,7 @@ def classify(start_link):
     cl.train(trainset)
     cl.test(testset)
 
-def visit(hostname):
+def visit(hostname,filter_content=True):
     myclient = pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
     batch_size = BATCH_SIZE
@@ -707,7 +710,7 @@ def visit(hostname):
     final_states = []
     docs = []
     for original_link,final_link,html,doc in extracted_pages:
-        status = index_page(db,original_link,final_link,html,doc)
+        status = index_page(db,original_link,final_link,html,doc,filter_content)
         final_states.append(status)
         docs.append(doc)
     save_batch_info(db,hostname,final_states,docs)
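With the flag threaded from the CLI through visit() into index_page(), callers can now index pages without the quality gate. A hypothetical call site (the hostname is made up):

import mongocrawler

mongocrawler.visit("example.org")                        # default: filtering on
mongocrawler.visit("example.org", filter_content=False)  # keep small/copied pages too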
@@ -737,8 +740,6 @@ def crawl_summary():
         values = [str(item[x]) for x in headers]
         print("\t".join(values))
 
-import binascii
-import json
 
 def import_html():
     myclient= pymongo.MongoClient(CONNECTION)