zz
This commit is contained in:
parent 000490bf73
commit d66060d8e6
@@ -169,7 +169,7 @@ def extract_pages(link_batch:list,responses:list)->list:
         out.append((original_link,final_link,html,doc))
     return out
 
-def set_content_checksum(doc):
+def set_content_checksums(doc):
     text = doc["text"]
     checksums,sizes = calculate_checksums(text)
     doc["text_size"] = len(text)
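The renamed helper relies on calculate_checksums, which is not part of this diff. A minimal sketch of what such a helper could look like, assuming one checksum and one size per non-empty paragraph (the paragraph split and the hash choice are assumptions, not taken from the commit):

import hashlib

def calculate_checksums(text):
    # Hypothetical helper, not from this commit: one (checksum, size)
    # pair per non-empty paragraph, paragraphs separated by blank lines.
    checksums, sizes = [], []
    for paragraph in text.split("\n\n"):
        paragraph = paragraph.strip()
        if not paragraph:
            continue
        checksums.append(hashlib.md5(paragraph.encode("utf-8")).hexdigest())
        sizes.append(len(paragraph))
    return checksums, sizes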
@@ -191,7 +191,14 @@ def index_pages(db,hostname,extracted_pages):
     contentcol = db["content"]
     checkcol = db["check"]
     links = []
+    # stats of the batch
+    good_document_count = 0
+    document_count = 0
+    text_size = 0
+    good_text_size = 0
+    original_text_size = 0
     for original_link,final_link,html,doc in extracted_pages:
+        document_count += 1
         state = "good"
         link = original_link
         if original_link != final_link:
@@ -205,9 +212,10 @@ def index_pages(db,hostname,extracted_pages):
         if doc is not None:
             set_content_checksums(doc)
             tsz = doc["text_size"]
+            text_size += tsz
             psz = doc["paragraph_sizes_sum"]
             if tsz < TEXT_TRASH_SIZE or psz/tsz < TEXT_TRASH_RATIO:
-                state = "trash"
+                state = "small"
             # check copy
             if state == "good":
                 origsz = 0
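TEXT_TRASH_SIZE and TEXT_TRASH_RATIO are module constants defined outside this hunk. A small sketch of the per-document quality check with placeholder values, assuming text_size is the length of the extracted text and paragraph_sizes_sum the combined length of its paragraphs:

TEXT_TRASH_SIZE = 200    # placeholder value, the real constant lives elsewhere
TEXT_TRASH_RATIO = 0.6   # placeholder value

def classify_extracted_text(text_size, paragraph_sizes_sum):
    # Very short documents, or documents whose paragraphs cover only a small
    # share of the extracted text, are not worth keeping.
    if text_size < TEXT_TRASH_SIZE or paragraph_sizes_sum / text_size < TEXT_TRASH_RATIO:
        return "small"
    return "good"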
@@ -219,8 +227,11 @@ def index_pages(db,hostname,extracted_pages):
 
                 if (1 - (origsz / tsz)) > TEXT_TRASH_RATIO:
                     state = "copy"
-                print(copysz)
+                original_text_size += origsz
+                print(origsz)
         if state == "good":
+            good_document_count += 1
+            good_text_size += doc["text_size"]
             htdoc = get_link_doc(link,state)
             htdoc["html"] = html
             htdoc["html_size"] = len(html)
@@ -242,6 +253,20 @@ def index_pages(db,hostname,extracted_pages):
         linkdoc = get_link_doc(link,state)
         del linkdoc["url"]
         linkcol.update_one({"url":link},{"$set":linkdoc})
+    batchdoc = {
+        "host": linkdoc["host"],
+        "domain": linkdoc["domain"],
+        "created_at": datetime.utcnow(),
+        "good_document_count":good_document_count,
+        "document_count":document_count,
+        "text_size":text_size,
+        "good_text_size":good_text_size,
+        "original_text_size":original_text_size,
+        "batch_size": BATCHSIZE,
+        "average_fetch_characters": text_size / BATCHSIZE,
+    }
+    db["batches"].insert_one(batchdoc)
+    print(batchdoc)
 
 from bs4 import BeautifulSoup
 import urllib.parse
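The new per-batch statistics document can be exercised on its own. A self-contained sketch, assuming a local MongoDB instance and a database named "crawler" (both assumptions); the field names follow the hunk above:

from datetime import datetime
import pymongo

BATCHSIZE = 100  # placeholder, the real constant is defined elsewhere in the module

def record_batch_stats(db, host, domain, document_count, good_document_count,
                       text_size, good_text_size, original_text_size):
    # One summary document per processed batch.
    batchdoc = {
        "host": host,
        "domain": domain,
        "created_at": datetime.utcnow(),
        "good_document_count": good_document_count,
        "document_count": document_count,
        "text_size": text_size,
        "good_text_size": good_text_size,
        "original_text_size": original_text_size,
        "batch_size": BATCHSIZE,
        "average_fetch_characters": text_size / BATCHSIZE,
    }
    db["batches"].insert_one(batchdoc)
    return batchdoc

if __name__ == "__main__":
    client = pymongo.MongoClient("mongodb://localhost:27017/")
    record_batch_stats(client["crawler"], "example.org", "example.org",
                       document_count=10, good_document_count=7,
                       text_size=52000, good_text_size=48000,
                       original_text_size=45000)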
@@ -267,12 +292,11 @@ def get_bs_links(link,html):
        netloc = parsed.netloc
        path = os.path.normpath(parsed.path)
        scheme = parsed.scheme
        # internal link
        if parsed.netloc == "":
            scheme = base.scheme
            if parsed.path == "/":
                netloc = base.netloc
            else:
                netloc = base.netloc
                if not parsed.path.startswith("/"):
                    path = os.path.normpath(base.path +"/" + path)
        if not scheme.startswith("http"):
            continue
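The hunk above resolves relative links by hand (scheme, netloc and os.path.normpath on the path). For comparison, a sketch of the same resolution done with urllib.parse.urljoin; this is an alternative, not what the commit uses, and it drops non-HTTP(S) schemes the same way the loop above does:

from urllib.parse import urljoin, urlparse

def resolve_link(page_link, href):
    # urljoin resolves relative references against the page URL;
    # mailto:, javascript: and similar schemes are discarded.
    absolute = urljoin(page_link, href)
    if not urlparse(absolute).scheme.startswith("http"):
        return None
    return absolute

# resolve_link("https://example.org/a/b.html", "../c.html")
# -> "https://example.org/c.html"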
@@ -304,8 +328,6 @@ def extract_links(link_batch:list,responses:list,hostname:str,rules,default_stat
            continue
        internal_links, external_links = get_bs_links(final_link,html)
        #external_links = courlan.extract_links(html,final_link,external_bool=True,language=LANGUAGE)
        for link in external_links:
            links[link] = "frontlink"
        #internal_links = courlan.extract_links(html,final_link,external_bool=False,language=LANGUAGE)
        #print(extracted_links)
        for link in internal_links:
@@ -547,9 +569,9 @@ def sample_links(db,hostname,status,batch_size):
        trainset,testset = split_train(crawled_links)
        cl.train(trainset)
        prediction_accuracy = cl.test(testset)

    sample_set_size = SAMPLE_SET_SIZE
    res = linkcol.find({"host":hostname,"status": status})
    sample_links = []
    predicted_good = 0
    visitcounter = collections.Counter()
    good_links = []
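split_train and the classifier cl are defined outside this hunk. A hedged sketch of the kind of train/test split helper the call assumes (the shuffle and the 80/20 ratio are assumptions, not taken from the commit):

import random

def split_train(items, test_ratio=0.2):
    # Hypothetical helper: shuffle a copy of the crawled links and
    # hold out a fraction for measuring prediction accuracy.
    items = list(items)
    random.shuffle(items)
    cut = int(len(items) * (1 - test_ratio))
    return items[:cut], items[cut:]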
@@ -567,7 +589,7 @@ def sample_links(db,hostname,status,batch_size):
             visitcounter[feature] += 1
     mls = int(min(batch_size*(1- DISCOVER_LINK_RATIO),len(good_links)))
     random.shuffle(good_links)
-    links = good_links[0:mls]
+    links = list(good_links[0:mls])
     numdiscover = len(discover_links)
     eval_discover_links = []
     for link in discover_links:
@@ -582,7 +604,7 @@ def sample_links(db,hostname,status,batch_size):
     #print(eval_discover_links)
     mls = int(min(batch_size * DISCOVER_LINK_RATIO,len(eval_discover_links)))
     links += [l[0] for l in eval_discover_links[0:mls]]
-    return links
+    return list(set(links))
 
 def domain_summary(db,hostname):
     linkcol = db["links"]
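The new return value wraps the sampled links in list(set(...)), which removes duplicate URLs but does not keep the sampled order. If the order mattered, an order-preserving dedup would look like this (an alternative, not what the commit does):

def dedupe_keep_order(links):
    # dict.fromkeys keeps the first occurrence of each link, in order.
    return list(dict.fromkeys(links))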
@@ -613,6 +635,9 @@ def createdb():
     domaincol = db["domains"]
     domaincol.create_index("host",unique=True)
     domaincol.create_index("average_fetch_characters",unique=True)
+    batchcol = db["batches"]
+    batchcol.create_index("host")
+    batchcol.create_index("created_at")
 
 def parseurl(link):
     link,hostname = courlan.check_url(link)
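The host and created_at indexes on db["batches"] suggest per-host, time-ordered queries over the new batch statistics. A sketch of the kind of query they would serve, assuming a local MongoDB instance and a database named "crawler" (both assumptions):

import pymongo

client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["crawler"]

# Most recent batches for one host, newest first.
recent = (db["batches"]
          .find({"host": "example.org"})
          .sort("created_at", pymongo.DESCENDING)
          .limit(10))
for batch in recent:
    print(batch["created_at"], batch["good_document_count"], "/", batch["document_count"])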
@@ -628,6 +653,11 @@ def parseurl(link):
    doc = trafilatura.bare_extraction(html)
    import pprint
    pprint.pprint(doc)
    internal_links, external_links = get_bs_links(link,html)
    print(internal_links)
    print(external_links)



def externaldomains(link):
    html = trafilatura.fetch_url(link,decode=True)
@@ -666,7 +696,10 @@ def visit(start_link):
    # start crawling
    # frontlinks first
    links = sample_links(db,hostname,"frontlink",batch_size)
    if start_link not in links:
        links.insert(0,start_link)
    print("sampled")
    print(links)
    # index results
    print("Processing links")
    responses = []
@@ -4,4 +4,5 @@ courlan
pymongo
click
lxml
bs4
rq