Daniel Hládek 2023-04-11 13:41:28 +02:00
parent 000490bf73
commit d66060d8e6
2 changed files with 47 additions and 13 deletions

View File

@@ -169,7 +169,7 @@ def extract_pages(link_batch:list,responses:list)->list:
         out.append((original_link,final_link,html,doc))
     return out

-def set_content_checksum(doc):
+def set_content_checksums(doc):
     text = doc["text"]
     checksums,sizes = calculate_checksums(text)
     doc["text_size"] = len(text)
@@ -191,7 +191,14 @@ def index_pages(db,hostname,extracted_pages):
     contentcol = db["content"]
     checkcol = db["check"]
     links = []
+    # stats of the batch
+    good_document_count = 0
+    document_count = 0
+    text_size = 0
+    good_text_size = 0
+    original_text_size = 0
     for original_link,final_link,html,doc in extracted_pages:
+        document_count += 1
         state = "good"
         link = original_link
         if original_link != final_link:
@@ -205,9 +212,10 @@ def index_pages(db,hostname,extracted_pages):
         if doc is not None:
             set_content_checksums(doc)
             tsz = doc["text_size"]
+            text_size += tsz
             psz = doc["paragraph_sizes_sum"]
             if tsz < TEXT_TRASH_SIZE or psz/tsz < TEXT_TRASH_RATIO:
-                state = "trash"
+                state = "small"
             # check copy
             if state == "good":
                 origsz = 0
@@ -219,8 +227,11 @@ def index_pages(db,hostname,extracted_pages):
                 if (1 - (origsz / tsz)) > TEXT_TRASH_RATIO:
                     state = "copy"
-                print(copysz)
-                print(origsz)
+                original_text_size += origsz
         if state == "good":
+            good_document_count += 1
+            good_text_size += doc["text_size"]
             htdoc = get_link_doc(link,state)
             htdoc["html"] = html
             htdoc["html_size"] = len(html)
@@ -242,6 +253,20 @@ def index_pages(db,hostname,extracted_pages):
         linkdoc = get_link_doc(link,state)
         del linkdoc["url"]
         linkcol.update_one({"url":link},{"$set":linkdoc})
+    batchdoc = {
+        "host": linkdoc["host"],
+        "domain": linkdoc["domain"],
+        "created_at": datetime.utcnow(),
+        "good_document_count":good_document_count,
+        "document_count":document_count,
+        "text_size":text_size,
+        "good_text_size":good_text_size,
+        "original_text_size":original_text_size,
+        "batch_size": BATCHSIZE,
+        "average_fetch_characters": text_size / BATCHSIZE,
+    }
+    db["batches"].insert_one(batchdoc)
+    print(batchdoc)

 from bs4 import BeautifulSoup
 import urllib.parse
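Each indexing pass now writes one summary document per batch into a "batches" collection, keyed by host and creation time. A minimal sketch of how such a collection could be inspected with pymongo to compare fetch quality per host; the connection string and database name ("crawler") are assumptions, only the "batches" collection name and its field names come from the diff.

# Sketch: summarize the per-batch statistics written by index_pages.
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")
db = client["crawler"]  # assumed database name

pipeline = [
    {"$group": {
        "_id": "$host",
        "batches": {"$sum": 1},
        "documents": {"$sum": "$document_count"},
        "good_documents": {"$sum": "$good_document_count"},
        "avg_fetch_characters": {"$avg": "$average_fetch_characters"},
    }},
    {"$sort": {"avg_fetch_characters": -1}},
]
for row in db["batches"].aggregate(pipeline):
    print(row)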
@@ -267,12 +292,11 @@ def get_bs_links(link,html):
             netloc = parsed.netloc
             path = os.path.normpath(parsed.path)
             scheme = parsed.scheme
+            # internal link
             if parsed.netloc == "":
                 scheme = base.scheme
-                if parsed.path == "/":
-                    netloc = base.netloc
-                else:
-                    netloc = base.netloc
+                netloc = base.netloc
+                if not parsed.path.startswith("/"):
                     path = os.path.normpath(base.path +"/" + path)
             if not scheme.startswith("http"):
                 continue
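The rewritten branch treats every href without a netloc as an internal link, inherits scheme and netloc from the base URL, and prepends the base path only when the href is relative. A standalone sketch of just that resolution branch; the helper name and sample URLs are illustrative, and any normalization done elsewhere in get_bs_links is omitted.

# Sketch of the relative-link resolution branch shown above.
import os
import urllib.parse

def resolve(base_url: str, href: str) -> str:
    base = urllib.parse.urlparse(base_url)
    parsed = urllib.parse.urlparse(href)
    netloc = parsed.netloc
    path = os.path.normpath(parsed.path)
    scheme = parsed.scheme
    # internal link: inherit scheme and netloc from the base URL
    if parsed.netloc == "":
        scheme = base.scheme
        netloc = base.netloc
        if not parsed.path.startswith("/"):
            path = os.path.normpath(base.path + "/" + path)
    return urllib.parse.urlunparse((scheme, netloc, path, "", parsed.query, ""))

print(resolve("https://example.com/section/", "page.html"))
# -> https://example.com/section/page.html
print(resolve("https://example.com/section/", "https://other.example.org/x"))
# -> https://other.example.org/x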
@@ -304,8 +328,6 @@ def extract_links(link_batch:list,responses:list,hostname:str,rules,default_stat
             continue
         internal_links, external_links = get_bs_links(final_link,html)
         #external_links = courlan.extract_links(html,final_link,external_bool=True,language=LANGUAGE)
-        for link in external_links:
-            links[link] = "frontlink"
         #internal_links = courlan.extract_links(html,final_link,external_bool=False,language=LANGUAGE)
         #print(extracted_links)
         for link in internal_links:
@ -547,9 +569,9 @@ def sample_links(db,hostname,status,batch_size):
trainset,testset = split_train(crawled_links) trainset,testset = split_train(crawled_links)
cl.train(trainset) cl.train(trainset)
prediction_accuracy = cl.test(testset) prediction_accuracy = cl.test(testset)
sample_set_size = SAMPLE_SET_SIZE sample_set_size = SAMPLE_SET_SIZE
res = linkcol.find({"host":hostname,"status": status}) res = linkcol.find({"host":hostname,"status": status})
sample_links = []
predicted_good = 0 predicted_good = 0
visitcounter = collections.Counter() visitcounter = collections.Counter()
good_links = [] good_links = []
@@ -567,7 +589,7 @@ def sample_links(db,hostname,status,batch_size):
             visitcounter[feature] += 1
     mls = int(min(batch_size*(1- DISCOVER_LINK_RATIO),len(good_links)))
     random.shuffle(good_links)
-    links = good_links[0:mls]
+    links = list(good_links[0:mls])
     numdiscover = len(discover_links)
     eval_discover_links = []
     for link in discover_links:
@@ -582,7 +604,7 @@ def sample_links(db,hostname,status,batch_size):
     #print(eval_discover_links)
     mls = int(min(batch_size * DISCOVER_LINK_RATIO,len(eval_discover_links)))
     links += [l[0] for l in eval_discover_links[0:mls]]
-    return links
+    return list(set(links))

 def domain_summary(db,hostname):
     linkcol = db["links"]
@@ -613,6 +635,9 @@ def createdb():
     domaincol = db["domains"]
     domaincol.create_index("host",unique=True)
     domaincol.create_index("average_fetch_characters",unique=True)
+    batchcol = db["batches"]
+    batchcol.create_index("host")
+    batchcol.create_index("created_at")

 def parseurl(link):
     link,hostname = courlan.check_url(link)
@ -628,6 +653,11 @@ def parseurl(link):
doc = trafilatura.bare_extraction(html) doc = trafilatura.bare_extraction(html)
import pprint import pprint
pprint.pprint(doc) pprint.pprint(doc)
internal_links, external_links = get_bs_links(link,html)
print(internal_links)
print(external_links)
def externaldomains(link): def externaldomains(link):
html = trafilatura.fetch_url(link,decode=True) html = trafilatura.fetch_url(link,decode=True)
@ -666,7 +696,10 @@ def visit(start_link):
# start crawling # start crawling
# frontlinks first # frontlinks first
links = sample_links(db,hostname,"frontlink",batch_size) links = sample_links(db,hostname,"frontlink",batch_size)
if start_link not in links:
links.insert(0,start_link) links.insert(0,start_link)
print("sampled")
print(links)
# index results # index results
print("Processing links") print("Processing links")
responses = [] responses = []

View File

@@ -4,4 +4,5 @@ courlan
 pymongo
 click
 lxml
+bs4
 rq