commit d66060d8e6
parent 000490bf73

    zz
@@ -169,7 +169,7 @@ def extract_pages(link_batch:list,responses:list)->list:
         out.append((original_link,final_link,html,doc))
     return out
 
-def set_content_checksum(doc):
+def set_content_checksums(doc):
     text = doc["text"]
     checksums,sizes = calculate_checksums(text)
     doc["text_size"] = len(text)
@@ -191,7 +191,14 @@ def index_pages(db,hostname,extracted_pages):
     contentcol = db["content"]
     checkcol = db["check"]
     links = []
+    # stats of the batch
+    good_document_count = 0
+    document_count = 0
+    text_size = 0
+    good_text_size = 0
+    original_text_size = 0
     for original_link,final_link,html,doc in extracted_pages:
+        document_count += 1
         state = "good"
         link = original_link
         if original_link != final_link:
@@ -205,9 +212,10 @@ def index_pages(db,hostname,extracted_pages):
         if doc is not None:
             set_content_checksums(doc)
             tsz = doc["text_size"]
+            text_size += tsz
             psz = doc["paragraph_sizes_sum"]
             if tsz < TEXT_TRASH_SIZE or psz/tsz < TEXT_TRASH_RATIO:
-                state = "trash"
+                state = "small"
             # check copy
             if state == "good":
                 origsz = 0
@@ -219,8 +227,11 @@ def index_pages(db,hostname,extracted_pages):
 
                 if (1 - (origsz / tsz)) > TEXT_TRASH_RATIO:
                     state = "copy"
-                print(copysz)
+                original_text_size += origsz
+                print(origsz)
         if state == "good":
+            good_document_count += 1
+            good_text_size += doc["text_size"]
             htdoc = get_link_doc(link,state)
             htdoc["html"] = html
             htdoc["html_size"] = len(html)
@@ -242,6 +253,20 @@ def index_pages(db,hostname,extracted_pages):
         linkdoc = get_link_doc(link,state)
         del linkdoc["url"]
         linkcol.update_one({"url":link},{"$set":linkdoc})
+    batchdoc = {
+        "host": linkdoc["host"],
+        "domain": linkdoc["domain"],
+        "created_at": datetime.utcnow(),
+        "good_document_count":good_document_count,
+        "document_count":document_count,
+        "text_size":text_size,
+        "good_text_size":good_text_size,
+        "original_text_size":original_text_size,
+        "batch_size": BATCHSIZE,
+        "average_fetch_characters": text_size / BATCHSIZE,
+    }
+    db["batches"].insert_one(batchdoc)
+    print(batchdoc)
 
 from bs4 import BeautifulSoup
 import urllib.parse
@@ -267,12 +292,11 @@ def get_bs_links(link,html):
         netloc = parsed.netloc
         path = os.path.normpath(parsed.path)
         scheme = parsed.scheme
+        # internal link
         if parsed.netloc == "":
             scheme = base.scheme
-            if parsed.path == "/":
-                netloc = base.netloc
-            else:
-                netloc = base.netloc
+            netloc = base.netloc
+            if not parsed.path.startswith("/"):
                 path = os.path.normpath(base.path +"/" + path)
         if not scheme.startswith("http"):
             continue
@@ -304,8 +328,6 @@ def extract_links(link_batch:list,responses:list,hostname:str,rules,default_stat
             continue
         internal_links, external_links = get_bs_links(final_link,html)
         #external_links = courlan.extract_links(html,final_link,external_bool=True,language=LANGUAGE)
-        for link in external_links:
-            links[link] = "frontlink"
         #internal_links = courlan.extract_links(html,final_link,external_bool=False,language=LANGUAGE)
         #print(extracted_links)
         for link in internal_links:
@@ -547,9 +569,9 @@ def sample_links(db,hostname,status,batch_size):
         trainset,testset = split_train(crawled_links)
         cl.train(trainset)
         prediction_accuracy = cl.test(testset)
 
     sample_set_size = SAMPLE_SET_SIZE
     res = linkcol.find({"host":hostname,"status": status})
-    sample_links = []
     predicted_good = 0
     visitcounter = collections.Counter()
     good_links = []
@@ -567,7 +589,7 @@ def sample_links(db,hostname,status,batch_size):
             visitcounter[feature] += 1
     mls = int(min(batch_size*(1- DISCOVER_LINK_RATIO),len(good_links)))
     random.shuffle(good_links)
-    links = good_links[0:mls]
+    links = list(good_links[0:mls])
     numdiscover = len(discover_links)
     eval_discover_links = []
     for link in discover_links:
@@ -582,7 +604,7 @@ def sample_links(db,hostname,status,batch_size):
     #print(eval_discover_links)
     mls = int(min(batch_size * DISCOVER_LINK_RATIO,len(eval_discover_links)))
     links += [l[0] for l in eval_discover_links[0:mls]]
-    return links
+    return list(set(links))
 
 def domain_summary(db,hostname):
     linkcol = db["links"]
@@ -613,6 +635,9 @@ def createdb():
     domaincol = db["domains"]
     domaincol.create_index("host",unique=True)
     domaincol.create_index("average_fetch_characters",unique=True)
+    batchcol = db["batches"]
+    batchcol.create_index("host")
+    batchcol.create_index("created_at")
 
 def parseurl(link):
     link,hostname = courlan.check_url(link)
@@ -628,6 +653,11 @@ def parseurl(link):
     doc = trafilatura.bare_extraction(html)
     import pprint
     pprint.pprint(doc)
+    internal_links, external_links = get_bs_links(link,html)
+    print(internal_links)
+    print(external_links)
+
+
 
 def externaldomains(link):
     html = trafilatura.fetch_url(link,decode=True)
@@ -666,7 +696,10 @@ def visit(start_link):
     # start crawling
     # frontlinks first
     links = sample_links(db,hostname,"frontlink",batch_size)
-    links.insert(0,start_link)
+    if start_link not in links:
+        links.insert(0,start_link)
+    print("sampled")
+    print(links)
     # index results
     print("Processing links")
     responses = []
@@ -4,4 +4,5 @@ courlan
 pymongo
 click
 lxml
+bs4
 rq