Compare commits


No commits in common. "725b61d9bbaaa33c21e9f071bf2362eb2e0f9b81" and "000490bf73f4fc5cb8ad2023b260793cc30a576b" have entirely different histories.

3 changed files with 13 additions and 48 deletions

View File

@@ -1 +0,0 @@
-docker build . -t dr.kemt.fei.tuke.sk/mongocrawler:dev

View File

@@ -169,7 +169,7 @@ def extract_pages(link_batch:list,responses:list)->list:
         out.append((original_link,final_link,html,doc))
     return out
 
-def set_content_checksums(doc):
+def set_content_checksum(doc):
     text = doc["text"]
     checksums,sizes = calculate_checksums(text)
     doc["text_size"] = len(text)
@@ -191,14 +191,7 @@ def index_pages(db,hostname,extracted_pages):
     contentcol = db["content"]
     checkcol = db["check"]
     links = []
-    # stats of the batch
-    good_document_count = 0
-    document_count = 0
-    text_size = 0
-    good_text_size = 0
-    original_text_size = 0
     for original_link,final_link,html,doc in extracted_pages:
-        document_count += 1
         state = "good"
         link = original_link
         if original_link != final_link:
@@ -212,10 +205,9 @@ def index_pages(db,hostname,extracted_pages):
         if doc is not None:
             set_content_checksums(doc)
             tsz = doc["text_size"]
-            text_size += tsz
             psz = doc["paragraph_sizes_sum"]
             if tsz < TEXT_TRASH_SIZE or psz/tsz < TEXT_TRASH_RATIO:
-                state = "small"
+                state = "trash"
             # check copy
             if state == "good":
                 origsz = 0
@@ -227,11 +219,8 @@ def index_pages(db,hostname,extracted_pages):
                 if (1 - (origsz / tsz)) > TEXT_TRASH_RATIO:
                     state = "copy"
-                original_text_size += origsz
-                print(origsz)
+                print(copysz)
 
         if state == "good":
-            good_document_count += 1
-            good_text_size += doc["text_size"]
             htdoc = get_link_doc(link,state)
             htdoc["html"] = html
             htdoc["html_size"] = len(html)
@@ -253,20 +242,6 @@ def index_pages(db,hostname,extracted_pages):
         linkdoc = get_link_doc(link,state)
         del linkdoc["url"]
         linkcol.update_one({"url":link},{"$set":linkdoc})
-    batchdoc = {
-        "host": linkdoc["host"],
-        "domain": linkdoc["domain"],
-        "created_at": datetime.utcnow(),
-        "good_document_count":good_document_count,
-        "document_count":document_count,
-        "text_size":text_size,
-        "good_text_size":good_text_size,
-        "original_text_size":original_text_size,
-        "batch_size": BATCHSIZE,
-        "average_fetch_characters": text_size / BATCHSIZE,
-    }
-    db["batches"].insert_one(batchdoc)
-    print(batchdoc)
 
 from bs4 import BeautifulSoup
 import urllib.parse
@@ -292,11 +267,12 @@ def get_bs_links(link,html):
         netloc = parsed.netloc
         path = os.path.normpath(parsed.path)
         scheme = parsed.scheme
-        # internal link
         if parsed.netloc == "":
             scheme = base.scheme
-            netloc = base.netloc
-            if not parsed.path.startswith("/"):
+            if parsed.path == "/":
+                netloc = base.netloc
+            else:
+                netloc = base.netloc
                 path = os.path.normpath(base.path +"/" + path)
         if not scheme.startswith("http"):
             continue
@@ -328,6 +304,8 @@ def extract_links(link_batch:list,responses:list,hostname:str,rules,default_stat
             continue
         internal_links, external_links = get_bs_links(final_link,html)
         #external_links = courlan.extract_links(html,final_link,external_bool=True,language=LANGUAGE)
+        for link in external_links:
+            links[link] = "frontlink"
         #internal_links = courlan.extract_links(html,final_link,external_bool=False,language=LANGUAGE)
         #print(extracted_links)
         for link in internal_links:
@@ -569,9 +547,9 @@ def sample_links(db,hostname,status,batch_size):
         trainset,testset = split_train(crawled_links)
         cl.train(trainset)
         prediction_accuracy = cl.test(testset)
     sample_set_size = SAMPLE_SET_SIZE
     res = linkcol.find({"host":hostname,"status": status})
-    sample_links = []
+
     predicted_good = 0
     visitcounter = collections.Counter()
     good_links = []
@@ -589,7 +567,7 @@ def sample_links(db,hostname,status,batch_size):
             visitcounter[feature] += 1
     mls = int(min(batch_size*(1- DISCOVER_LINK_RATIO),len(good_links)))
     random.shuffle(good_links)
-    links = list(good_links[0:mls])
+    links = good_links[0:mls]
     numdiscover = len(discover_links)
     eval_discover_links = []
     for link in discover_links:
@@ -604,7 +582,7 @@ def sample_links(db,hostname,status,batch_size):
     #print(eval_discover_links)
     mls = int(min(batch_size * DISCOVER_LINK_RATIO,len(eval_discover_links)))
     links += [l[0] for l in eval_discover_links[0:mls]]
-    return list(set(links))
+    return links
 
 def domain_summary(db,hostname):
     linkcol = db["links"]
@@ -635,9 +613,6 @@ def createdb():
     domaincol = db["domains"]
     domaincol.create_index("host",unique=True)
     domaincol.create_index("average_fetch_characters",unique=True)
-    batchcol = db["batches"]
-    batchcol.create_index("host")
-    batchcol.create_index("created_at")
 
 def parseurl(link):
     link,hostname = courlan.check_url(link)
@@ -653,11 +628,6 @@ def parseurl(link):
     doc = trafilatura.bare_extraction(html)
     import pprint
     pprint.pprint(doc)
-    internal_links, external_links = get_bs_links(link,html)
-    print(internal_links)
-    print(external_links)
-
-
 
 def externaldomains(link):
     html = trafilatura.fetch_url(link,decode=True)
@@ -696,10 +666,7 @@ def visit(start_link):
     # start crawling
     # frontlinks first
     links = sample_links(db,hostname,"frontlink",batch_size)
-    if start_link not in links:
-        links.insert(0,start_link)
-    print("sampled")
-    print(links)
+    links.insert(0,start_link)
     # index results
     print("Processing links")
     responses = []

View File

@@ -4,5 +4,4 @@ courlan
 pymongo
 click
 lxml
-bs4
 rq