Compare commits
No commits in common. "725b61d9bbaaa33c21e9f071bf2362eb2e0f9b81" and "000490bf73f4fc5cb8ad2023b260793cc30a576b" have entirely different histories.
725b61d9bb...000490bf73
@@ -1 +0,0 @@
-docker build . -t dr.kemt.fei.tuke.sk/mongocrawler:dev
@@ -169,7 +169,7 @@ def extract_pages(link_batch:list,responses:list)->list:
         out.append((original_link,final_link,html,doc))
     return out

-def set_content_checksums(doc):
+def set_content_checksum(doc):
     text = doc["text"]
     checksums,sizes = calculate_checksums(text)
     doc["text_size"] = len(text)
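The renamed helper relies on calculate_checksums, which is not part of this comparison. As an illustration only, a helper with the same return shape (a list of per-paragraph checksums and a list of their sizes) could look like the sketch below; the hashing choice and the paragraph splitting are assumptions, not the project's actual implementation.

# Hypothetical stand-in for calculate_checksums; the real implementation is
# not shown in this comparison. One MD5 digest and one length per paragraph.
import hashlib

def calculate_checksums(text: str):
    checksums, sizes = [], []
    for paragraph in text.split("\n"):
        paragraph = paragraph.strip()
        if paragraph:
            checksums.append(hashlib.md5(paragraph.encode("utf-8")).hexdigest())
            sizes.append(len(paragraph))
    return checksums, sizes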
@@ -191,14 +191,7 @@ def index_pages(db,hostname,extracted_pages):
     contentcol = db["content"]
     checkcol = db["check"]
     links = []
-    # stats of the batch
-    good_document_count = 0
-    document_count = 0
-    text_size = 0
-    good_text_size = 0
-    original_text_size = 0
     for original_link,final_link,html,doc in extracted_pages:
-        document_count += 1
         state = "good"
         link = original_link
         if original_link != final_link:
@@ -212,10 +205,9 @@ def index_pages(db,hostname,extracted_pages):
         if doc is not None:
             set_content_checksums(doc)
             tsz = doc["text_size"]
-            text_size += tsz
             psz = doc["paragraph_sizes_sum"]
             if tsz < TEXT_TRASH_SIZE or psz/tsz < TEXT_TRASH_RATIO:
-                state = "small"
+                state = "trash"
             # check copy
             if state == "good":
                 origsz = 0
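Note: the condition above drops a page when its extracted text is too short or when too little of it comes from paragraphs. Pulled out as a standalone check it reads as below; the constant values are illustrative assumptions, not taken from the repository, and the returned label follows the 000490bf73 side.

# Sketch of the quality gate applied above; constants are assumed examples.
TEXT_TRASH_SIZE = 200    # assumed minimum number of extracted characters
TEXT_TRASH_RATIO = 0.6   # assumed minimum paragraph-to-text ratio

def classify_document(doc: dict) -> str:
    tsz = doc["text_size"]
    psz = doc["paragraph_sizes_sum"]
    if tsz < TEXT_TRASH_SIZE or psz / tsz < TEXT_TRASH_RATIO:
        return "trash"
    return "good"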
@@ -227,11 +219,8 @@ def index_pages(db,hostname,extracted_pages):

                 if (1 - (origsz / tsz)) > TEXT_TRASH_RATIO:
                     state = "copy"
-                original_text_size += origsz
-                print(origsz)
+                print(copysz)
             if state == "good":
-                good_document_count += 1
-                good_text_size += doc["text_size"]
                 htdoc = get_link_doc(link,state)
                 htdoc["html"] = html
                 htdoc["html_size"] = len(html)
@@ -253,20 +242,6 @@ def index_pages(db,hostname,extracted_pages):
         linkdoc = get_link_doc(link,state)
         del linkdoc["url"]
         linkcol.update_one({"url":link},{"$set":linkdoc})
-    batchdoc = {
-        "host": linkdoc["host"],
-        "domain": linkdoc["domain"],
-        "created_at": datetime.utcnow(),
-        "good_document_count":good_document_count,
-        "document_count":document_count,
-        "text_size":text_size,
-        "good_text_size":good_text_size,
-        "original_text_size":original_text_size,
-        "batch_size": BATCHSIZE,
-        "average_fetch_characters": text_size / BATCHSIZE,
-    }
-    db["batches"].insert_one(batchdoc)
-    print(batchdoc)

 from bs4 import BeautifulSoup
 import urllib.parse
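The removed block wrote one summary document per fetch batch into the batches collection. A minimal standalone sketch of that pattern with pymongo follows; the connection string, database name and field values are assumptions for illustration.

# Per-batch statistics written to MongoDB, mirroring the removed batchdoc
# block; connection details and the sample values are illustrative only.
from datetime import datetime
from pymongo import MongoClient

db = MongoClient("mongodb://localhost:27017")["crawler"]  # assumed connection

batchdoc = {
    "host": "example.org",
    "created_at": datetime.utcnow(),
    "document_count": 100,
    "good_document_count": 42,
    "text_size": 350000,
}
db["batches"].insert_one(batchdoc)
print(batchdoc)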
@@ -292,11 +267,12 @@ def get_bs_links(link,html):
         netloc = parsed.netloc
         path = os.path.normpath(parsed.path)
         scheme = parsed.scheme
-        # internal link
         if parsed.netloc == "":
             scheme = base.scheme
-            netloc = base.netloc
-            if not parsed.path.startswith("/"):
+            if parsed.path == "/":
+                netloc = base.netloc
+            else:
+                netloc = base.netloc
                 path = os.path.normpath(base.path +"/" + path)
         if not scheme.startswith("http"):
             continue
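The rewritten branch resolves hrefs that have no network location against the base page by hand (scheme, netloc, normalized path). For comparison, urllib.parse.urljoin, which this module already imports, performs the same resolution for the common cases; the URLs below are illustrative.

# Standard-library resolution of relative links against a base page,
# equivalent in spirit to the manual scheme/netloc/path handling above.
import urllib.parse

base = "https://example.org/section/page.html"  # illustrative base URL
for href in ("/about.html", "next.html", "https://other.org/x"):
    print(urllib.parse.urljoin(base, href))
# https://example.org/about.html
# https://example.org/section/next.html
# https://other.org/x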
@@ -328,6 +304,8 @@ def extract_links(link_batch:list,responses:list,hostname:str,rules,default_stat
             continue
         internal_links, external_links = get_bs_links(final_link,html)
         #external_links = courlan.extract_links(html,final_link,external_bool=True,language=LANGUAGE)
+        for link in external_links:
+            links[link] = "frontlink"
         #internal_links = courlan.extract_links(html,final_link,external_bool=False,language=LANGUAGE)
         #print(extracted_links)
         for link in internal_links:
@@ -569,9 +547,9 @@ def sample_links(db,hostname,status,batch_size):
     trainset,testset = split_train(crawled_links)
     cl.train(trainset)
     prediction_accuracy = cl.test(testset)
-
     sample_set_size = SAMPLE_SET_SIZE
     res = linkcol.find({"host":hostname,"status": status})
+    sample_links = []
     predicted_good = 0
     visitcounter = collections.Counter()
     good_links = []
@@ -589,7 +567,7 @@ def sample_links(db,hostname,status,batch_size):
             visitcounter[feature] += 1
     mls = int(min(batch_size*(1- DISCOVER_LINK_RATIO),len(good_links)))
     random.shuffle(good_links)
-    links = list(good_links[0:mls])
+    links = good_links[0:mls]
     numdiscover = len(discover_links)
     eval_discover_links = []
     for link in discover_links:
@@ -604,7 +582,7 @@ def sample_links(db,hostname,status,batch_size):
     #print(eval_discover_links)
     mls = int(min(batch_size * DISCOVER_LINK_RATIO,len(eval_discover_links)))
     links += [l[0] for l in eval_discover_links[0:mls]]
-    return list(set(links))
+    return links

 def domain_summary(db,hostname):
     linkcol = db["links"]
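One side deduplicates the sampled links with list(set(links)) before returning, the other returns them as collected. If duplicates should be removed without losing the sampling order, dict.fromkeys is a common alternative; the values below are illustrative.

# Order-preserving deduplication: keeps the first occurrence of every link.
links = ["a", "b", "a", "c", "b"]
print(list(set(links)))            # unique, but order is not guaranteed
print(list(dict.fromkeys(links)))  # ['a', 'b', 'c']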
@@ -635,9 +613,6 @@ def createdb():
     domaincol = db["domains"]
     domaincol.create_index("host",unique=True)
     domaincol.create_index("average_fetch_characters",unique=True)
-    batchcol = db["batches"]
-    batchcol.create_index("host")
-    batchcol.create_index("created_at")

 def parseurl(link):
     link,hostname = courlan.check_url(link)
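This comparison also drops the index setup for the batches collection in createdb. For reference, pymongo's create_index with a plain field name builds an ascending, non-unique index; the connection details in the sketch are assumptions.

# Sketch of the removed index setup for the batches collection; the
# connection string and database name are assumed for illustration.
from pymongo import MongoClient

db = MongoClient("mongodb://localhost:27017")["crawler"]  # assumed connection
batchcol = db["batches"]
batchcol.create_index("host")        # ascending, non-unique index on host
batchcol.create_index("created_at")  # supports time-ordered batch queries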
@@ -653,11 +628,6 @@ def parseurl(link):
     doc = trafilatura.bare_extraction(html)
     import pprint
     pprint.pprint(doc)
-    internal_links, external_links = get_bs_links(link,html)
-    print(internal_links)
-    print(external_links)
-
-

 def externaldomains(link):
     html = trafilatura.fetch_url(link,decode=True)
@@ -696,10 +666,7 @@ def visit(start_link):
     # start crawling
     # frontlinks first
     links = sample_links(db,hostname,"frontlink",batch_size)
-    if start_link not in links:
-        links.insert(0,start_link)
-    print("sampled")
-    print(links)
+    links.insert(0,start_link)
     # index results
     print("Processing links")
     responses = []
@@ -4,5 +4,4 @@ courlan
 pymongo
 click
 lxml
-bs4
 rq