zz
parent 8e8d4b9625
commit 44dc4be8c3
@@ -21,6 +21,9 @@ import collections
 import math
 import random
 import hashlib
+from bs4 import BeautifulSoup
+import urllib.parse
+import os.path
 
 LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
 DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
@@ -38,6 +41,46 @@ SAMPLE_SET_SIZE =10000
 CLASSIFIER_SET_SIZE = 200
 STOP_PATHS=["xml","rss","login","admin"]
 
+
+def get_bs_links(link,html):
+    # Extract links from the page
+    bs = BeautifulSoup(html, "lxml")
+    base = link
+    if bs.base is not None and "href" in bs.base.attrs:
+        base = bs.base["href"]
+    base = urllib.parse.urlparse(courlan.normalize_url(base))
+
+    links = set()
+    # Normalize the links
+    for l in bs.find_all("a", href=True):
+        if "rel" in l.attrs and l.attrs["rel"] == "nofollow" or "nofollow" in l.attrs:
+            continue
+        href = l["href"]
+        try:
+            parsed = urllib.parse.urlparse(courlan.normalize_url(href))
+            netloc = parsed.netloc
+            path = os.path.normpath(parsed.path)
+            scheme = parsed.scheme
+            # internal link
+            if parsed.netloc == "":
+                scheme = base.scheme
+                netloc = base.netloc
+                if not parsed.path.startswith("/"):
+                    path = os.path.normpath(base.path +"/" + path)
+            if not scheme.startswith("http"):
+                continue
+            if path.startswith("/"):
+                path = path[1:]
+            if path.endswith(")"):
+                # javascript
+                continue
+            href = urllib.parse.urlunparse((scheme,netloc,path,"","",""))
+            href = courlan.normalize_url(href)
+            links.add(href)
+        except ValueError as err:
+            print(err)
+            pass
+    return links
 def split_train(res):
     trainset = []
     testset = []
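For context: the helper added above takes the page URL (used as the base for relative links) and the raw HTML, and returns a set of normalized absolute http(s) links. A minimal usage sketch, assuming this module's get_bs_links is importable with courlan installed, and using plain urllib.request for the fetch (an assumption for illustration, not how the crawler itself fetches):

# Sketch: fetch one page and list its outgoing links.
# The URL is a placeholder; get_bs_links comes from the module above.
import urllib.request

url = "https://example.com/"
with urllib.request.urlopen(url) as resp:
    html = resp.read().decode("utf-8", errors="replace")

for out_link in sorted(get_bs_links(url, html)):
    print(out_link)

Note that operator precedence parses the nofollow test as ("rel" in l.attrs and l.attrs["rel"] == "nofollow") or ("nofollow" in l.attrs), and BeautifulSoup typically exposes rel as a list such as ["nofollow"], so a check like "nofollow" in l.attrs.get("rel", []) may be closer to the intent.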
@@ -243,7 +286,8 @@ def save_batch_info(db,host,states,docs):
     good_document_count = 0
     original_text_size = 0
     batch_size = 0
-    _,domain = courlan.get_hostinfo(host)
+    d = host.split(".")
+    domain = d[-2] + "." + d[-1]
    for state,doc in zip(states,docs):
        batch_size += 1
        if state == "good":
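The new code derives the domain from the last two host labels instead of courlan.get_hostinfo. A short sketch, with hypothetical hostnames, of where the two approaches agree and where they diverge (multi-label public suffixes such as co.uk):

# Sketch (hypothetical hostnames): the last-two-labels heuristic used above.
for host in ["www.sme.sk", "blog.example.co.uk"]:
    d = host.split(".")
    domain = d[-2] + "." + d[-1]
    print(host, "->", domain)
# www.sme.sk -> sme.sk          (same as a public-suffix-aware lookup)
# blog.example.co.uk -> co.uk   (a public-suffix-aware parser such as
#                                courlan's would typically yield example.co.uk)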
@@ -261,49 +305,6 @@ def save_batch_info(db,host,states,docs):
     db["batches"].insert_one(batchdoc)
     print(batchdoc)
 
-from bs4 import BeautifulSoup
-import urllib.parse
-import os.path
-
-def get_bs_links(link,html):
-    # Extract links from the page
-    bs = BeautifulSoup(html, "lxml")
-    base = link
-    if bs.base is not None and "href" in bs.base.attrs:
-        base = bs.base["href"]
-    base = urllib.parse.urlparse(courlan.normalize_url(base))
-
-    links = set()
-    # Normalize the links
-    for l in bs.find_all("a", href=True):
-        if "rel" in l.attrs and l.attrs["rel"] == "nofollow" or "nofollow" in l.attrs:
-            continue
-        href = l["href"]
-        try:
-            parsed = urllib.parse.urlparse(courlan.normalize_url(href))
-            netloc = parsed.netloc
-            path = os.path.normpath(parsed.path)
-            scheme = parsed.scheme
-            # internal link
-            if parsed.netloc == "":
-                scheme = base.scheme
-                netloc = base.netloc
-                if not parsed.path.startswith("/"):
-                    path = os.path.normpath(base.path +"/" + path)
-            if not scheme.startswith("http"):
-                continue
-            if path.startswith("/"):
-                path = path[1:]
-            if path.endswith(")"):
-                # javascript
-                continue
-            href = urllib.parse.urlunparse((scheme,netloc,path,"","",""))
-            href = courlan.normalize_url(href)
-            links.add(href)
-        except ValueError as err:
-            print(err)
-            pass
-    return links
-
 def extract_links(link_batch:list,responses:list,hostname:str,rules,default_status="frontlink")->list:
     links = {}
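The function body removed here is identical to the one added near the module imports in the earlier hunk, so the commit moves get_bs_links (and its local imports) rather than deleting it.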
@@ -721,8 +722,7 @@ def crawl_summary():
         {"$group":{"_id":"$host",
             "document_count":{"$sum":"$document_count"},
             "good_document_count":{"$sum":"$good_document_count"},
-            "batch_count":{"$sum":"$batch_size"},
-            "text_size":{"$sum":"$text_size"},
+            "batch_size":{"$sum":"$batch_size"},
             "original_text_size":{"$sum":"$original_text_size"},
             }
         },
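For reference, a sketch of running the amended $group stage with pymongo. The collection name batches comes from the code above; the connection URI and the database name "sucker" are assumptions for illustration:

# Sketch: aggregate per-host batch statistics from the "batches" collection.
from pymongo import MongoClient

db = MongoClient("mongodb://localhost:27017")["sucker"]  # db name assumed
pipeline = [
    {"$group": {"_id": "$host",
        "document_count": {"$sum": "$document_count"},
        "good_document_count": {"$sum": "$good_document_count"},
        "batch_size": {"$sum": "$batch_size"},
        "original_text_size": {"$sum": "$original_text_size"},
    }},
]
for row in db["batches"].aggregate(pipeline):
    print(row)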