zz
This commit is contained in:
parent
8e8d4b9625
commit
44dc4be8c3
@ -21,6 +21,9 @@ import collections
|
||||
import math
|
||||
import random
|
||||
import hashlib
|
||||
from bs4 import BeautifulSoup
|
||||
import urllib.parse
|
||||
import os.path
|
||||
|
||||
LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
|
||||
DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
|
||||
@ -38,6 +41,46 @@ SAMPLE_SET_SIZE =10000
|
||||
CLASSIFIER_SET_SIZE = 200
|
||||
STOP_PATHS=["xml","rss","login","admin"]
|
||||
|
||||
|
||||
def get_bs_links(link, html):
    """Extract, resolve and normalize all ``<a href>`` links from an HTML page.

    Args:
        link: URL of the page; used as the base for relative links unless the
            document declares its own ``<base href>``.
        html: raw HTML of the page.

    Returns:
        A set of normalized absolute http(s) URLs found on the page.
    """
    bs = BeautifulSoup(html, "lxml")
    # Honor an explicit <base href="..."> if the document declares one.
    base = link
    if bs.base is not None and "href" in bs.base.attrs:
        base = bs.base["href"]
    base = urllib.parse.urlparse(courlan.normalize_url(base))

    links = set()
    # Link normalization
    for l in bs.find_all("a", href=True):
        # BeautifulSoup parses rel as a multi-valued attribute and returns a
        # list, so the original equality test `l.attrs["rel"] == "nofollow"`
        # could never match ["nofollow"]; use a membership test instead.
        if "nofollow" in (l.attrs.get("rel") or []) or "nofollow" in l.attrs:
            continue
        href = l["href"]
        try:
            parsed = urllib.parse.urlparse(courlan.normalize_url(href))
            netloc = parsed.netloc
            path = os.path.normpath(parsed.path)
            scheme = parsed.scheme
            # Internal (relative) link: inherit scheme/netloc from the base.
            if parsed.netloc == "":
                scheme = base.scheme
                netloc = base.netloc
                if not parsed.path.startswith("/"):
                    # NOTE(review): this joins against the full base path, not
                    # its directory component — confirm this matches the
                    # intended relative-URL resolution.
                    path = os.path.normpath(base.path + "/" + path)
            # Keep only http(s) links.
            if not scheme.startswith("http"):
                continue
            if path.startswith("/"):
                path = path[1:]
            if path.endswith(")"):
                # Looks like a javascript pseudo-link; skip it.
                continue
            href = urllib.parse.urlunparse((scheme, netloc, path, "", "", ""))
            href = courlan.normalize_url(href)
            links.add(href)
        except ValueError as err:
            # Malformed URL: report and skip this link.
            print(err)
    return links
|
||||
def split_train(res):
|
||||
trainset = []
|
||||
testset = []
|
||||
@ -243,7 +286,8 @@ def save_batch_info(db,host,states,docs):
|
||||
good_document_count = 0
|
||||
original_text_size = 0
|
||||
batch_size = 0
|
||||
_,domain = courlan.get_hostinfo(host)
|
||||
d = host.split(".")
|
||||
domain = d[-2] + "." + d[-1]
|
||||
for state,doc in zip(states,docs):
|
||||
batch_size += 1
|
||||
if state == "good":
|
||||
@ -261,49 +305,6 @@ def save_batch_info(db,host,states,docs):
|
||||
db["batches"].insert_one(batchdoc)
|
||||
print(batchdoc)
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
import urllib.parse
|
||||
import os.path
|
||||
|
||||
def get_bs_links(link, html):
    """Collect normalized absolute links from the anchors of an HTML page.

    ``link`` is the page URL, used as the base for relative references when
    the document has no ``<base href>`` of its own.  Returns a set of
    normalized http(s) URLs.
    """
    soup = BeautifulSoup(html, "lxml")
    base_url = link
    if soup.base is not None and "href" in soup.base.attrs:
        # The document overrides the base for relative links.
        base_url = soup.base["href"]
    base_parts = urllib.parse.urlparse(courlan.normalize_url(base_url))

    collected = set()
    # Walk every anchor that carries an href and normalize it.
    for anchor in soup.find_all("a", href=True):
        if "rel" in anchor.attrs and anchor.attrs["rel"] == "nofollow" or "nofollow" in anchor.attrs:
            continue
        raw_href = anchor["href"]
        try:
            parts = urllib.parse.urlparse(courlan.normalize_url(raw_href))
            scheme = parts.scheme
            netloc = parts.netloc
            path = os.path.normpath(parts.path)
            if netloc == "":
                # Relative link: take scheme and host from the base URL.
                scheme = base_parts.scheme
                netloc = base_parts.netloc
                if not parts.path.startswith("/"):
                    path = os.path.normpath(base_parts.path + "/" + path)
            if not scheme.startswith("http"):
                continue
            if path.startswith("/"):
                path = path[1:]
            if path.endswith(")"):
                # Probably a javascript pseudo-link.
                continue
            rebuilt = urllib.parse.urlunparse((scheme, netloc, path, "", "", ""))
            collected.add(courlan.normalize_url(rebuilt))
        except ValueError as err:
            # Skip URLs that fail to parse, but report them.
            print(err)
    return collected
|
||||
|
||||
def extract_links(link_batch:list,responses:list,hostname:str,rules,default_status="frontlink")->list:
|
||||
links = {}
|
||||
@ -721,8 +722,7 @@ def crawl_summary():
|
||||
{"$group":{"_id":"$host",
|
||||
"document_count":{"$sum":"$document_count"},
|
||||
"good_document_count":{"$sum":"$good_document_count"},
|
||||
"batch_count":{"$sum":"$batch_size"},
|
||||
"text_size":{"$sum":"$text_size"},
|
||||
"batch_size":{"$sum":"$batch_size"},
|
||||
"original_text_size":{"$sum":"$original_text_size"},
|
||||
}
|
||||
},
|
||||
|
Loading…
Reference in New Issue
Block a user