This commit is contained in:
Daniel Hládek 2023-04-08 10:33:40 +02:00
parent ce8f939980
commit 9d06223012

View File

@ -35,6 +35,7 @@ TEXT_TRASH_RATIO=0.6
DISCOVER_LINK_RATIO = 0.3 DISCOVER_LINK_RATIO = 0.3
SAMPLE_SET_SIZE =10000 SAMPLE_SET_SIZE =10000
CLASSIFIER_SET_SIZE = 200 CLASSIFIER_SET_SIZE = 200
STOP_PATHS=["xml","rss","login","admin"]
def split_train(res): def split_train(res):
trainset = [] trainset = []
@ -86,6 +87,11 @@ def is_link_good(link):
if r is None: if r is None:
return None return None
llink,lhostname = r llink,lhostname = r
paths = set(llink.split("/"))
for item in STOP_PATHS:
if item in paths:
return None
#print(llink,lhostname) #print(llink,lhostname)
# hostname rules # hostname rules
if not lhostname.endswith(DOMAIN): if not lhostname.endswith(DOMAIN):
@ -268,6 +274,9 @@ def get_bs_links(link,html):
continue continue
if path.startswith("/"): if path.startswith("/"):
path = path[1:] path = path[1:]
if path.endswith(")"):
# javascript
continue
external = True external = True
if parsed.netloc == base.netloc: if parsed.netloc == base.netloc:
external = False external = False