zz
This commit is contained in:
parent
ce8f939980
commit
9d06223012
@ -35,6 +35,7 @@ TEXT_TRASH_RATIO=0.6
|
||||
DISCOVER_LINK_RATIO = 0.3
|
||||
SAMPLE_SET_SIZE =10000
|
||||
CLASSIFIER_SET_SIZE = 200
|
||||
STOP_PATHS=["xml","rss","login","admin"]
|
||||
|
||||
def split_train(res):
|
||||
trainset = []
|
||||
@ -86,6 +87,11 @@ def is_link_good(link):
|
||||
if r is None:
|
||||
return None
|
||||
llink,lhostname = r
|
||||
paths = set(llink.split("/"))
|
||||
for item in STOP_PATHS:
|
||||
if item in paths:
|
||||
return None
|
||||
|
||||
#print(llink,lhostname)
|
||||
# hostname rules
|
||||
if not lhostname.endswith(DOMAIN):
|
||||
@ -268,6 +274,9 @@ def get_bs_links(link,html):
|
||||
continue
|
||||
if path.startswith("/"):
|
||||
path = path[1:]
|
||||
if path.endswith(")"):
|
||||
# javascript
|
||||
continue
|
||||
external = True
|
||||
if parsed.netloc == base.netloc:
|
||||
external = False
|
||||
|
Loading…
Reference in New Issue
Block a user