zz
This commit is contained in:
parent
ce8f939980
commit
9d06223012
@ -35,6 +35,7 @@ TEXT_TRASH_RATIO=0.6
|
|||||||
DISCOVER_LINK_RATIO = 0.3
|
DISCOVER_LINK_RATIO = 0.3
|
||||||
SAMPLE_SET_SIZE =10000
|
SAMPLE_SET_SIZE =10000
|
||||||
CLASSIFIER_SET_SIZE = 200
|
CLASSIFIER_SET_SIZE = 200
|
||||||
|
STOP_PATHS=["xml","rss","login","admin"]
|
||||||
|
|
||||||
def split_train(res):
|
def split_train(res):
|
||||||
trainset = []
|
trainset = []
|
||||||
@ -86,6 +87,11 @@ def is_link_good(link):
|
|||||||
if r is None:
|
if r is None:
|
||||||
return None
|
return None
|
||||||
llink,lhostname = r
|
llink,lhostname = r
|
||||||
|
paths = set(llink.split("/"))
|
||||||
|
for item in STOP_PATHS:
|
||||||
|
if item in paths:
|
||||||
|
return None
|
||||||
|
|
||||||
#print(llink,lhostname)
|
#print(llink,lhostname)
|
||||||
# hostname rules
|
# hostname rules
|
||||||
if not lhostname.endswith(DOMAIN):
|
if not lhostname.endswith(DOMAIN):
|
||||||
@ -268,6 +274,9 @@ def get_bs_links(link,html):
|
|||||||
continue
|
continue
|
||||||
if path.startswith("/"):
|
if path.startswith("/"):
|
||||||
path = path[1:]
|
path = path[1:]
|
||||||
|
if path.endswith(")"):
|
||||||
|
# javascript
|
||||||
|
continue
|
||||||
external = True
|
external = True
|
||||||
if parsed.netloc == base.netloc:
|
if parsed.netloc == base.netloc:
|
||||||
external = False
|
external = False
|
||||||
|
Loading…
Reference in New Issue
Block a user