zz
This commit is contained in:
		
							parent
							
								
									ce8f939980
								
							
						
					
					
						commit
						9d06223012
					
				@ -35,6 +35,7 @@ TEXT_TRASH_RATIO=0.6
 | 
			
		||||
DISCOVER_LINK_RATIO = 0.3
 | 
			
		||||
SAMPLE_SET_SIZE =10000
 | 
			
		||||
CLASSIFIER_SET_SIZE = 200
 | 
			
		||||
STOP_PATHS=["xml","rss","login","admin"]
 | 
			
		||||
 | 
			
		||||
def split_train(res):
 | 
			
		||||
    trainset = []
 | 
			
		||||
@ -86,6 +87,11 @@ def is_link_good(link):
 | 
			
		||||
    if r is None:
 | 
			
		||||
        return None
 | 
			
		||||
    llink,lhostname = r
 | 
			
		||||
    paths = set(llink.split("/"))
 | 
			
		||||
    for item in STOP_PATHS:
 | 
			
		||||
        if item in paths:
 | 
			
		||||
            return None
 | 
			
		||||
 | 
			
		||||
    #print(llink,lhostname)
 | 
			
		||||
    # hostname rules
 | 
			
		||||
    if not lhostname.endswith(DOMAIN):
 | 
			
		||||
@ -268,6 +274,9 @@ def get_bs_links(link,html):
 | 
			
		||||
                continue
 | 
			
		||||
            if path.startswith("/"):
 | 
			
		||||
                path = path[1:]
 | 
			
		||||
            if path.endswith(")"):
 | 
			
		||||
                # javascript
 | 
			
		||||
                continue
 | 
			
		||||
            external = True
 | 
			
		||||
            if parsed.netloc == base.netloc:
 | 
			
		||||
                external = False
 | 
			
		||||
 | 
			
		||||
		Loading…
	
		Reference in New Issue
	
	Block a user