zz
This commit is contained in:
		
							parent
							
								
									6567de421c
								
							
						
					
					
						commit
						ab7ca1476f
					
				@ -184,7 +184,6 @@ def index_pages(db,hostname,extracted_pages):
 | 
				
			|||||||
            doc["paragraph_checksums"] = checksums
 | 
					            doc["paragraph_checksums"] = checksums
 | 
				
			||||||
            doc["paragraph_sizes"] = sizes
 | 
					            doc["paragraph_sizes"] = sizes
 | 
				
			||||||
            goodsz = sum(sizes)
 | 
					            goodsz = sum(sizes)
 | 
				
			||||||
            doc["paragraph_sizes_sum"] = goodsz
 | 
					 | 
				
			||||||
            # Not enough larger paragraphs
 | 
					            # Not enough larger paragraphs
 | 
				
			||||||
            if len(text) < TEXT_TRASH_SIZE or goodsz/len(text) < TEXT_TRASH_RATIO:
 | 
					            if len(text) < TEXT_TRASH_SIZE or goodsz/len(text) < TEXT_TRASH_RATIO:
 | 
				
			||||||
                state = "trash"
 | 
					                state = "trash"
 | 
				
			||||||
@ -263,9 +262,54 @@ def index_links(db,extracted_links):
 | 
				
			|||||||
        except pymongo.errors.DuplicateKeyError as ex:
 | 
					        except pymongo.errors.DuplicateKeyError as ex:
 | 
				
			||||||
            pass
 | 
					            pass
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def get_link_features(link):
					    """Split the path of *link* into tokens used as link-classifier features.

					    The path returned by ``courlan.get_host_and_path`` is cut at every
					    ``/``, ``?``, ``-`` and ``_``. Empty tokens (such as the one produced
					    before a leading slash) are discarded, and the last token is dropped.

					    Args:
					        link: URL string handed to ``courlan.get_host_and_path``.

					    Returns:
					        A list of path tokens without the final one, or ``None`` when the
					        path yields fewer than two tokens.
					    """
					    # Local import: the file's import block is not visible from this section.
					    import re

					    _, urlpath = courlan.get_host_and_path(link)
					    # str.split("/?-_") would only match that literal four-character
					    # sequence; a character class splits on each delimiter separately.
					    tokens = [token for token in re.split(r"[/?_-]", urlpath) if token]
					    if len(tokens) < 2:
					        return None
					    # drop last part (presumably the page-specific slug - TODO confirm)
					    return tokens[:-1]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def link_classifier(db,hostname,batch_size):
					    """Estimate, for every URL-path feature, the share of "good" links.

					    Samples up to 2000 documents from ``db["links"]`` whose ``host`` equals
					    *hostname* and whose ``status`` is neither "frontlink" nor "backlink".
					    Each sampled link contributes a total weight of 1, spread evenly over
					    the features returned by ``get_link_features``. The weight goes to the
					    "good" tally when the link status is "good" and to the "bad" tally
					    otherwise.

					    Args:
					        db: database handle; ``db["links"]`` must provide ``aggregate``.
					        hostname: value matched against the ``host`` field of the links.
					        batch_size: currently unused; the sample size is fixed at 2000.

					    Returns:
					        ``collections.Counter`` mapping feature to
					        ``good_weight / (good_weight + bad_weight)``. The counter is empty
					        when no sampled link yields features.
					    """
					    linkcol = db["links"]
					    sampled = linkcol.aggregate([
					        { "$match": { "status": {"$not":{"$in":["frontlink","backlink"]}},"host":hostname } },
					        { "$sample": { "size": 2000 } }
					    ])
					    goodcounter = collections.Counter()
					    badcounter = collections.Counter()
					    for item in sampled:
					        features = get_link_features(item["url"])
					        if features is None:
					            continue
					        # Every link carries the same total weight regardless of path depth.
					        weight = 1 / len(features)
					        target = goodcounter if item["status"] == "good" else badcounter
					        for feature in features:
					            target[feature] += weight
					    # The ratios are computed once, after all sampled links were tallied.
					    allcounter = collections.Counter()
					    for key in goodcounter.keys() | badcounter.keys():
					        good_weight = goodcounter[key]
					        bad_weight = badcounter[key]
					        # key comes from one of the tallies, so the denominator is > 0.
					        allcounter[key] = good_weight / (good_weight + bad_weight)
					    return allcounter
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def get_links(db,hostname,status,batch_size):
 | 
					def get_links(db,hostname,status,batch_size):
 | 
				
			||||||
    linkcol = db["links"]
 | 
					    linkcol = db["links"]
 | 
				
			||||||
    #res  = linkcol.find({"status":status,"host":hostname},{"url":1},limit=batch_size)
 | 
					    #res  = linkcol.find({"status":status,"host":hostname},{"url":1},limit=batch_size)
 | 
				
			||||||
 | 
					    # get random links
 | 
				
			||||||
    res = linkcol.aggregate([
 | 
					    res = linkcol.aggregate([
 | 
				
			||||||
        { "$match": { "status": status,"host":hostname } },
 | 
					        { "$match": { "status": status,"host":hostname } },
 | 
				
			||||||
        { "$sample": { "size": batch_size } }
 | 
					        { "$sample": { "size": batch_size } }
 | 
				
			||||||
@ -317,13 +361,22 @@ def link_summary(db,hostname):
 | 
				
			|||||||
        #{"$project": {"textsum":{"$sum":"$text_size"}}}
 | 
					        #{"$project": {"textsum":{"$sum":"$text_size"}}}
 | 
				
			||||||
        {"$group":{"_id":None,
 | 
					        {"$group":{"_id":None,
 | 
				
			||||||
                   "text_size_sum":{"$sum":"$text_size"},
 | 
					                   "text_size_sum":{"$sum":"$text_size"},
 | 
				
			||||||
                   "paragraph_size_sum":{"$sum":"$paragraph_sizes_sum"}
 | 
					 | 
				
			||||||
                   }
 | 
					                   }
 | 
				
			||||||
         },
 | 
					         },
 | 
				
			||||||
    ])
 | 
					    ])
 | 
				
			||||||
    for item in res:
 | 
					    for item in res:
 | 
				
			||||||
        print(item)
 | 
					        print(item)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def domain_summary(db,hostname):
					    """Print the summed ``text_size`` per ``hostname`` group of ``db["links"]``.

					    Runs a single ``$group`` aggregation over the ``links`` collection and
					    prints every resulting document.

					    Args:
					        db: database handle; ``db["links"]`` must provide ``aggregate``.
					        hostname: not used by the current query.
					    """
					    group_stage = {
					        "$group": {
					            "_id": "$hostname",
					            "text_size_sum": {"$sum": "$text_size"},
					        },
					    }
					    links = db["links"]
					    for summary in links.aggregate([group_stage]):
					        print(summary)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@click.group()
 | 
					@click.group()
 | 
				
			||||||
def cli():
 | 
					def cli():
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
		Reference in New Issue
	
	Block a user