commit 6567de421c
parent efe2872777

    zz
@@ -1,4 +1,5 @@
 import pymongo
+import pymongo.errors
 import trafilatura
 import trafilatura.feeds
 import trafilatura.sitemaps
@@ -71,7 +72,6 @@ def calculate_checksums(text):
 def is_robot_good(link,rules):
     # check robots.txt rules
     if rules is not None and not rules.can_fetch("*", link):
-        print("bad>>>" + link)
         return False
     return True

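The rules object consulted here is produced by fetch_robot(), which is not part of this diff. A minimal sketch of one way such an object with a can_fetch() method could be built, using the standard library's robot parser (the URL scheme and the function body are assumptions, not the repository's code):

    import urllib.robotparser

    def fetch_robot(hostname):
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url("https://" + hostname + "/robots.txt")
        rp.read()    # download and parse robots.txt
        return rp    # rp.can_fetch("*", link) gives the per-link verdict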
@@ -203,7 +203,7 @@ def index_pages(db,hostname,extracted_pages):
             nd = checkcol.find_one({"_id":chs})
             if nd is not None:
                 copysz += paragraph_size
-        if copysz / len(text) > TEXT_TRASH_RATIO:
+        if (copysz / len(text)) > TEXT_TRASH_RATIO:
             state = "copy"
         print(copysz)
         if state == "good":
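Adding the parentheses does not change the result; the heuristic stays the same: when paragraphs whose checksums were already indexed (copysz) make up too large a share of the page text, the page is classified as a copy. A standalone sketch of that check (the threshold value is an assumption; TEXT_TRASH_RATIO is defined elsewhere in the file):

    TEXT_TRASH_RATIO = 0.9  # assumed value, for illustration only

    def classify_page(text, copysz):
        # copysz: total size of paragraphs already present in the checksum index
        if (copysz / len(text)) > TEXT_TRASH_RATIO:
            return "copy"
        return "good"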
@@ -219,7 +219,10 @@ def index_pages(db,hostname,extracted_pages):
             del doc["url"]
             contentcol.update_one({"url":link},{"$set":doc},upsert=True)
             for chs in doc["paragraph_checksums"]:
-                checkcol.insert_one({"_id":chs})
+                try:
+                    checkcol.insert_one({"_id":chs})
+                except pymongo.errors.DuplicateKeyError as err:
+                    pass
         linkcol.update_one({"url":original_link},{"$set":{"status":state}})


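This hunk makes checksum indexing idempotent: the checkcol collection is keyed by _id, so re-inserting an already-indexed checksum raises DuplicateKeyError, which is simply swallowed. A minimal standalone sketch of the same pattern (collection and helper names here are illustrative, not the repository's):

    import pymongo
    import pymongo.errors

    def mark_checksums_seen(checkcol, checksums):
        # Treat the collection as a set of paragraph checksums keyed by _id.
        inserted = 0
        for chs in checksums:
            try:
                checkcol.insert_one({"_id": chs})
                inserted += 1
            except pymongo.errors.DuplicateKeyError:
                pass  # checksum was indexed earlier, nothing to do
        return inserted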
@@ -238,8 +241,6 @@ def extract_links(link_batch,responses,hostname,rules,default_status="frontlink"
             badrobot += 1
             continue
         status = str(default_status)
-        if courlan.is_navigation_page(link):
-            status = "navigation"
         #print(link,status)
         links[link] = status
     outlinks = []
@@ -264,19 +265,27 @@ def index_links(db,extracted_links):
 
 def get_links(db,hostname,status,batch_size):
     linkcol = db["links"]
-    res = linkcol.find({"status":status,"host":hostname},{"url":1},limit=batch_size)
-    links = []
+    #res = linkcol.find({"status":status,"host":hostname},{"url":1},limit=batch_size)
+    res = linkcol.aggregate([
+        { "$match": { "status": status,"host":hostname } },
+        { "$sample": { "size": batch_size } }
+    ])
+    links = set()
     for i,doc in enumerate(res):
         #print(">>>>>" + status)
         #print(doc);
+        print(">>>>links")
+        print(doc)
-        links.append(doc["url"])
+        links.add(doc["url"])
+        if i >= batch_size:
+            break
-    return links
+    return list(links)
 
 
 def fetch_sitemap_links(start_link):
     out = []
     navigation_links = trafilatura.sitemaps.sitemap_search(start_link,target_lang=LANGUAGE)
     for link in navigation_links:
         out.append((link,"frontlink"))
     return out
 
 def process_links(db,hostname,status,links=[],rules=None,batch_size=BATCHSIZE):
     #print(links)
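get_links now draws a random batch with an aggregation pipeline ($match then $sample) instead of taking the first batch_size documents from find(), and collects URLs into a set so duplicates are dropped before a list is returned. A compact sketch of that selection strategy (the helper name is illustrative):

    def sample_urls(linkcol, hostname, status, batch_size):
        # $match narrows to one host and status; $sample picks a random batch.
        cursor = linkcol.aggregate([
            {"$match": {"status": status, "host": hostname}},
            {"$sample": {"size": batch_size}},
        ])
        # The set drops any URL that happens to be sampled more than once.
        return list({doc["url"] for doc in cursor})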
@@ -372,23 +381,24 @@ def visit(start_link):
    myclient = pymongo.MongoClient(CONNECTION)
    db=myclient[DBNAME]
    start_link,hostname = courlan.check_url(start_link)
    rules = fetch_robot(hostname)
    print(rules)
    batch_size = BATCHSIZE
    navigation_links = get_links(db,hostname,"navigation",batch_size)
    if start_link is not None:
        navigation_links.append(start_link)
    print(f"Navigation links {len(navigation_links)}")
    process_links(db,hostname,"frontlink",navigation_links,rules)

    print("Getting frontlinks")
    links = get_links(db,hostname,"frontlink",batch_size)
    bl = len(links) - batch_size
    print(f"Got {len(links)} frontlinks")
    if bl > 0:
        print("Getting backlinks")
        front_links = get_links(db,hostname,"backlink",bl)
        links += front_links
    if len(links) < batch_size:
        print("Fetching sitemap links")
        sitemap_links = fetch_sitemap_links(start_link)
        index_links(db,sitemap_links)
    links.append(start_link)

    print("Processing frontlinks")
    rules = fetch_robot(hostname)
    process_links(db,hostname,"frontlink",links,rules)
    print("Getting backlinks")
    back_links = get_links(db,hostname,"backlink",batch_size)
    print("Processing backlinks")
    process_links(db,hostname,"backlink",links,rules=rules)
    process_links(db,hostname,"backlink",back_links,rules=rules)
    link_summary(db,hostname)

if __name__ == "__main__":
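visit() leans on courlan.check_url() to normalize the start URL and to extract the hostname used in the per-host link queries. A small usage sketch (the example URL is illustrative; check_url returns None for URLs it rejects):

    import courlan

    checked = courlan.check_url("https://example.org/index.html")
    if checked is not None:
        start_link, hostname = checked  # cleaned URL and its host
        print(start_link, hostname)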