zz

commit 6567de421c
parent efe2872777
@@ -1,4 +1,5 @@
 import pymongo
+import pymongo.errors
 import trafilatura
 import trafilatura.feeds
 import trafilatura.sitemaps
@@ -71,7 +72,6 @@ def calculate_checksums(text):
 def is_robot_good(link,rules):
     # check robots.txt rules
     if rules is not None and not rules.can_fetch("*", link):
-        print("bad>>>" + link)
         return False
     return True
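The deleted print leaves is_robot_good() quiet on rejected links; its behavior is otherwise unchanged. For context, here is a minimal sketch of how the rules object it consumes could be produced. fetch_robot() is not shown in this diff, so the helper below is an assumption built on the standard urllib.robotparser:

import urllib.robotparser

def fetch_robot_sketch(hostname):
    # assumed helper, modeled on the fetch_robot() referenced elsewhere
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url("https://" + hostname + "/robots.txt")
    try:
        rp.read()      # download and parse robots.txt
    except Exception:
        return None    # no rules: is_robot_good() then allows every link
    return rp

# RobotFileParser.can_fetch("*", link) is the same call is_robot_good() makes.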
@@ -203,7 +203,7 @@ def index_pages(db,hostname,extracted_pages):
         nd = checkcol.find_one({"_id":chs})
         if nd is not None:
             copysz += paragraph_size
-    if copysz / len(text) > TEXT_TRASH_RATIO:
+    if (copysz / len(text)) > TEXT_TRASH_RATIO:
         state = "copy"
         print(copysz)
     if state == "good":
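The added parentheses are cosmetic; the logic is the same: paragraphs whose checksum is already in checkcol count toward copysz, and the page is flagged "copy" once that share of its text exceeds TEXT_TRASH_RATIO. calculate_checksums() appears only in the hunk header, so the matching implementation sketched below is an assumption:

import hashlib

def calculate_checksums(text):
    """Return (checksums, sizes) for each non-empty paragraph of text."""
    checksums, sizes = [], []
    for paragraph in text.split("\n"):
        paragraph = paragraph.strip()
        if not paragraph:
            continue
        # hash each paragraph; its length is kept so duplicates can be
        # weighted by how much of the page they cover
        checksums.append(hashlib.md5(paragraph.encode("utf-8")).hexdigest())
        sizes.append(len(paragraph))
    return checksums, sizes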
@@ -219,7 +219,10 @@ def index_pages(db,hostname,extracted_pages):
             del doc["url"]
             contentcol.update_one({"url":link},{"$set":doc},upsert=True)
             for chs in doc["paragraph_checksums"]:
+                try:
                     checkcol.insert_one({"_id":chs})
+                except pymongo.errors.DuplicateKeyError as err:
+                    pass
         linkcol.update_one({"url":original_link},{"$set":{"status":state}})
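Design note: with the paragraph checksum as the document _id, insert_one() doubles as an atomic add-to-set, and DuplicateKeyError simply means the paragraph was seen before, so swallowing it is safe (the new pymongo.errors import exists for exactly this). An exception-free alternative would be an upsert; a sketch, not part of the commit:

# Inside the same loop, an upsert is a no-op when the checksum already
# exists; "seen" is a hypothetical field name.
checkcol.update_one({"_id": chs}, {"$setOnInsert": {"seen": True}}, upsert=True)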
@@ -238,8 +241,6 @@ def extract_links(link_batch,responses,hostname,rules,default_status="frontlink"):
             badrobot += 1
             continue
         status = str(default_status)
-        if courlan.is_navigation_page(link):
-            status = "navigation"
         #print(link,status)
         links[link] = status
     outlinks = []
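Dropping the courlan.is_navigation_page() branch means navigation pages no longer get their own status; every extracted link now carries the default frontlink/backlink status. For illustration, the courlan calls this file relies on (the URL is a placeholder):

import courlan

checked = courlan.check_url("https://example.com/blog/page/2/")
if checked is not None:           # check_url returns None for rejected URLs
    url, hostname = checked
    print(url, hostname)          # normalized URL and its host
    print(courlan.is_navigation_page(url))  # pager/archive-style URLs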
@@ -264,19 +265,27 @@ def index_links(db,extracted_links):
 
 def get_links(db,hostname,status,batch_size):
     linkcol = db["links"]
-    res = linkcol.find({"status":status,"host":hostname},{"url":1},limit=batch_size)
-    links = []
+    #res = linkcol.find({"status":status,"host":hostname},{"url":1},limit=batch_size)
+    res = linkcol.aggregate([
+        { "$match": { "status": status,"host":hostname } },
+        { "$sample": { "size": batch_size } }
+    ])
+    links = set()
     for i,doc in enumerate(res):
         #print(">>>>>" + status)
         #print(doc);
-        print(">>>>links")
-        print(doc)
-        links.append(doc["url"])
+        links.add(doc["url"])
         if i >= batch_size:
             break
-    return links
+    return list(links)
 
 
+def fetch_sitemap_links(start_link):
+    out = []
+    navigation_links = trafilatura.sitemaps.sitemap_search(start_link,target_lang=LANGUAGE)
+    for link in navigation_links:
+        out.append((link,"frontlink"))
+    return out
 
 def process_links(db,hostname,status,links=[],rules=None,batch_size=BATCHSIZE):
     #print(links)
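Two things change here: get_links() now samples randomly on the server with $sample instead of always taking the first batch_size matches, and the new fetch_sitemap_links() wraps trafilatura's sitemap discovery. Because $sample may return the same document more than once, links became a set and the function returns list(links). The $match stage benefits from an index on the filtered fields; a sketch of assumed setup, not created in this commit:

import pymongo

# Assumed, not part of the commit: back the $match stage with an index.
linkcol = db["links"]
linkcol.create_index([("host", pymongo.ASCENDING), ("status", pymongo.ASCENDING)])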
@@ -372,23 +381,24 @@ def visit(start_link):
     myclient = pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
     start_link,hostname = courlan.check_url(start_link)
-    rules = fetch_robot(hostname)
-    print(rules)
     batch_size = BATCHSIZE
-    navigation_links = get_links(db,hostname,"navigation",batch_size)
-    if start_link is not None:
-        navigation_links.append(start_link)
-    print(f"Navigation links {len(navigation_links)}")
-    process_links(db,hostname,"frontlink",navigation_links,rules)
+    print("Getting frontlinks")
     links = get_links(db,hostname,"frontlink",batch_size)
-    bl = len(links) - batch_size
     print(f"Got {len(links)} frontlinks")
-    if bl > 0:
+    if len(links) < batch_size:
+        print("Fetching sitemap links")
+        sitemap_links = fetch_sitemap_links(start_link)
+        index_links(db,sitemap_links)
+        links.append(start_link)
+
+    print("Processing frontlinks")
+    rules = fetch_robot(hostname)
+    process_links(db,hostname,"frontlink",links,rules)
     print("Getting backlinks")
-    front_links = get_links(db,hostname,"backlink",bl)
-    links += front_links
+    back_links = get_links(db,hostname,"backlink",batch_size)
     print("Processing backlinks")
-    process_links(db,hostname,"backlink",links,rules=rules)
+    process_links(db,hostname,"backlink",back_links,rules=rules)
     link_summary(db,hostname)
 
 if __name__ == "__main__":
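The rewritten visit() flow is: sample frontlinks; if fewer than batch_size turn up, fall back to sitemap discovery and re-seed with start_link; fetch robots rules only once, right before processing; process frontlinks; then sample and process backlinks. The old bl arithmetic is gone, and seemingly it could never be positive anyway, since get_links() capped results at batch_size. A hypothetical entry point, since the real __main__ body falls outside this diff:

# Hypothetical usage; the URL is a placeholder.
if __name__ == "__main__":
    visit("https://example.com")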