This commit is contained in:
Daniel Hládek 2023-04-01 20:44:37 +02:00
parent efe2872777
commit 6567de421c

View File

@@ -1,4 +1,5 @@
import pymongo
import pymongo.errors
import trafilatura
import trafilatura.feeds
import trafilatura.sitemaps
@@ -71,7 +72,6 @@ def calculate_checksums(text):
def is_robot_good(link, rules):
    """Tell whether the robots.txt rules allow fetching ``link``.

    Args:
        link: URL that is about to be fetched.
        rules: Any object exposing ``can_fetch(useragent, url)``, or
            ``None`` when no rules are available.

    Returns:
        ``False`` only when ``rules`` is given and it forbids the wildcard
        user agent ``"*"`` from fetching ``link``; ``True`` otherwise,
        including when ``rules`` is ``None``.
    """
    # Missing rules are treated as "everything is allowed".
    if rules is None:
        return True
    return bool(rules.can_fetch("*", link))
@@ -203,7 +203,7 @@ def index_pages(db,hostname,extracted_pages):
nd = checkcol.find_one({"_id":chs})
if nd is not None:
copysz += paragraph_size
if copysz / len(text) > TEXT_TRASH_RATIO:
if (copysz / len(text)) > TEXT_TRASH_RATIO:
state = "copy"
print(copysz)
if state == "good":
@@ -219,7 +219,10 @@ def index_pages(db,hostname,extracted_pages):
del doc["url"]
contentcol.update_one({"url":link},{"$set":doc},upsert=True)
for chs in doc["paragraph_checksums"]:
checkcol.insert_one({"_id":chs})
try:
checkcol.insert_one({"_id":chs})
except pymongo.errors.DuplicateKeyError as err:
pass
linkcol.update_one({"url":original_link},{"$set":{"status":state}})
@@ -238,8 +241,6 @@ def extract_links(link_batch,responses,hostname,rules,default_status="frontlink"
badrobot += 1
continue
status = str(default_status)
if courlan.is_navigation_page(link):
status = "navigation"
#print(link,status)
links[link] = status
outlinks = []
@@ -264,19 +265,27 @@ def index_links(db,extracted_links):
def get_links(db, hostname, status, batch_size):
    """Return a random batch of distinct link URLs for one host and status.

    Args:
        db: Database handle; ``db["links"]`` must provide ``aggregate``.
        hostname: Value matched against the ``host`` field of link documents.
        status: Value matched against the ``status`` field of link documents.
        batch_size: Sample size passed to the ``$sample`` pipeline stage.

    Returns:
        A list of unique ``url`` values in no particular order. The
        ``$sample`` stage bounds the number of documents, so the list holds
        at most ``batch_size`` entries.
    """
    linkcol = db["links"]
    # Sample at random instead of taking the first matches, so repeated
    # calls are not always served the same documents.
    sampled = linkcol.aggregate([
        {"$match": {"status": status, "host": hostname}},
        {"$sample": {"size": batch_size}},
    ])
    # $sample may emit the same document more than once; a set collapses
    # repeated URLs before the result is handed back as a list.
    return list({doc["url"] for doc in sampled})
def fetch_sitemap_links(start_link):
    """Collect sitemap URLs for ``start_link`` as ``(url, status)`` pairs.

    Runs trafilatura's sitemap search with ``target_lang=LANGUAGE`` and
    labels every URL it yields with the status ``"frontlink"``.
    """
    found = trafilatura.sitemaps.sitemap_search(start_link, target_lang=LANGUAGE)
    return [(url, "frontlink") for url in found]
def process_links(db,hostname,status,links=[],rules=None,batch_size=BATCHSIZE):
#print(links)
@@ -372,23 +381,24 @@ def visit(start_link):
# Body of visit(start_link): one crawl pass for the host of start_link.
# NOTE(review): this is a diff hunk (23 old / 24 new lines per its header)
# shown without +/- markers or indentation, so pre-commit and post-commit
# statements are interleaved. Do not read it as one runnable sequence.

# Open the MongoDB connection and select the database named by DBNAME.
myclient = pymongo.MongoClient(CONNECTION)
db=myclient[DBNAME]
# The result of check_url is unpacked into the start URL and its hostname.
start_link,hostname = courlan.check_url(start_link)
# NOTE(review): rules are fetched here and again before the frontlinks are
# processed; this looks like the old and the new placement of the same
# statement -- confirm against the file.
rules = fetch_robot(hostname)
print(rules)
batch_size = BATCHSIZE
# Navigation pass: links stored with status "navigation" plus the start
# link are processed with status "frontlink".
# NOTE(review): presumably the pre-commit flow -- confirm.
navigation_links = get_links(db,hostname,"navigation",batch_size)
if start_link is not None:
navigation_links.append(start_link)
print(f"Navigation links {len(navigation_links)}")
process_links(db,hostname,"frontlink",navigation_links,rules)
# Frontlink pass: request up to batch_size links with status "frontlink".
print("Getting frontlinks")
links = get_links(db,hostname,"frontlink",batch_size)
# bl is len(links) minus batch_size; the branch below only runs when it is
# positive. The list named front_links is loaded with status "backlink".
bl = len(links) - batch_size
print(f"Got {len(links)} frontlinks")
if bl > 0:
print("Getting backlinks")
front_links = get_links(db,hostname,"backlink",bl)
links += front_links
# When fewer than batch_size frontlinks came back, look up sitemap links
# and pass them to index_links.
if len(links) < batch_size:
print("Fetching sitemap links")
sitemap_links = fetch_sitemap_links(start_link)
index_links(db,sitemap_links)
# NOTE(review): indentation is lost, so it is unclear whether the start link
# is appended only inside the branch above or unconditionally -- confirm.
links.append(start_link)
print("Processing frontlinks")
rules = fetch_robot(hostname)
process_links(db,hostname,"frontlink",links,rules)
# Backlink pass: request up to batch_size links with status "backlink".
print("Getting backlinks")
back_links = get_links(db,hostname,"backlink",batch_size)
print("Processing backlinks")
# NOTE(review): two backlink calls follow. The first passes `links` (the
# frontlink batch), the second passes `back_links`; presumably the second
# replaces the first in this commit -- confirm.
process_links(db,hostname,"backlink",links,rules=rules)
process_links(db,hostname,"backlink",back_links,rules=rules)
# Final step for this host; see link_summary for what it reports.
link_summary(db,hostname)
if __name__ == "__main__":