This commit is contained in:
Daniel Hládek 2023-03-08 11:22:33 +01:00
parent c1c9e7df41
commit 781325d2a9

View File

@@ -94,7 +94,7 @@ def extract_pages(link_batch,htmls):
doc = None doc = None
assert link is not None assert link is not None
if html is not None: if html is not None:
doc = trafilatura.bare_extraction(html,url=link,include_links=True,with_metadata=True,include_formatting=True,target_language="sk") doc = trafilatura.bare_extraction(html,url=link,with_metadata=True,include_formatting=True,target_language="sk")
out.append((link,html,doc)) out.append((link,html,doc))
return out return out
@@ -129,19 +129,14 @@ def index_pages(db,domain,extracted_pages):
doc = get_link_doc(link,"backlink") doc = get_link_doc(link,"backlink")
linkcol.insert_one(doc,upsert=True) linkcol.insert_one(doc,upsert=True)
def get_links(db,domain,status,batch_size=100):
    """
    Fetch stored link URLs for one domain and one status.

    Note the positional order is ``(db, domain, status)``: the domain comes
    before the status. Passing them the other way round silently queries
    for a domain named like the status and a status named like the domain.

    Args:
        db: mapping-like database handle; ``db["links"]`` must provide a
            ``find(query, limit=...)`` method (presumably a pymongo
            database - confirm against the caller).
        domain: value matched against the ``domain`` field of each link
            document.
        status: value matched against the ``status`` field, e.g.
            ``"frontlink"`` or ``"backlink"``.
        batch_size: maximum number of documents requested from ``find``.

    Returns:
        Whatever ``filter_links`` returns for the list of ``url`` values of
        the matching documents.

    Raises:
        KeyError: if a matching document has no ``url`` key.
    """
    linkcol = db["links"]
    cursor = linkcol.find({"status":status,"domain":domain},limit=batch_size)
    # Collect only the URL of each document; filtering is delegated to
    # filter_links, which is defined elsewhere in this module.
    urls = [doc["url"] for doc in cursor]
    return filter_links(urls)
def get_back_links(db,domain,batch_size=100):
    """
    Query the link documents of ``domain`` whose status is ``"backlink"``.

    Args:
        db: mapping-like database handle; ``db["links"]`` must provide a
            ``find(query, limit=...)`` method (presumably a pymongo
            database - confirm against the caller).
        domain: value matched against the ``domain`` field.
        batch_size: maximum number of documents requested from ``find``.

    Returns:
        The result of ``find`` unchanged (the matching documents, not just
        their URLs).
    """
    linkcol = db["links"]
    back_links = linkcol.find({"status":"backlink","domain":domain},limit=batch_size)
    # Return the same name the query result was bound to; returning a
    # differently spelled, undefined name raised NameError on every call.
    return back_links
def index_front_links(db,filtered_links): def index_front_links(db,filtered_links):
linkcol = db["links"] linkcol = db["links"]
@@ -165,7 +160,7 @@ def simple_visit(start_link):
print("NEW FRONT LINKS") print("NEW FRONT LINKS")
print(new_front_links) print(new_front_links)
index_front_links(db,new_front_links) index_front_links(db,new_front_links)
visit_links = get_front_links(db,domain) visit_links = get_links(db,"frontlink",domain)
print("NEW VISIT LINKS") print("NEW VISIT LINKS")
print(visit_links) print(visit_links)
htmls = fetch_pages(visit_links) htmls = fetch_pages(visit_links)