zz
This commit is contained in:
parent
c1c9e7df41
commit
781325d2a9
@ -94,7 +94,7 @@ def extract_pages(link_batch,htmls):
|
||||
doc = None
|
||||
assert link is not None
|
||||
if html is not None:
|
||||
doc = trafilatura.bare_extraction(html,url=link,include_links=True,with_metadata=True,include_formatting=True,target_language="sk")
|
||||
doc = trafilatura.bare_extraction(html,url=link,with_metadata=True,include_formatting=True,target_language="sk")
|
||||
out.append((link,html,doc))
|
||||
return out
|
||||
|
||||
@ -129,19 +129,14 @@ def index_pages(db,domain,extracted_pages):
|
||||
doc = get_link_doc(link,"backlink")
|
||||
linkcol.insert_one(doc,upsert=True)
|
||||
|
||||
def get_front_links(db,domain,batch_size=100):
|
||||
def get_links(db,domain,status,batch_size=100):
|
||||
linkcol = db["links"]
|
||||
res = linkcol.find({"status":"frontlink","domain":domain},limit=batch_size)
|
||||
res = linkcol.find({"status":status,"domain":domain},limit=batch_size)
|
||||
front_links = []
|
||||
for doc in res:
|
||||
front_links.append(doc["url"])
|
||||
return filter_links(front_links)
|
||||
|
||||
def get_back_links(db,domain,batch_size=100):
|
||||
linkcol = db["links"]
|
||||
frontlinks = linkcol.find({"status":"backlink","domain":domain},limit=batch_size)
|
||||
return front_links
|
||||
|
||||
|
||||
def index_front_links(db,filtered_links):
|
||||
linkcol = db["links"]
|
||||
@ -165,7 +160,7 @@ def simple_visit(start_link):
|
||||
print("NEW FRONT LINKS")
|
||||
print(new_front_links)
|
||||
index_front_links(db,new_front_links)
|
||||
visit_links = get_front_links(db,domain)
|
||||
visit_links = get_links(db,"frontlink",domain)
|
||||
print("NEW VISIT LINKS")
|
||||
print(visit_links)
|
||||
htmls = fetch_pages(visit_links)
|
||||
|
Loading…
Reference in New Issue
Block a user