zz
This commit is contained in:
parent
c1c9e7df41
commit
781325d2a9
@ -94,7 +94,7 @@ def extract_pages(link_batch,htmls):
|
|||||||
doc = None
|
doc = None
|
||||||
assert link is not None
|
assert link is not None
|
||||||
if html is not None:
|
if html is not None:
|
||||||
doc = trafilatura.bare_extraction(html,url=link,include_links=True,with_metadata=True,include_formatting=True,target_language="sk")
|
doc = trafilatura.bare_extraction(html,url=link,with_metadata=True,include_formatting=True,target_language="sk")
|
||||||
out.append((link,html,doc))
|
out.append((link,html,doc))
|
||||||
return out
|
return out
|
||||||
|
|
||||||
@ -129,19 +129,14 @@ def index_pages(db,domain,extracted_pages):
|
|||||||
doc = get_link_doc(link,"backlink")
|
doc = get_link_doc(link,"backlink")
|
||||||
linkcol.insert_one(doc,upsert=True)
|
linkcol.insert_one(doc,upsert=True)
|
||||||
|
|
||||||
def get_front_links(db,domain,batch_size=100):
|
def get_links(db,domain,status,batch_size=100):
|
||||||
linkcol = db["links"]
|
linkcol = db["links"]
|
||||||
res = linkcol.find({"status":"frontlink","domain":domain},limit=batch_size)
|
res = linkcol.find({"status":status,"domain":domain},limit=batch_size)
|
||||||
front_links = []
|
front_links = []
|
||||||
for doc in res:
|
for doc in res:
|
||||||
front_links.append(doc["url"])
|
front_links.append(doc["url"])
|
||||||
return filter_links(front_links)
|
return filter_links(front_links)
|
||||||
|
|
||||||
def get_back_links(db,domain,batch_size=100):
|
|
||||||
linkcol = db["links"]
|
|
||||||
frontlinks = linkcol.find({"status":"backlink","domain":domain},limit=batch_size)
|
|
||||||
return front_links
|
|
||||||
|
|
||||||
|
|
||||||
def index_front_links(db,filtered_links):
|
def index_front_links(db,filtered_links):
|
||||||
linkcol = db["links"]
|
linkcol = db["links"]
|
||||||
@ -165,7 +160,7 @@ def simple_visit(start_link):
|
|||||||
print("NEW FRONT LINKS")
|
print("NEW FRONT LINKS")
|
||||||
print(new_front_links)
|
print(new_front_links)
|
||||||
index_front_links(db,new_front_links)
|
index_front_links(db,new_front_links)
|
||||||
visit_links = get_front_links(db,domain)
|
# get_links is declared as (db, domain, status, batch_size=100): the domain must come before the status filter.
visit_links = get_links(db,domain,"frontlink")
|
||||||
print("NEW VISIT LINKS")
|
print("NEW VISIT LINKS")
|
||||||
print(visit_links)
|
print(visit_links)
|
||||||
htmls = fetch_pages(visit_links)
|
htmls = fetch_pages(visit_links)
|
||||||
|
Loading…
Reference in New Issue
Block a user