diff --git a/mongo/mongocwarler.py b/mongo/mongocwarler.py
index 381dd07..7b4f063 100644
--- a/mongo/mongocwarler.py
+++ b/mongo/mongocwarler.py
@@ -94,7 +94,7 @@ def extract_pages(link_batch,htmls):
         doc = None
         assert link is not None
         if html is not None:
-            doc = trafilatura.bare_extraction(html,url=link,include_links=True,with_metadata=True,include_formatting=True,target_language="sk")
+            doc = trafilatura.bare_extraction(html,url=link,with_metadata=True,include_formatting=True,target_language="sk")
         out.append((link,html,doc))
     return out
 
@@ -129,19 +129,15 @@ def index_pages(db,domain,extracted_pages):
         doc = get_link_doc(link,"backlink")
         linkcol.insert_one(doc,upsert=True)
 
-def get_front_links(db,domain,batch_size=100):
+def get_links(db,domain,status,batch_size=100):
+    """Return the filtered URLs of at most batch_size links of domain that have the given status."""
     linkcol = db["links"]
-    res = linkcol.find({"status":"frontlink","domain":domain},limit=batch_size)
+    res = linkcol.find({"status":status,"domain":domain},limit=batch_size)
     front_links = []
     for doc in res:
         front_links.append(doc["url"])
     return filter_links(front_links)
 
-def get_back_links(db,domain,batch_size=100):
-    linkcol = db["links"]
-    frontlinks = linkcol.find({"status":"backlink","domain":domain},limit=batch_size)
-    return front_links
-
 def index_front_links(db,filtered_links):
     linkcol = db["links"]
 
@@ -165,7 +161,7 @@ def simple_visit(start_link):
     print("NEW FRONT LINKS")
     print(new_front_links)
     index_front_links(db,new_front_links)
-    visit_links = get_front_links(db,domain)
+    visit_links = get_links(db,domain,"frontlink")
     print("NEW VISIT LINKS")
     print(visit_links)
     htmls = fetch_pages(visit_links)