From 2f8ead396455ed6deabbc269fb4ac9c2377013f6 Mon Sep 17 00:00:00 2001
From: Daniel Hladek
Date: Thu, 21 Mar 2024 13:21:43 +0100
Subject: [PATCH] Add process_links CLI command; reuse fetch_and_extract helper

---
 mongo/cli.py          | 21 ++++++++++++++++++---
 mongo/mongocrawler.py | 11 +----------
 2 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/mongo/cli.py b/mongo/cli.py
index 84ed122..50a24ea 100644
--- a/mongo/cli.py
+++ b/mongo/cli.py
@@ -78,11 +78,26 @@ def fetchlinks(start_link):
     db=myclient[DBNAME]
     start_link,hostname = courlan.check_url(start_link)
     rules = mongocrawler.fetch_robot(hostname)
-    front_links = mongocrawler.fetch_front_links(start_link,rules)
-    print(front_links)
-    mongocrawler.index_links(db,front_links)
+    links = mongocrawler.fetch_front_links(start_link,rules)
+    for link in links:
+        print(link[0])
+    #print(front_links)
+    mongocrawler.index_links(db,links)

+@cli.command()
+@click.argument("hostname")
+def process_links(hostname):
+    rules = mongocrawler.fetch_robot(hostname)
+    outfile = "data.jsonl"
+    links = []
+    for line in sys.stdin:
+        links.append(line.rstrip())
+    extracted_pages, extracted_links = mongocrawler.fetch_and_extract(links,rules)
+    for page in extracted_pages:
+        print(page)
+    pass
+
 @cli.command(help="Enqueue a list of links into redis queue for crawling")
 def enqueue():
diff --git a/mongo/mongocrawler.py b/mongo/mongocrawler.py
index 1c4b91e..ef548dd 100644
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@@ -705,16 +705,7 @@ def visit(hostname,filter_content=True):
     print(links)
     # index results
     print("Processing links")
-    responses = []
-    for link in links:
-        responses.append(fetch_page(link))
-    extracted_pages = []
-    for original_link,(final_link,html) in zip(links,responses):
-        doc = None
-        assert original_link is not None
-        doc = extract_page(final_link,html)
-        extracted_pages.append((original_link,html,doc))
-    extracted_links = extract_links(links,responses,rules,"frontlink")
+    extracted_pages, extracted_links = fetch_and_extract(links,rules)
     index_pages(db,hostname,extracted_pages,filter_content)
     index_links(db, extracted_links)