diff --git a/mongo/cli.py b/mongo/cli.py
index 84ed122..50a24ea 100644
--- a/mongo/cli.py
+++ b/mongo/cli.py
@@ -78,11 +78,26 @@ def fetchlinks(start_link):
     db=myclient[DBNAME]
     start_link,hostname = courlan.check_url(start_link)
     rules = mongocrawler.fetch_robot(hostname)
-    front_links = mongocrawler.fetch_front_links(start_link,rules)
-    print(front_links)
-    mongocrawler.index_links(db,front_links)
+    links = mongocrawler.fetch_front_links(start_link,rules)
+    # print only the URL (first element) of each link entry
+    for link in links:
+        print(link[0])
+    mongocrawler.index_links(db,links)
 
+@cli.command()
+@click.argument("hostname")
+def process_links(hostname):
+    # read newline-separated links from stdin, fetch and extract them,
+    # then print each extracted page
+    rules = mongocrawler.fetch_robot(hostname)
+    links = []
+    for line in sys.stdin:
+        links.append(line.rstrip())
+    extracted_pages, extracted_links = mongocrawler.fetch_and_extract(links,rules)
+    for page in extracted_pages:
+        print(page)
+
 @cli.command(help="Enqueue a list of links into redis queue for crawling")
 def enqueue():

diff --git a/mongo/mongocrawler.py b/mongo/mongocrawler.py
index 1c4b91e..ef548dd 100644
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@@ -705,16 +705,7 @@ def visit(hostname,filter_content=True):
     print(links)
     # index results
     print("Processing links")
-    responses = []
-    for link in links:
-        responses.append(fetch_page(link))
-    extracted_pages = []
-    for original_link,(final_link,html) in zip(links,responses):
-        doc = None
-        assert original_link is not None
-        doc = extract_page(final_link,html)
-        extracted_pages.append((original_link,html,doc))
-    extracted_links = extract_links(links,responses,rules,"frontlink")
+    extracted_pages, extracted_links = fetch_and_extract(links,rules)
     index_pages(db,hostname,extracted_pages,filter_content)
     index_links(db, extracted_links)
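Note on the new process_links command: it reads newline-separated URLs from
stdin, fetches and extracts them under the host's robots.txt rules, and
prints each extracted page. It uses sys.stdin, so cli.py must import sys.
Something like `cat links.txt | python cli.py process_links example.com`
would drive it (the entry point and links.txt are assumptions; invoke the
CLI however the project normally does).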
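The helper fetch_and_extract that both hunks now call is not part of this
diff. A minimal sketch of what it presumably does, reconstructed from the
inline code removed from visit() (the exact signature and return shape are
assumptions, not the committed definition):

    def fetch_and_extract(links, rules):
        # Sketch only, based on the code removed from visit().
        # Fetch every link; fetch_page returns a (final_link, html) pair.
        responses = []
        for link in links:
            responses.append(fetch_page(link))
        # Pair each original link with its raw html and the extracted document.
        extracted_pages = []
        for original_link, (final_link, html) in zip(links, responses):
            assert original_link is not None
            doc = extract_page(final_link, html)
            extracted_pages.append((original_link, html, doc))
        # Collect outgoing links from the fetched responses, tagged "frontlink".
        extracted_links = extract_links(links, responses, rules, "frontlink")
        return extracted_pages, extracted_links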