Daniel Hladek 2024-03-21 13:21:43 +01:00
parent c4733776e5
commit 2f8ead3964
2 changed files with 19 additions and 13 deletions


@@ -78,11 +78,26 @@ def fetchlinks(start_link):
     db=myclient[DBNAME]
     start_link,hostname = courlan.check_url(start_link)
     rules = mongocrawler.fetch_robot(hostname)
-    front_links = mongocrawler.fetch_front_links(start_link,rules)
-    print(front_links)
-    mongocrawler.index_links(db,front_links)
+    links = mongocrawler.fetch_front_links(start_link,rules)
+    for link in links:
+        print(link[0])
+    #print(front_links)
+    mongocrawler.index_links(db,links)
+
+@cli.command()
+@click.argument("hostname")
+def process_links(hostname):
+    rules = mongocrawler.fetch_robot(hostname)
+    outfile = "data.jsonl"
+    links = []
+    for line in sys.stdin:
+        links.append(line.rstrip())
+    extracted_pages, extracted_links = mongocrawler.fetch_and_extract(links,rules)
+    for page in extracted_pages:
+        print(page)
+
 @cli.command(help="Enqueue a list of links into redis queue for crawling")
 def enqueue():
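The new process_links command pairs a Click hostname argument with a list of URLs read from standard input, one per line. This assumes that `import sys` and the `mongocrawler` import are already present at the top of the CLI module (outside this hunk); note that `outfile` is declared but not yet written to. A typical invocation, assuming the CLI module is named cli.py (a hypothetical name, not shown in the diff; with Click >= 7 the command name derives from the function name with underscores turned into dashes):

    cat links.txt | python cli.py process-links example.com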


@@ -705,16 +705,7 @@ def visit(hostname,filter_content=True):
     print(links)
     # index results
     print("Processing links")
-    responses = []
-    for link in links:
-        responses.append(fetch_page(link))
-    extracted_pages = []
-    for original_link,(final_link,html) in zip(links,responses):
-        doc = None
-        assert original_link is not None
-        doc = extract_page(final_link,html)
-        extracted_pages.append((original_link,html,doc))
-    extracted_links = extract_links(links,responses,rules,"frontlink")
+    extracted_pages, extracted_links = fetch_and_extract(links,rules)
     index_pages(db,hostname,extracted_pages,filter_content)
     index_links(db, extracted_links)
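The fetch_and_extract helper that both hunks now call is defined elsewhere in mongocrawler and is not part of this diff. Based on the inline code it replaces above, a minimal sketch would look as follows, assuming fetch_page returns a (final_link, html) pair and that extract_page and extract_links keep the signatures used in the deleted lines; the hard-coded "frontlink" tag may be a parameter in the real helper:

def fetch_and_extract(links,rules):
    # Fetch each link; fetch_page is assumed to return (final_link, html).
    responses = []
    for link in links:
        responses.append(fetch_page(link))
    # Parse every response, keeping the original link and raw HTML
    # alongside the extracted document.
    extracted_pages = []
    for original_link,(final_link,html) in zip(links,responses):
        assert original_link is not None
        doc = extract_page(final_link,html)
        extracted_pages.append((original_link,html,doc))
    # Gather outgoing links from the fetched pages, tagged as front links.
    extracted_links = extract_links(links,responses,rules,"frontlink")
    return extracted_pages, extracted_links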