zz
This commit is contained in:
parent
c4733776e5
commit
2f8ead3964
21
mongo/cli.py
21
mongo/cli.py
@ -78,11 +78,26 @@ def fetchlinks(start_link):
|
|||||||
db=myclient[DBNAME]
|
db=myclient[DBNAME]
|
||||||
start_link,hostname = courlan.check_url(start_link)
|
start_link,hostname = courlan.check_url(start_link)
|
||||||
rules = mongocrawler.fetch_robot(hostname)
|
rules = mongocrawler.fetch_robot(hostname)
|
||||||
front_links = mongocrawler.fetch_front_links(start_link,rules)
|
links = mongocrawler.fetch_front_links(start_link,rules)
|
||||||
print(front_links)
|
for link in links:
|
||||||
mongocrawler.index_links(db,front_links)
|
print(link[0])
|
||||||
|
#print(front_links)
|
||||||
|
mongocrawler.index_links(db,links)
|
||||||
|
|
||||||
|
|
||||||
|
@cli.command()
@click.argument("hostname")
def process_links(hostname):
    """Fetch and extract links read from standard input and print each page.

    HOSTNAME is passed to mongocrawler.fetch_robot to obtain the rules that
    are handed to fetch_and_extract together with the links.
    """
    # click.argument takes the argument name as a string and delivers the
    # parsed value to the command function as a keyword of the same name,
    # so the function must accept a `hostname` parameter.
    rules = mongocrawler.fetch_robot(hostname)
    # One link per input line; trailing whitespace is stripped and blank
    # lines are skipped so no empty link is passed on.
    links = [line.rstrip() for line in sys.stdin if line.strip()]
    # NOTE(review): fetch_and_extract is called unqualified here while the
    # other crawler helpers are reached through `mongocrawler.` -- confirm
    # that it is imported into this module.
    extracted_pages, _extracted_links = fetch_and_extract(links, rules)
    for page in extracted_pages:
        print(page)
|
||||||
|
|
||||||
|
|
||||||
@cli.command(help="Enqueue a list of links into redis queue for crawling")
|
@cli.command(help="Enqueue a list of links into redis queue for crawling")
|
||||||
def enqueue():
|
def enqueue():
|
||||||
|
@ -705,16 +705,7 @@ def visit(hostname,filter_content=True):
|
|||||||
print(links)
|
print(links)
|
||||||
# index results
|
# index results
|
||||||
print("Processing links")
|
print("Processing links")
|
||||||
responses = []
|
extracted_pages, extracted_links = fetch_and_extract(links,rules)
|
||||||
for link in links:
|
|
||||||
responses.append(fetch_page(link))
|
|
||||||
extracted_pages = []
|
|
||||||
for original_link,(final_link,html) in zip(links,responses):
|
|
||||||
doc = None
|
|
||||||
assert original_link is not None
|
|
||||||
doc = extract_page(final_link,html)
|
|
||||||
extracted_pages.append((original_link,html,doc))
|
|
||||||
extracted_links = extract_links(links,responses,rules,"frontlink")
|
|
||||||
|
|
||||||
index_pages(db,hostname,extracted_pages,filter_content)
|
index_pages(db,hostname,extracted_pages,filter_content)
|
||||||
index_links(db, extracted_links)
|
index_links(db, extracted_links)
|
||||||
|
Loading…
Reference in New Issue
Block a user