This commit is contained in:
Daniel Hládek 2024-03-21 19:36:59 +01:00
parent 5d45569651
commit 5b887a13c7
2 changed files with 24 additions and 4 deletions

View File

@@ -91,15 +91,36 @@ def fetchlinks(start_link):
 @click.argument("hostname")
 def processlinks(hostname):
     rules = mongocrawler.fetch_robot(hostname)
-    outfile = "data.jsonl"
+    dname = "data"
+    outfile = dname + "/data.jsonl"
+    loutfile = dname + "/extracted.links"
+    htmldir = dname + "/html/"
     links = []
+    os.mkdir(dname)
+    os.mkdir(htmldir)
     for line in sys.stdin:
         links.append(line.rstrip())
     extracted_pages, extracted_links = mongocrawler.fetch_and_extract(links,rules)
     # save extracted text
     with open(outfile,"w") as of:
         for page in extracted_pages:
-            doc = json.dumps(page)
-            print(page,file=of)
+            url,html,doc = page
+            if "url" in doc and doc["url"] != url:
+                doc["original_url"] = url
+            else:
+                doc["url"] = url
+            import urllib.parse
+            hname = htmldir + urllib.parse.quote(url,safe="")
+            doc["html_filename"] = hname
+            with open(hname,"w") as hf:
+                print(html,file=hf)
+            ddoc = json.dumps(doc)
+            print(ddoc,file=of)
+    # save extracted links
+    with open(loutfile,"w") as of:
+        for link in links:
+            print(link,file=of)
 
 @cli.command(help="Enqueue a list of links into redis queue for crawling")
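
For reference, a minimal sketch (not part of the commit) of the new per-page output scheme introduced above: each extracted document becomes one JSON line in data/data.jsonl, and the raw HTML is written under data/html/ using the percent-encoded URL as the file name; urllib.parse.quote with safe="" escapes every reserved character, so the name contains no path separators. The URL and field values below are illustrative only.

import json
import urllib.parse

url = "https://example.com/a/b?x=1"   # illustrative URL, not taken from the commit
hname = "data/html/" + urllib.parse.quote(url, safe="")
# hname == "data/html/https%3A%2F%2Fexample.com%2Fa%2Fb%3Fx%3D1"

doc = {"url": url, "html_filename": hname}
print(json.dumps(doc))                # one JSONL record per extracted page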

View File

@@ -673,7 +673,6 @@ def index_pages(db,hostname,extracted_pages,filter_content):
    save_batch_info(db,hostname,final_states,docs)
def fetch_and_extract(links,rules):
    print("Processing links")
    responses = []
    for link in links:
        responses.append(fetch_page(link))
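
Only the first lines of fetch_and_extract are visible in this hunk. Judging from how processlinks consumes its result in the first file, the function is expected to return a pair (extracted_pages, extracted_links), where each page unpacks as a (url, html, doc) tuple. The stub below is a hypothetical stand-in illustrating that shape only; the real fetching and extraction happens inside mongocrawler.

def fetch_and_extract_stub(links, rules):
    # Hypothetical stand-in, not the project's implementation: it only mimics
    # the return shape that processlinks unpacks above.
    extracted_pages = []
    extracted_links = []
    for link in links:
        html = "<html><body>placeholder</body></html>"  # real code fetches the page
        doc = {"text": "placeholder text"}              # real code runs the extractor
        extracted_pages.append((link, html, doc))
    return extracted_pages, extracted_links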