From 0d477b1ab353412f579270040c874c06b706d57a Mon Sep 17 00:00:00 2001 From: Daniel Hladek Date: Sat, 25 Mar 2023 13:48:38 +0100 Subject: [PATCH] zz --- mongo/mongocwarler.py | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/mongo/mongocwarler.py b/mongo/mongocwarler.py index 0562187..c8e045a 100644 --- a/mongo/mongocwarler.py +++ b/mongo/mongocwarler.py @@ -92,15 +92,6 @@ def get_link_doc(link,status="frontlink"): domain = courlan.extract_domain(link) return {"url":link,"host":host,"domain":domain,"status":status,"created_at":datetime.utcnow()} -def generic_visit(domain): - known_links = set(get_visited_links(domain)) - visit_links = [] - visit_links = trafilatura.find_feed_urls(domain) - if visit_links is None: - visit_links = trafilatura.sitemap_search(domain) - if visit_links is None: - visit_links = trafilatura.focused_crawler(dommain,known_links=known_links) - def fetch_pages(link_batch): htmls = [] @@ -118,12 +109,12 @@ def fetch_pages(link_batch): good = False LOGGER.error('not a 200 response: %s for URL %s', response.status, url) elif response.data is None or len(response.data) < MINFILESIZE: - LOGGER.error('too small/incorrect for URL %s', url) + LOGGER.error('too small/incorrect for URL %s', link) good = False # raise error instead? 
elif len(response.data) > MAXFILESIZE: good = False - LOGGER.error('too large: length %s for URL %s', len(response.data), url) + LOGGER.error('too large: length %s for URL %s', len(response.data), link) if good: html = trafilatura.utils.decode_response(response) final_link = response.url @@ -243,7 +234,6 @@ def index_links(db,extracted_links): except pymongo.errors.DuplicateKeyError as ex: pass - def get_links(db,domain,status,batch_size=BATCHSIZE): linkcol = db["links"] res = linkcol.find({"status":status,"host":domain},{"url":1},limit=batch_size) @@ -306,6 +296,29 @@ def createdb(): htmlcol = db["html"] htmlcol.create_index("url",unique=True) +@cli.command() +@click.argument("link") +def parseurl(link): + html = trafilatura.fetch_url(link,decode=True) + doc = trafilatura.bare_extraction(html) + import pprint + pprint.pprint(doc) + +@cli.command() +@click.argument("link") +def externaldomains(link): + html = trafilatura.fetch_url(link,decode=True) + external_links = courlan.extract_links(html,link,external_bool=True,language=LANGUAGE) + domains = set() + for l in external_links: + r = courlan.check_url(l) + if r is None: + continue + link,domain = r + domains.add(domain) + for d in domains: + print(d) + +@cli.command() +@click.argument("start_link") +def visit(start_link):