merge

zz
2023-03-25 13:50:12 +01:00 · 2023-03-25 13:48:38 +01:00
1 changed files with 25 additions and 12 deletions
--- a/mongo/mongocwarler.py
+++ b/mongo/mongocwarler.py
@ -93,15 +93,6 @@ def get_link_doc(link,status="frontlink"):
    domain = courlan.extract_domain(link)
    return {"url":link,"host":host,"domain":domain,"status":status,"created_at":datetime.utcnow()}

-def generic_visit(hostname):
-    known_links = set(get_visited_links(hostname))
-    visit_links = []
-    visit_links = trafilatura.find_feed_urls(hostname)
-    if visit_links is None:
-        visit_links = trafilatura.sitemap_search(hostname)
-    if visit_links is None:
-        visit_links = trafilatura.focused_crawler(dommain,known_links=known_links)
-

 def fetch_pages(link_batch):
    htmls  = []
@ -119,12 +110,12 @@ def fetch_pages(link_batch):
                good = False
                LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
            elif response.data is None or len(response.data) < MINFILESIZE:
-                LOGGER.error('too small/incorrect for URL %s', url)
+                LOGGER.error('too small/incorrect for URL %s', link)
                good = False
            # raise error instead?
            elif len(response.data) > MAXFILESIZE:
                good = False
-                LOGGER.error('too large: length %s for URL %s', len(response.data), url)
+                LOGGER.error('too large: length %s for URL %s', len(response.data), link)
            if good:
                html = trafilatura.utils.decode_response(response) 
                final_link = response.url
@ -247,7 +238,6 @@ def index_links(db,extracted_links):
        except pymongo.errors.DuplicateKeyError as ex:
            pass

-
 def get_links(db,hostname,status,batch_size=BATCHSIZE):
    linkcol = db["links"]
    res  = linkcol.find({"status":status,"host":hostname},{"url":1},limit=batch_size)
@ -310,6 +300,29 @@ def createdb():
    htmlcol = db["html"]
    htmlcol.create_index("url",unique=True)

+@cli.command()
+@click.argument("link")
+def parseurl(link):
+    # Download LINK, run trafilatura's bare extraction on the page and
+    # pretty-print whatever the extraction returns.
+    # NOTE(review): another `parseurl` definition follows later in this
+    # file; confirm the two commands do not shadow each other.
+    from pprint import pprint
+    page = trafilatura.fetch_url(link,decode=True)
+    pprint(trafilatura.bare_extraction(page))
+
+@cli.command()
+@click.argument("link")
+def externaldomains(link):
+    """Print the domain of every external link found on the page at LINK."""
+    html = trafilatura.fetch_url(link,decode=True)
+    if html is None:
+        # fetch_url returns None when the download fails; there is nothing
+        # to extract links from in that case.
+        click.echo("could not download " + link, err=True)
+        return
+    external_links = courlan.extract_links(html,link,external_bool=True,language=LANGUAGE)
+    domains = set()
+    for external_link in external_links:
+        checked = courlan.check_url(external_link)
+        if checked is None:
+            # Rejected URL: skip it, otherwise unpacking None below would
+            # raise TypeError.
+            continue
+        _, domain = checked
+        domains.add(domain)
+    # Sorted so the output is stable between runs (set order is arbitrary).
+    for domain in sorted(domains):
+        print(domain)
+
@cli.command()
@click.argument("start_link")
 def parseurl(start_link):
Author	SHA1	Message	Date
Daniel Hladek	75840f6d21	merge	2023-03-25 13:50:12 +01:00
Daniel Hladek	0d477b1ab3	zz	2023-03-25 13:48:38 +01:00