zz

commit 964ebb5bfc (parent 75840f6d21)
@@ -133,8 +133,6 @@ def fetch_robot(base_url):
     # exceptions happening here
     try:
         rules.read()
-        print("GOT robot")
-        print(rules)
         LOGGER.info('got robots')
     except Exception as exc:
         LOGGER.error('cannot read robots.txt: %s', exc)
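Note: this hunk strips the ad-hoc prints and keeps the logged guard around rules.read(). A minimal standalone sketch of the surviving pattern, assuming only the stdlib and that base_url is a scheme+host prefix (the body below is an illustration, not the file's full fetch_robot):

    import logging
    import urllib.robotparser

    LOGGER = logging.getLogger(__name__)

    def fetch_robot(base_url):
        # Read and parse robots.txt; return None when unreadable so the
        # caller can decide how to treat an unknown policy.
        rules = urllib.robotparser.RobotFileParser()
        rules.set_url(base_url + "/robots.txt")
        try:
            rules.read()
            LOGGER.info('got robots')
        except Exception as exc:
            LOGGER.error('cannot read robots.txt: %s', exc)
            rules = None
        return rules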
@@ -149,8 +147,6 @@ def extract_pages(link_batch,responses):
         assert original_link is not None
         if html is not None:
             doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=True,target_language=LANGUAGE)
-            print("html2doc")
-            print(text)
             if doc is not None:
                 if not "text" in doc or len(doc["text"]) < MINTEXTSIZE:
                     # text too small
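Note: besides removing debug output, this deletes print(text), which refers to a name not defined anywhere in the shown scope, so it also avoids a latent NameError. The surviving extract-and-validate step as a self-contained sketch (the MINTEXTSIZE and LANGUAGE values are assumptions):

    import trafilatura

    MINTEXTSIZE = 200
    LANGUAGE = "sk"

    def extract_doc(html, final_link):
        # bare_extraction returns a dict with metadata and main text,
        # or None when extraction fails.
        doc = trafilatura.bare_extraction(html, url=final_link,
                                          with_metadata=True,
                                          include_formatting=True,
                                          target_language=LANGUAGE)
        if doc is None or len(doc.get("text", "")) < MINTEXTSIZE:
            # text too small or missing
            return None
        return doc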
@@ -273,10 +269,12 @@ def link_summary(db,hostname):
     ])
     for item in res:
         print(item)
+    print(">>>Domain Content")
     contentcol = db["content"]
     res = contentcol.aggregate([
-        {"$match":{"hostname":hostname}},
-        {"$group":{"_id":None,"text_size_sum":{"$sum":"text_size"}}},
+        {"$match":{"host":hostname}},
+        #{"$project": {"textsum":{"$sum":"$text_size"}}}
+        {"$group":{"_id":None,"text_size_sum":{"$sum":"$text_size"}}},
     ])
     for item in res:
         print(item)
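Note: the real fix here is the "$" prefix. In a $group stage, "$text_size" is a field path, while the bare string "text_size" is a literal; $sum ignores non-numeric literals, so the old pipeline always reported a sum of 0. A runnable demonstration against a throwaway collection (client, database, and collection names are assumptions):

    import pymongo

    col = pymongo.MongoClient()["test"]["sumdemo"]
    col.drop()
    col.insert_many([{"host": "example.com", "text_size": 100},
                     {"host": "example.com", "text_size": 250}])

    for expr in ("text_size", "$text_size"):
        out = list(col.aggregate([
            {"$match": {"host": "example.com"}},
            {"$group": {"_id": None, "text_size_sum": {"$sum": expr}}},
        ]))
        print(expr, "->", out)
    # "text_size"  -> [{'_id': None, 'text_size_sum': 0}]   (string literal)
    # "$text_size" -> [{'_id': None, 'text_size_sum': 350}] (field path)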
@@ -296,13 +294,21 @@ def createdb():
     contentcol = db["content"]
     contentcol.create_index("url",unique=True)
-    #contentcol.create_index({"hostname":1})
+    #contentcol.create_index({"paragraph_checksums":1})
+    contentcol.create_index({"host":1})
     htmlcol = db["html"]
     htmlcol.create_index("url",unique=True)
 
 @cli.command()
 @click.argument("link")
 def parseurl(link):
     link,hostname = courlan.check_url(link)
+    rawrules = trafilatura.fetch_url("https://"+ hostname + "/robots.txt")
+    print(rawrules)
+    rules = urllib.robotparser.RobotFileParser()
+    rules.parse(rawrules.split("\n"))
+    print(rules.can_fetch("*",link))
+    print(rules.site_maps())
+    print(rules.crawl_delay("*"))
     html = trafilatura.fetch_url(link,decode=True)
     doc = trafilatura.bare_extraction(html)
     import pprint
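Note: trafilatura.fetch_url returns None on a failed request, so rawrules.split("\n") can raise AttributeError before the crawl even starts. A defensive variant of the new robots handling (a sketch under that assumption, not the file's final code; example.com stands in for a real host):

    import urllib.robotparser
    import trafilatura

    def parse_robots(hostname):
        rawrules = trafilatura.fetch_url("https://" + hostname + "/robots.txt")
        rules = urllib.robotparser.RobotFileParser()
        # Feed an empty ruleset when the fetch failed: a parser that was
        # never given any input answers False to every can_fetch() call.
        rules.parse(rawrules.split("\n") if rawrules is not None else [])
        return rules

    rules = parse_robots("example.com")
    print(rules.can_fetch("*", "https://example.com/"))
    print(rules.site_maps())      # available since Python 3.8
    print(rules.crawl_delay("*"))

For the index change, create_index({"host":1}) passes a dict; the list-of-tuples form create_index([("host", 1)]) is the spelling documented across PyMongo versions.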
@@ -323,19 +329,6 @@ def externaldomains(link):
     for d in domains:
         print(d)
 
-@cli.command()
-@click.argument("start_link")
-def parseurl(start_link):
-    link,hostname = courlan.check_url(start_link)
-    links = [link]
-    responses = fetch_pages(links)
-    #pprint.pprint(responses)
-    extracted_pages = extract_pages(links,responses)
-    for ol,bl,html,doc in extracted_pages:
-        pprint.pprint(doc)
-    extracted_links = extract_links(links,responses,hostname,None,"backlink")
-    pprint.pprint(extracted_links)
-
-
+
 @cli.command()
 @click.argument("start_link")