Daniel Hládek 2023-03-25 14:39:36 +01:00
parent 75840f6d21
commit 964ebb5bfc


@@ -133,8 +133,6 @@ def fetch_robot(base_url):
     # exceptions happening here
     try:
         rules.read()
-        print("GOT robot")
-        print(rules)
         LOGGER.info('got robots')
     except Exception as exc:
         LOGGER.error('cannot read robots.txt: %s', exc)
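Here rules is a urllib.robotparser.RobotFileParser whose target URL was set earlier in fetch_robot, so read() downloads and parses robots.txt in one step and the removed prints only duplicated the existing logging. A minimal standalone sketch of the same pattern, assuming a base_url/robots.txt construction that the surrounding (unshown) code is presumed to use:

import logging
import urllib.robotparser

LOGGER = logging.getLogger(__name__)

def fetch_robot(base_url):
    # hypothetical simplified version of the committed helper:
    # point the parser at the site's robots.txt and fetch it,
    # falling back to None if the fetch or parse fails
    rules = urllib.robotparser.RobotFileParser()
    rules.set_url("https://" + base_url + "/robots.txt")
    try:
        rules.read()
        LOGGER.info('got robots')
    except Exception as exc:
        LOGGER.error('cannot read robots.txt: %s', exc)
        rules = None
    return rules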
@@ -149,8 +147,6 @@ def extract_pages(link_batch,responses):
         assert original_link is not None
         if html is not None:
             doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=True,target_language=LANGUAGE)
-            print("html2doc")
-            print(text)
             if doc is not None:
                 if not "text" in doc or len(doc["text"]) < MINTEXTSIZE:
                     # text too small
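trafilatura.bare_extraction takes raw HTML plus keyword options (url, with_metadata, include_formatting, target_language) and, in the version this repo appears to target, returns the extracted document as a dict or None when nothing usable is found, which is why the surviving code checks the "text" key against MINTEXTSIZE. A minimal sketch of that guard in isolation, with a placeholder threshold:

import trafilatura

MINTEXTSIZE = 200  # placeholder value, not taken from the repo

def html_to_doc(html, url):
    # extract main content; reject failed extractions and documents
    # whose main text is shorter than the minimum size
    doc = trafilatura.bare_extraction(html, url=url, with_metadata=True)
    if doc is None:
        return None
    if "text" not in doc or len(doc["text"]) < MINTEXTSIZE:
        return None
    return doc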
@@ -273,10 +269,12 @@ def link_summary(db,hostname):
     ])
     for item in res:
         print(item)
-    print(">>>Domain Content")
     contentcol = db["content"]
     res = contentcol.aggregate([
-        {"$match":{"hostname":hostname}},
-        {"$group":{"_id":None,"text_size_sum":{"$sum":"text_size"}}},
+        {"$match":{"host":hostname}},
+        #{"$project": {"textsum":{"$sum":"$text_size"}}}
+        {"$group":{"_id":None,"text_size_sum":{"$sum":"$text_size"}}},
     ])
     for item in res:
         print(item)
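The change in this hunk matters because, inside a $group stage, $sum only totals a field when it is given the "$"-prefixed field path; the bare string "text_size" is a literal, and non-numeric literals contribute nothing, so the old pipeline always reported 0. A minimal pymongo sketch of the corrected aggregation, with connection details and the hostname as placeholders:

from pymongo import MongoClient

# placeholder connection, database and hostname, for illustration only
db = MongoClient("mongodb://localhost:27017")["crawler"]
contentcol = db["content"]

res = contentcol.aggregate([
    {"$match": {"host": "example.com"}},
    # "$text_size" is a field path, summed per matched document;
    # the unprefixed "text_size" would be treated as a literal
    {"$group": {"_id": None, "text_size_sum": {"$sum": "$text_size"}}},
])
for item in res:
    print(item)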
@@ -296,13 +294,21 @@ def createdb():
     contentcol = db["content"]
     contentcol.create_index("url",unique=True)
     #contentcol.create_index({"paragraph_checksums":1})
-    #contentcol.create_index({"hostname":1})
+    contentcol.create_index({"host":1})
     htmlcol = db["html"]
     htmlcol.create_index("url",unique=True)

 @cli.command()
 @click.argument("link")
 def parseurl(link):
+    link,hostname = courlan.check_url(link)
+    rawrules = trafilatura.fetch_url("https://"+ hostname + "/robots.txt")
+    print(rawrules)
+    rules = urllib.robotparser.RobotFileParser()
+    rules.parse(rawrules.split("\n"))
+    print(rules.can_fetch("*",link))
+    print(rules.site_maps())
+    print(rules.crawl_delay("*"))
     html = trafilatura.fetch_url(link,decode=True)
     doc = trafilatura.bare_extraction(html)
     import pprint
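The new parseurl body feeds a robots.txt that was already fetched as text into RobotFileParser.parse() rather than letting the parser download it with read(), then queries can_fetch, site_maps and crawl_delay. A minimal sketch of that flow on its own, with a hypothetical guard for a failed fetch (the committed code assumes trafilatura.fetch_url returns a string):

import urllib.robotparser

import courlan
import trafilatura

def check_robots(url):
    checked = courlan.check_url(url)      # (clean_url, hostname), or None if the URL is rejected
    if checked is None:
        return None
    link, hostname = checked
    rawrules = trafilatura.fetch_url("https://" + hostname + "/robots.txt")
    rules = urllib.robotparser.RobotFileParser()
    if rawrules is not None:              # hypothetical guard: the fetch may fail
        rules.parse(rawrules.split("\n"))
    print(rules.can_fetch("*", link))     # may this URL be fetched by any agent?
    print(rules.site_maps())              # Sitemap: entries, or None
    print(rules.crawl_delay("*"))         # Crawl-delay for any agent, or None
    return rules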
@@ -323,19 +329,6 @@ def externaldomains(link):
     for d in domains:
         print(d)

-@cli.command()
-@click.argument("start_link")
-def parseurl(start_link):
-    link,hostname = courlan.check_url(start_link)
-    links = [link]
-    responses = fetch_pages(links)
-    #pprint.pprint(responses)
-    extracted_pages = extract_pages(links,responses)
-    for ol,bl,html,doc in extracted_pages:
-        pprint.pprint(doc)
-    extracted_links = extract_links(links,responses,hostname,None,"backlink")
-    pprint.pprint(extracted_links)

 @cli.command()
 @click.argument("start_link")