zz

commit 964ebb5bfc (parent 75840f6d21)
@@ -133,8 +133,6 @@ def fetch_robot(base_url):
     # exceptions happening here
     try:
         rules.read()
-        print("GOT robot")
-        print(rules)
         LOGGER.info('got robots')
     except Exception as exc:
         LOGGER.error('cannot read robots.txt: %s', exc)
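Note: this hunk strips the ad-hoc prints and keeps the logged guard around rules.read(). A minimal standalone sketch of the surviving pattern, assuming only the stdlib and that base_url is a scheme+host prefix (the body below is an illustration, not the file's full fetch_robot):

    import logging
    import urllib.robotparser

    LOGGER = logging.getLogger(__name__)

    def fetch_robot(base_url):
        # Read and parse robots.txt; return None when unreadable so the
        # caller can decide how to treat an unknown policy.
        rules = urllib.robotparser.RobotFileParser()
        rules.set_url(base_url + "/robots.txt")
        try:
            rules.read()
            LOGGER.info('got robots')
        except Exception as exc:
            LOGGER.error('cannot read robots.txt: %s', exc)
            rules = None
        return rules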
@@ -149,8 +147,6 @@ def extract_pages(link_batch,responses):
         assert original_link is not None
         if html is not None:
             doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=True,target_language=LANGUAGE)
-            print("html2doc")
-            print(text)
             if doc is not None:
                 if not "text" in doc or len(doc["text"]) < MINTEXTSIZE:
                     # text too small
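Note: besides removing debug output, this deletes print(text), which refers to a name not defined anywhere in the shown scope, so it also avoids a latent NameError. The surviving extract-and-validate step as a self-contained sketch (the MINTEXTSIZE and LANGUAGE values are assumptions):

    import trafilatura

    MINTEXTSIZE = 200
    LANGUAGE = "sk"

    def extract_doc(html, final_link):
        # bare_extraction returns a dict with metadata and main text,
        # or None when extraction fails.
        doc = trafilatura.bare_extraction(html, url=final_link,
                                          with_metadata=True,
                                          include_formatting=True,
                                          target_language=LANGUAGE)
        if doc is None or len(doc.get("text", "")) < MINTEXTSIZE:
            # text too small or missing
            return None
        return doc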
@@ -273,10 +269,12 @@ def link_summary(db,hostname):
     ])
     for item in res:
         print(item)
+    print(">>>Domain Content")
     contentcol = db["content"]
     res = contentcol.aggregate([
-        {"$match":{"hostname":hostname}},
-        {"$group":{"_id":None,"text_size_sum":{"$sum":"text_size"}}},
+        {"$match":{"host":hostname}},
+        #{"$project": {"textsum":{"$sum":"$text_size"}}}
+        {"$group":{"_id":None,"text_size_sum":{"$sum":"$text_size"}}},
     ])
     for item in res:
         print(item)
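Note: the real fix here is the "$" prefix. In a $group stage, "$text_size" is a field path, while the bare string "text_size" is a literal; $sum ignores non-numeric literals, so the old pipeline always reported a sum of 0. A runnable demonstration against a throwaway collection (client, database, and collection names are assumptions):

    import pymongo

    col = pymongo.MongoClient()["test"]["sumdemo"]
    col.drop()
    col.insert_many([{"host": "example.com", "text_size": 100},
                     {"host": "example.com", "text_size": 250}])

    for expr in ("text_size", "$text_size"):
        out = list(col.aggregate([
            {"$match": {"host": "example.com"}},
            {"$group": {"_id": None, "text_size_sum": {"$sum": expr}}},
        ]))
        print(expr, "->", out)
    # "text_size"  -> [{'_id': None, 'text_size_sum': 0}]   (string literal)
    # "$text_size" -> [{'_id': None, 'text_size_sum': 350}] (field path)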
@@ -296,13 +294,21 @@ def createdb():
     contentcol = db["content"]
     contentcol.create_index("url",unique=True)
-    #contentcol.create_index({"hostname":1})
+    #contentcol.create_index({"paragraph_checksums":1})
+    contentcol.create_index({"host":1})
     htmlcol = db["html"]
     htmlcol.create_index("url",unique=True)
 
 @cli.command()
 @click.argument("link")
 def parseurl(link):
     link,hostname = courlan.check_url(link)
+    rawrules = trafilatura.fetch_url("https://"+ hostname + "/robots.txt")
+    print(rawrules)
+    rules = urllib.robotparser.RobotFileParser()
+    rules.parse(rawrules.split("\n"))
+    print(rules.can_fetch("*",link))
+    print(rules.site_maps())
+    print(rules.crawl_delay("*"))
     html = trafilatura.fetch_url(link,decode=True)
     doc = trafilatura.bare_extraction(html)
     import pprint
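Note: trafilatura.fetch_url returns None on a failed request, so rawrules.split("\n") can raise AttributeError before the crawl even starts. A defensive variant of the new robots handling (a sketch under that assumption, not the file's final code; example.com stands in for a real host):

    import urllib.robotparser
    import trafilatura

    def parse_robots(hostname):
        rawrules = trafilatura.fetch_url("https://" + hostname + "/robots.txt")
        rules = urllib.robotparser.RobotFileParser()
        # Feed an empty ruleset when the fetch failed: a parser that was
        # never given any input answers False to every can_fetch() call.
        rules.parse(rawrules.split("\n") if rawrules is not None else [])
        return rules

    rules = parse_robots("example.com")
    print(rules.can_fetch("*", "https://example.com/"))
    print(rules.site_maps())      # available since Python 3.8
    print(rules.crawl_delay("*"))

For the index change, create_index({"host":1}) passes a dict; the list-of-tuples form create_index([("host", 1)]) is the spelling documented across PyMongo versions.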
@@ -323,19 +329,6 @@ def externaldomains(link):
     for d in domains:
         print(d)
 
-@cli.command()
-@click.argument("start_link")
-def parseurl(start_link):
-    link,hostname = courlan.check_url(start_link)
-    links = [link]
-    responses = fetch_pages(links)
-    #pprint.pprint(responses)
-    extracted_pages = extract_pages(links,responses)
-    for ol,bl,html,doc in extracted_pages:
-        pprint.pprint(doc)
-    extracted_links = extract_links(links,responses,hostname,None,"backlink")
-    pprint.pprint(extracted_links)
-
-
+
 @cli.command()
 @click.argument("start_link")