Daniel Hládek 2023-03-25 14:39:36 +01:00
parent 75840f6d21
commit 964ebb5bfc


@@ -133,8 +133,6 @@ def fetch_robot(base_url):
     # exceptions happening here
     try:
         rules.read()
-        print("GOT robot")
-        print(rules)
         LOGGER.info('got robots')
     except Exception as exc:
         LOGGER.error('cannot read robots.txt: %s', exc)
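The two debug prints around `rules.read()` are dropped; the existing LOGGER calls already report success or failure. For reference, a minimal sketch of the `urllib.robotparser` pattern this function relies on (the function name and error handling here are illustrative, not the project's exact code):

```python
import urllib.robotparser

def fetch_robot_sketch(base_url):
    # Point the parser at the site's robots.txt and let it fetch the file itself.
    rules = urllib.robotparser.RobotFileParser()
    rules.set_url("https://" + base_url + "/robots.txt")
    try:
        rules.read()
    except Exception as exc:
        # An unreadable robots.txt is treated as "no restrictions" here.
        print("cannot read robots.txt:", exc)
        rules = None
    return rules
```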
@@ -149,8 +147,6 @@ def extract_pages(link_batch,responses):
         assert original_link is not None
         if html is not None:
             doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=True,target_language=LANGUAGE)
-            print("html2doc")
-            print(text)
             if doc is not None:
                 if not "text" in doc or len(doc["text"]) < MINTEXTSIZE:
                     # text too small
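The `html2doc` debug prints are removed as well. The surrounding logic calls `trafilatura.bare_extraction` and discards documents whose extracted text is shorter than MINTEXTSIZE. A rough, self-contained sketch of that check; the threshold value and language code are placeholders, not the project's settings:

```python
import trafilatura

MINTEXTSIZE = 200  # assumed threshold; the project defines its own constant

def extract_doc(html, final_link, language="sk"):
    # bare_extraction returns a dict with metadata and extracted text, or None.
    doc = trafilatura.bare_extraction(html, url=final_link, with_metadata=True,
                                      include_formatting=True,
                                      target_language=language)
    if doc is None:
        return None
    if "text" not in doc or len(doc["text"]) < MINTEXTSIZE:
        # Extracted text too small to be worth keeping.
        return None
    return doc
```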
@@ -273,10 +269,12 @@ def link_summary(db,hostname):
     ])
     for item in res:
         print(item)
+    print(">>>Domain Content")
     contentcol = db["content"]
     res = contentcol.aggregate([
-        {"$match":{"hostname":hostname}},
-        {"$group":{"_id":None,"text_size_sum":{"$sum":"text_size"}}},
+        {"$match":{"host":hostname}},
+        #{"$project": {"textsum":{"$sum":"$text_size"}}}
+        {"$group":{"_id":None,"text_size_sum":{"$sum":"$text_size"}}},
     ])
     for item in res:
         print(item)
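Two fixes land in this aggregation: the `$match` stage now filters on the `host` field (apparently the name actually stored on content documents, matching the new index below) and `$sum` now receives `"$text_size"` with a leading `$`, so it sums the field rather than a literal string, which MongoDB would silently count as 0. A standalone pymongo sketch of the corrected pipeline; the connection string and database name are assumptions:

```python
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")  # assumed connection string
db = client["crawler"]                              # assumed database name

def domain_text_size(db, hostname):
    # Sum text_size over all content documents belonging to one host.
    # "$text_size" (with the dollar sign) references the field; a bare
    # "text_size" would be treated as a constant string and always sum to 0.
    pipeline = [
        {"$match": {"host": hostname}},
        {"$group": {"_id": None, "text_size_sum": {"$sum": "$text_size"}}},
    ]
    for item in db["content"].aggregate(pipeline):
        print(item)
```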
@@ -296,13 +294,21 @@ def createdb():
     contentcol = db["content"]
     contentcol.create_index("url",unique=True)
     #contentcol.create_index({"paragraph_checksums":1})
     #contentcol.create_index({"hostname":1})
+    contentcol.create_index({"host":1})
     htmlcol = db["html"]
     htmlcol.create_index("url",unique=True)
 @cli.command()
 @click.argument("link")
 def parseurl(link):
     link,hostname = courlan.check_url(link)
+    rawrules = trafilatura.fetch_url("https://"+ hostname + "/robots.txt")
+    print(rawrules)
+    rules = urllib.robotparser.RobotFileParser()
+    rules.parse(rawrules.split("\n"))
+    print(rules.can_fetch("*",link))
+    print(rules.site_maps())
+    print(rules.crawl_delay("*"))
     html = trafilatura.fetch_url(link,decode=True)
     doc = trafilatura.bare_extraction(html)
     import pprint
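The reworked `parseurl` command now downloads a site's robots.txt with trafilatura and feeds it to `urllib.robotparser` via `parse()`, which accepts an iterable of lines, then prints what the rules allow. A hedged sketch of the same steps, with an added guard for the case where the robots.txt download returns None (the committed code does not include that check):

```python
import urllib.robotparser

import courlan
import trafilatura

def inspect_robots(link):
    # check_url normalises the URL and returns it together with its hostname.
    link, hostname = courlan.check_url(link)
    rawrules = trafilatura.fetch_url("https://" + hostname + "/robots.txt")
    rules = urllib.robotparser.RobotFileParser()
    if rawrules is not None:
        # parse() consumes lines we already downloaded, unlike read(),
        # which would fetch the file itself.
        rules.parse(rawrules.split("\n"))
    print(rules.can_fetch("*", link))   # may this URL be crawled by any agent?
    print(rules.site_maps())            # Sitemap: URLs, or None if absent
    print(rules.crawl_delay("*"))       # Crawl-delay value, or None if absent
```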
@@ -323,19 +329,6 @@ def externaldomains(link):
     for d in domains:
         print(d)
-@cli.command()
-@click.argument("start_link")
-def parseurl(start_link):
-    link,hostname = courlan.check_url(start_link)
-    links = [link]
-    responses = fetch_pages(links)
-    #pprint.pprint(responses)
-    extracted_pages = extract_pages(links,responses)
-    for ol,bl,html,doc in extracted_pages:
-        pprint.pprint(doc)
-    extracted_links = extract_links(links,responses,hostname,None,"backlink")
-    pprint.pprint(extracted_links)
 @cli.command()
 @click.argument("start_link")