Daniel Hládek 2023-03-25 14:39:36 +01:00
parent 75840f6d21
commit 964ebb5bfc


@@ -133,8 +133,6 @@ def fetch_robot(base_url):
     # exceptions happening here
     try:
         rules.read()
-        print("GOT robot")
-        print(rules)
         LOGGER.info('got robots')
     except Exception as exc:
         LOGGER.error('cannot read robots.txt: %s', exc)
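Here rules is a urllib.robotparser.RobotFileParser whose target URL was set earlier in fetch_robot, so read() downloads and parses robots.txt in one step and the removed prints only duplicated the existing logging. A minimal standalone sketch of the same pattern, assuming a base_url/robots.txt construction that the surrounding (unshown) code is presumed to use:

import logging
import urllib.robotparser

LOGGER = logging.getLogger(__name__)

def fetch_robot(base_url):
    # hypothetical simplified version of the committed helper:
    # point the parser at the site's robots.txt and fetch it,
    # falling back to None if the fetch or parse fails
    rules = urllib.robotparser.RobotFileParser()
    rules.set_url("https://" + base_url + "/robots.txt")
    try:
        rules.read()
        LOGGER.info('got robots')
    except Exception as exc:
        LOGGER.error('cannot read robots.txt: %s', exc)
        rules = None
    return rules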
@@ -149,8 +147,6 @@ def extract_pages(link_batch,responses):
         assert original_link is not None
         if html is not None:
             doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=True,target_language=LANGUAGE)
-            print("html2doc")
-            print(text)
             if doc is not None:
                 if not "text" in doc or len(doc["text"]) < MINTEXTSIZE:
                     # text too small
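trafilatura.bare_extraction takes raw HTML plus keyword options (url, with_metadata, include_formatting, target_language) and, in the version this repo appears to target, returns the extracted document as a dict or None when nothing usable is found, which is why the surviving code checks the "text" key against MINTEXTSIZE. A minimal sketch of that guard in isolation, with a placeholder threshold:

import trafilatura

MINTEXTSIZE = 200  # placeholder value, not taken from the repo

def html_to_doc(html, url):
    # extract main content; reject failed extractions and documents
    # whose main text is shorter than the minimum size
    doc = trafilatura.bare_extraction(html, url=url, with_metadata=True)
    if doc is None:
        return None
    if "text" not in doc or len(doc["text"]) < MINTEXTSIZE:
        return None
    return doc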
@@ -273,10 +269,12 @@ def link_summary(db,hostname):
     ])
     for item in res:
         print(item)
-    print(">>>Domain Content")
     contentcol = db["content"]
     res = contentcol.aggregate([
-        {"$match":{"hostname":hostname}},
-        {"$group":{"_id":None,"text_size_sum":{"$sum":"text_size"}}},
+        {"$match":{"host":hostname}},
+        #{"$project": {"textsum":{"$sum":"$text_size"}}}
+        {"$group":{"_id":None,"text_size_sum":{"$sum":"$text_size"}}},
     ])
     for item in res:
         print(item)
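The change in this hunk matters because, inside a $group stage, $sum only totals a field when it is given the "$"-prefixed field path; the bare string "text_size" is a literal, and non-numeric literals contribute nothing, so the old pipeline always reported 0. A minimal pymongo sketch of the corrected aggregation, with connection details and the hostname as placeholders:

from pymongo import MongoClient

# placeholder connection, database and hostname, for illustration only
db = MongoClient("mongodb://localhost:27017")["crawler"]
contentcol = db["content"]

res = contentcol.aggregate([
    {"$match": {"host": "example.com"}},
    # "$text_size" is a field path, summed per matched document;
    # the unprefixed "text_size" would be treated as a literal
    {"$group": {"_id": None, "text_size_sum": {"$sum": "$text_size"}}},
])
for item in res:
    print(item)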
@@ -296,13 +294,21 @@ def createdb():
     contentcol = db["content"]
     contentcol.create_index("url",unique=True)
     #contentcol.create_index({"paragraph_checksums":1})
-    #contentcol.create_index({"hostname":1})
+    contentcol.create_index({"host":1})
     htmlcol = db["html"]
     htmlcol.create_index("url",unique=True)

 @cli.command()
 @click.argument("link")
 def parseurl(link):
+    link,hostname = courlan.check_url(link)
+    rawrules = trafilatura.fetch_url("https://"+ hostname + "/robots.txt")
+    print(rawrules)
+    rules = urllib.robotparser.RobotFileParser()
+    rules.parse(rawrules.split("\n"))
+    print(rules.can_fetch("*",link))
+    print(rules.site_maps())
+    print(rules.crawl_delay("*"))
     html = trafilatura.fetch_url(link,decode=True)
     doc = trafilatura.bare_extraction(html)
     import pprint
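The new parseurl body feeds a robots.txt that was already fetched as text into RobotFileParser.parse() rather than letting the parser download it with read(), then queries can_fetch, site_maps and crawl_delay. A minimal sketch of that flow on its own, with a hypothetical guard for a failed fetch (the committed code assumes trafilatura.fetch_url returns a string):

import urllib.robotparser

import courlan
import trafilatura

def check_robots(url):
    checked = courlan.check_url(url)      # (clean_url, hostname), or None if the URL is rejected
    if checked is None:
        return None
    link, hostname = checked
    rawrules = trafilatura.fetch_url("https://" + hostname + "/robots.txt")
    rules = urllib.robotparser.RobotFileParser()
    if rawrules is not None:              # hypothetical guard: the fetch may fail
        rules.parse(rawrules.split("\n"))
    print(rules.can_fetch("*", link))     # may this URL be fetched by any agent?
    print(rules.site_maps())              # Sitemap: entries, or None
    print(rules.crawl_delay("*"))         # Crawl-delay for any agent, or None
    return rules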
@@ -323,19 +329,6 @@ def externaldomains(link):
     for d in domains:
         print(d)

-@cli.command()
-@click.argument("start_link")
-def parseurl(start_link):
-    link,hostname = courlan.check_url(start_link)
-    links = [link]
-    responses = fetch_pages(links)
-    #pprint.pprint(responses)
-    extracted_pages = extract_pages(links,responses)
-    for ol,bl,html,doc in extracted_pages:
-        pprint.pprint(doc)
-    extracted_links = extract_links(links,responses,hostname,None,"backlink")
-    pprint.pprint(extracted_links)

 @cli.command()
 @click.argument("start_link")