This commit is contained in:
Daniel Hládek 2023-03-14 10:59:58 +01:00
parent 9b578a7660
commit 6eca731e42

View File

@@ -141,6 +141,9 @@ def fetch_robot(base_url):
# exceptions happening here # exceptions happening here
try: try:
rules.read() rules.read()
print("GOT robot")
print(rules)
LOGGER.info('got robots')
except Exception as exc: except Exception as exc:
LOGGER.error('cannot read robots.txt: %s', exc) LOGGER.error('cannot read robots.txt: %s', exc)
rules = None rules = None
@@ -178,11 +181,13 @@ def index_pages(db,domain,extracted_pages):
elif doc is None: elif doc is None:
state = "content_error" state = "content_error"
if doc is not None: if doc is not None:
checksums,sizes = calculate_checksums(doc["text"]) text = doc["text"]
doc["text_size"] = len(doc["text"]) checksums,sizes = calculate_checksums(text)
doc["text_size"] = len(text)
doc["paragraph_checksums"] = checksums doc["paragraph_checksums"] = checksums
doc["paragraph_sizes"] = sizes doc["paragraph_sizes"] = sizes
if len(checksums) < 1: goodsz = sum(sizes)
if len(text) < 200 or goodsz/len(text) < 0.3:
state = "trash" state = "trash"
if state == "good": if state == "good":
htdoc = get_link_doc(link,state) htdoc = get_link_doc(link,state)
@@ -212,13 +217,18 @@ def extract_links(link_batch,responses,domain,rules,default_status="frontlink"):
#print(link,status) #print(link,status)
links[link] = status links[link] = status
outlinks = [] outlinks = []
badrobot = 0
badlink = 0
for link,status in links.items(): for link,status in links.items():
if not is_robot_good(link,rules): if not is_robot_good(link,rules):
badrobot += 1
continue continue
link = is_link_good(link) link = is_link_good(link)
if link is None: if link is None:
badlink += 1
continue continue
outlinks.append((link,status)) outlinks.append((link,status))
print(f"{len(links)} total links, {badrobot} badrobot {badlink} badlinks")
return outlinks return outlinks
def index_links(db,extracted_links): def index_links(db,extracted_links):