zz
This commit is contained in:
parent
9b578a7660
commit
6eca731e42
@ -141,6 +141,9 @@ def fetch_robot(base_url):
|
||||
# exceptions happening here
|
||||
try:
|
||||
rules.read()
|
||||
print("GOT robot")
|
||||
print(rules)
|
||||
LOGGER.info('got robots')
|
||||
except Exception as exc:
|
||||
LOGGER.error('cannot read robots.txt: %s', exc)
|
||||
rules = None
|
||||
@ -178,11 +181,13 @@ def index_pages(db,domain,extracted_pages):
|
||||
elif doc is None:
|
||||
state = "content_error"
|
||||
if doc is not None:
|
||||
checksums,sizes = calculate_checksums(doc["text"])
|
||||
doc["text_size"] = len(doc["text"])
|
||||
text = doc["text"]
|
||||
checksums,sizes = calculate_checksums(text)
|
||||
doc["text_size"] = len(text)
|
||||
doc["paragraph_checksums"] = checksums
|
||||
doc["paragraph_sizes"] = sizes
|
||||
if len(checksums) < 1:
|
||||
goodsz = sum(sizes)
|
||||
if len(text) < 200 or goodsz/len(text) < 0.3:
|
||||
state = "trash"
|
||||
if state == "good":
|
||||
htdoc = get_link_doc(link,state)
|
||||
@ -212,13 +217,18 @@ def extract_links(link_batch,responses,domain,rules,default_status="frontlink"):
|
||||
#print(link,status)
|
||||
links[link] = status
|
||||
outlinks = []
|
||||
badrobot = 0
|
||||
badlink = 0
|
||||
for link,status in links.items():
|
||||
if not is_robot_good(link,rules):
|
||||
badrobot += 1
|
||||
continue
|
||||
link = is_link_good(link)
|
||||
if link is None:
|
||||
badlink += 1
|
||||
continue
|
||||
outlinks.append((link,status))
|
||||
print(f"{len(links)} total links, {badrobot} badrobot {badlink} badlinks")
|
||||
return outlinks
|
||||
|
||||
def index_links(db,extracted_links):
|
||||
|
Loading…
Reference in New Issue
Block a user