zz
This commit is contained in:
parent
9b578a7660
commit
6eca731e42
@ -141,6 +141,9 @@ def fetch_robot(base_url):
|
|||||||
# exceptions happening here
|
# exceptions happening here
|
||||||
try:
|
try:
|
||||||
rules.read()
|
rules.read()
|
||||||
|
print("GOT robot")
|
||||||
|
print(rules)
|
||||||
|
LOGGER.info('got robots')
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
LOGGER.error('cannot read robots.txt: %s', exc)
|
LOGGER.error('cannot read robots.txt: %s', exc)
|
||||||
rules = None
|
rules = None
|
||||||
@ -178,11 +181,13 @@ def index_pages(db,domain,extracted_pages):
|
|||||||
elif doc is None:
|
elif doc is None:
|
||||||
state = "content_error"
|
state = "content_error"
|
||||||
if doc is not None:
|
if doc is not None:
|
||||||
checksums,sizes = calculate_checksums(doc["text"])
|
text = doc["text"]
|
||||||
doc["text_size"] = len(doc["text"])
|
checksums,sizes = calculate_checksums(text)
|
||||||
|
doc["text_size"] = len(text)
|
||||||
doc["paragraph_checksums"] = checksums
|
doc["paragraph_checksums"] = checksums
|
||||||
doc["paragraph_sizes"] = sizes
|
doc["paragraph_sizes"] = sizes
|
||||||
if len(checksums) < 1:
|
goodsz = sum(sizes)
|
||||||
|
if len(text) < 200 or goodsz/len(text) < 0.3:
|
||||||
state = "trash"
|
state = "trash"
|
||||||
if state == "good":
|
if state == "good":
|
||||||
htdoc = get_link_doc(link,state)
|
htdoc = get_link_doc(link,state)
|
||||||
@ -212,13 +217,18 @@ def extract_links(link_batch,responses,domain,rules,default_status="frontlink"):
|
|||||||
#print(link,status)
|
#print(link,status)
|
||||||
links[link] = status
|
links[link] = status
|
||||||
outlinks = []
|
outlinks = []
|
||||||
|
badrobot = 0
|
||||||
|
badlink = 0
|
||||||
for link,status in links.items():
|
for link,status in links.items():
|
||||||
if not is_robot_good(link,rules):
|
if not is_robot_good(link,rules):
|
||||||
|
badrobot += 1
|
||||||
continue
|
continue
|
||||||
link = is_link_good(link)
|
link = is_link_good(link)
|
||||||
if link is None:
|
if link is None:
|
||||||
|
badlink += 1
|
||||||
continue
|
continue
|
||||||
outlinks.append((link,status))
|
outlinks.append((link,status))
|
||||||
|
print(f"{len(links)} total links, {badrobot} badrobot {badlink} badlinks")
|
||||||
return outlinks
|
return outlinks
|
||||||
|
|
||||||
def index_links(db,extracted_links):
|
def index_links(db,extracted_links):
|
||||||
|
Loading…
Reference in New Issue
Block a user