diff --git a/mongo/mongocwarler.py b/mongo/mongocwarler.py
index 71f3f3e..6c41301 100644
--- a/mongo/mongocwarler.py
+++ b/mongo/mongocwarler.py
@@ -141,6 +141,9 @@ def fetch_robot(base_url):
     # exceptions happening here
     try:
         rules.read()
+        print("GOT robot")
+        print(rules)
+        LOGGER.info('got robots')
     except Exception as exc:
         LOGGER.error('cannot read robots.txt: %s', exc)
         rules = None
@@ -178,11 +181,13 @@ def index_pages(db,domain,extracted_pages):
         elif doc is None:
             state = "content_error"
         if doc is not None:
-            checksums,sizes = calculate_checksums(doc["text"])
-            doc["text_size"] = len(doc["text"])
+            text = doc["text"]
+            checksums,sizes = calculate_checksums(text)
+            doc["text_size"] = len(text)
             doc["paragraph_checksums"] = checksums
             doc["paragraph_sizes"] = sizes
-            if len(checksums) < 1:
+            goodsz = sum(sizes)
+            if len(text) < 200 or goodsz/len(text) < 0.3:
                 state = "trash"
         if state == "good":
             htdoc = get_link_doc(link,state)
@@ -212,13 +217,18 @@ def extract_links(link_batch,responses,domain,rules,default_status="frontlink"):
             #print(link,status)
             links[link] = status
     outlinks = []
+    badrobot = 0
+    badlink = 0
     for link,status in links.items():
         if not is_robot_good(link,rules):
+            badrobot += 1
             continue
         link = is_link_good(link)
         if link is None:
+            badlink += 1
             continue
         outlinks.append((link,status))
+    print(f"{len(links)} total links, {badrobot} badrobot {badlink} badlinks")
     return outlinks
 
 def index_links(db,extracted_links):
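
# A minimal sketch of the new "trash" heuristic from the index_pages() hunk
# above. The patch does not show calculate_checksums(); the version below is
# a hypothetical stand-in that assumes it splits the text into paragraphs and
# returns one checksum and one size per paragraph worth keeping. The actual
# helper in mongocwarler.py may differ.
import hashlib

def calculate_checksums(text, min_paragraph_size=40):
    # One (checksum, size) pair per sufficiently long paragraph (assumed behavior).
    checksums, sizes = [], []
    for paragraph in text.split("\n"):
        paragraph = paragraph.strip()
        if len(paragraph) < min_paragraph_size:
            continue
        checksums.append(hashlib.md5(paragraph.encode("utf-8")).hexdigest())
        sizes.append(len(paragraph))
    return checksums, sizes

def is_trash(text):
    # Mirrors the patched condition in index_pages(): a document is discarded
    # if it is very short, or if its "good" paragraphs cover less than 30% of
    # the raw text. Note that len(text) < 200 short-circuits first, so the
    # division cannot hit a zero-length text.
    checksums, sizes = calculate_checksums(text)
    goodsz = sum(sizes)
    return len(text) < 200 or goodsz / len(text) < 0.3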