diff --git a/mongo/mongocwarler.py b/mongo/mongocwarler.py
index 07d4cbf..4466e0f 100644
--- a/mongo/mongocwarler.py
+++ b/mongo/mongocwarler.py
@@ -9,10 +9,14 @@ import courlan
 import urllib
 from datetime import datetime
 import click
+import logging as LOGGER
+import os
 
-LANGUAGE="sk"
-DOMAIN = "sk"
-BATCHSIZE=10
+LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
+DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
+BATCHSIZE=os.getenv("SUCKER_BATCHSIZE",10)
+CONNECTION=os.getenv("SUCKER_CONNECTION","mongodb://root:example@localhost:27017/")
+DBNAME=os.getenv("SUCKER_DBNAME","crawler")
 MINFILESIZE=300
 MAXFILESIZE=10000000
 MINTEXTSIZE=200
@@ -48,7 +52,7 @@ def calculate_checksums(text):
 
 def is_robot_good(link,rules):
     # check robots.txt rules
-    if rules is not None and not rules.can_fetch("*", llink):
+    if rules is not None and not rules.can_fetch("*", link):
         return False
     return True
 
@@ -61,10 +65,10 @@ def is_link_good(link):
     print(llink,ldomain)
     # domain rules
     if not ldomain.endswith(DOMAIN):
-        print("bad domain")
+        LOGGER.debug("bad domain")
         return None
     if courlan.is_not_crawlable(llink):
-        print("not crawlable")
+        LOGGER.debug("not crawlable")
         return None
     return llink
 
@@ -112,14 +116,14 @@ def fetch_pages(link_batch):
         good = True
         if response.status != 200:
             good = False
-            #LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
+            LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
         elif response.data is None or len(response.data) < MINFILESIZE:
-            #LOGGER.error('too small/incorrect for URL %s', url)
+            LOGGER.error('too small/incorrect for URL %s', url)
             good = False
             # raise error instead?
         elif len(response.data) > MAXFILESIZE:
             good = False
-            #LOGGER.error('too large: length %s for URL %s', len(response.data), url)
+            LOGGER.error('too large: length %s for URL %s', len(response.data), url)
         if good:
             html = trafilatura.utils.decode_response(response)
             final_link = response.url
@@ -133,12 +137,12 @@ def fetch_pages(link_batch):
 
 def fetch_robot(base_url):
     rules = urllib.robotparser.RobotFileParser()
-    rules.set_url(base_url + '/robots.txt')
+    rules.set_url("https://" + base_url + '/robots.txt')
     # exceptions happening here
     try:
         rules.read()
     except Exception as exc:
-        #LOGGER.error('cannot read robots.txt: %s', exc)
+        LOGGER.error('cannot read robots.txt: %s', exc)
         rules = None
     return rules
 
@@ -166,25 +170,31 @@ def index_pages(db,domain,extracted_pages):
     links = []
     for original_link,final_link,html,doc in extracted_pages:
         state = "good"
+        link = original_link
+        if original_link != final_link:
+            linkcol.insert_one(get_link_doc(original_link,"redirect"))
+            link = final_link
         if html is None:
             state = "html_error"
         elif doc is None:
             state = "content_error"
-        if original_link != final_link:
-            linkcol.insert_one(get_link_doc(final_link,state))
-            state = "redirect"
-        linkcol.update_one({"url":original_link},{"$set":{"status":state}})
         if doc is not None:
-            if html is not None:
-                htmlcol.insert_one({"url":final_link,"html":html,"html_size":len(html),"created_at":datetime.utcnow()})
             checksums,sizes = calculate_checksums(doc["text"])
-            doc["created_at"] = datetime.utcnow()
             doc["text_size"] = len(doc["text"])
             doc["paragraph_checksums"] = checksums
             doc["paragraph_sizes"] = sizes
+            if len(checksums) < 1:
+                state = "trash"
+            if state == "good":
+                htdoc = get_link_doc(link,state)
+                htdoc["html"] = html
+                htdoc["html_size"] = len(html)
+                htmlcol.insert_one(htdoc)
+            doc.update(get_link_doc(link,"good"))
             # todo extract links
             print(doc)
             contentcol.insert_one(doc)
+        linkcol.update_one({"url":original_link},{"$set":{"status":state}})
 
 
 def extract_links(link_batch,responses,domain,rules,default_status="frontlink"):
@@ -244,15 +254,32 @@ def process_links(db,domain,status,links=[],rules=None,batch_size=BATCHSIZE):
 def link_summary(db,domain):
     linkcol = db["links"]
     #res = linkcol.distinct("domain",{"hostname":domain})
-    
+
+    # count links
     res = linkcol.aggregate([
         {"$match":{"host":domain}},
         {"$group":{"_id":"$status","count":{"$sum":1}}},
     ])
     for item in res:
         print(item)
+    contentcol = db["content"]
+    res = contentcol.aggregate([
+        {"$match":{"hostname":domain}},
+        {"$group":{"_id":None,"text_size_sum":{"$sum":"text_size"}}},
+    ])
+    for item in res:
+        print(item)
 
-def create_indices(db):
+global DB
+
+@click.group()
+def cli():
+    pass
+
+@cli.command()
+def dropdb():
+    myclient = pymongo.MongoClient(CONNECTION)
+    db=myclient[DBNAME]
     linkcol = db["links"]
     linkcol.create_index({"url":1},{"name":"url"})
     linkcol.create_index({"host":1,"status":1},{"name":"hostname_status"})
@@ -263,17 +290,14 @@ def create_indices(db):
     htmlcol = db["html"]
     htmlcol.create_index({"url":1})
 
-@click.group()
-def cli():
-    pass
-
-@click.command()
+@cli.command()
 @click.argument("start_link")
-def simple_visit(start_link):
+def visit(start_link):
+    myclient = pymongo.MongoClient(CONNECTION)
+    db=myclient[DBNAME]
     start_link,domain = courlan.check_url(start_link)
-    myclient = pymongo.MongoClient("mongodb://root:example@localhost:27017/")
-    db=myclient["crawler"]
     rules = fetch_robot(domain)
+    print(rules)
     batch_size = BATCHSIZE
     navigation_links = get_links(db,domain,"navigation",batch_size)
     if start_link is not None:
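
A minimal usage sketch (not part of the patch) for the environment-driven configuration introduced above. It assumes mongocwarler.py is run with the SUCKER_* variables already present in the environment; all values shown are the defaults taken from the patch and are illustrative only:

    # Set the SUCKER_* variables before the module is imported, since the
    # module-level constants are resolved once at import time via os.getenv.
    import os

    os.environ["SUCKER_CONNECTION"] = "mongodb://root:example@localhost:27017/"
    os.environ["SUCKER_DBNAME"] = "crawler"
    os.environ["SUCKER_LANGUAGE"] = "sk"
    os.environ["SUCKER_DOMAIN"] = "sk"
    # Environment values are always strings; the integer default 10 is used
    # only when SUCKER_BATCHSIZE is unset.
    os.environ["SUCKER_BATCHSIZE"] = "10"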