commit 44fbf6b755 ("zz")
parent 3993a61899
@@ -9,10 +9,14 @@ import courlan
 import urllib
 from datetime import datetime
 import click
+import logging as LOGGER
+import os
 
-LANGUAGE="sk"
-DOMAIN = "sk"
-BATCHSIZE=10
+LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
+DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
+BATCHSIZE=os.getenv("SUCKER_BATCHSIZE",10)
+CONNECTION=os.getenv("SUCKER_CONNECTION","mongodb://root:example@localhost:27017/")
+DBNAME=os.getenv("SUCKER_DBNAME","crawler")
 MINFILESIZE=300
 MAXFILESIZE=10000000
 MINTEXTSIZE=200
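
Note on the new environment-driven settings: os.getenv always returns a string when the variable is set, so a numeric setting like BATCHSIZE may arrive as text. A minimal sketch of the conversion (using the SUCKER_* names from this commit):

import os

# Sketch only: convert numeric env settings explicitly, otherwise BATCHSIZE
# can silently become a str when SUCKER_BATCHSIZE is defined.
BATCHSIZE = int(os.getenv("SUCKER_BATCHSIZE", "10"))
CONNECTION = os.getenv("SUCKER_CONNECTION", "mongodb://root:example@localhost:27017/")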
@@ -48,7 +52,7 @@ def calculate_checksums(text):
 
 
 def is_robot_good(link,rules):
     # check robots.txt rules
-    if rules is not None and not rules.can_fetch("*", llink):
+    if rules is not None and not rules.can_fetch("*", link):
         return False
     return True
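
The fix replaces the undefined name llink with the function's link parameter. For reference, a minimal sketch of the standard-library API behind this check (example.com is a placeholder):

import urllib.robotparser

# RobotFileParser fetches and parses robots.txt; can_fetch answers per agent/URL.
rules = urllib.robotparser.RobotFileParser()
rules.set_url("https://example.com/robots.txt")
rules.read()                                                  # downloads and parses robots.txt
print(rules.can_fetch("*", "https://example.com/some/page"))  # True unless disallowed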
@@ -61,10 +65,10 @@ def is_link_good(link):
     print(llink,ldomain)
     # domain rules
     if not ldomain.endswith(DOMAIN):
-        print("bad domain")
+        LOGGER.debug("bad domain")
         return None
     if courlan.is_not_crawlable(llink):
-        print("not crawlable")
+        LOGGER.debug("not crawlable")
         return None
     return llink
 
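
Since the commit imports the logging module itself under the alias LOGGER, LOGGER.debug() goes through the root logger, whose default level is WARNING. A minimal sketch of making these debug messages visible:

import logging as LOGGER

# Sketch only: lower the root logger's level, otherwise the debug calls added
# in is_link_good produce no output.
LOGGER.basicConfig(level=LOGGER.DEBUG)
LOGGER.debug("bad domain")  # now visible on stderr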
@@ -112,14 +116,14 @@ def fetch_pages(link_batch):
             good = True
             if response.status != 200:
                 good = False
-                #LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
+                LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
             elif response.data is None or len(response.data) < MINFILESIZE:
-                #LOGGER.error('too small/incorrect for URL %s', url)
+                LOGGER.error('too small/incorrect for URL %s', url)
                 good = False
             # raise error instead?
             elif len(response.data) > MAXFILESIZE:
                 good = False
-            #LOGGER.error('too large: length %s for URL %s', len(response.data), url)
+                LOGGER.error('too large: length %s for URL %s', len(response.data), url)
             if good:
                 html = trafilatura.utils.decode_response(response)
                 final_link = response.url
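
The hunk enables the previously commented-out error logs and moves the "too large" log inside its branch. A hypothetical helper (names are illustrative, not part of the commit) condensing the same gate:

def response_is_good(status, data, minsize=300, maxsize=10000000):
    # Keep a page only for a 200 response whose body size lies between
    # MINFILESIZE and MAXFILESIZE, mirroring the checks in fetch_pages.
    if status != 200:
        return False
    if data is None or len(data) < minsize:
        return False
    if len(data) > maxsize:
        return False
    return True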
@@ -133,12 +137,12 @@ def fetch_pages(link_batch):
 
 
 def fetch_robot(base_url):
     rules = urllib.robotparser.RobotFileParser()
-    rules.set_url(base_url + '/robots.txt')
+    rules.set_url("https://" + base_url + '/robots.txt')
     # exceptions happening here
     try:
         rules.read()
     except Exception as exc:
-        #LOGGER.error('cannot read robots.txt: %s', exc)
+        LOGGER.error('cannot read robots.txt: %s', exc)
         rules = None
     return rules
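
The "https://" prefix suggests base_url is a bare hostname (as returned alongside courlan.check_url). A sketch of calling the patched function under that assumption:

# example.com is a placeholder hostname.
rules = fetch_robot("example.com")         # reads https://example.com/robots.txt
if rules is None:
    print("robots.txt could not be read")  # fetch_robot returns None on any exception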
@@ -166,25 +170,31 @@ def index_pages(db,domain,extracted_pages):
     links = []
     for original_link,final_link,html,doc in extracted_pages:
         state = "good"
+        link = original_link
+        if original_link != final_link:
+            linkcol.insert_one(get_link_doc(original_link,"redirect"))
+            link = final_link
         if html is None:
             state = "html_error"
         elif doc is None:
             state = "content_error"
-        if original_link != final_link:
-            linkcol.insert_one(get_link_doc(final_link,state))
-            state = "redirect"
-        linkcol.update_one({"url":original_link},{"$set":{"status":state}})
         if doc is not None:
-            if html is not None:
-                htmlcol.insert_one({"url":final_link,"html":html,"html_size":len(html),"created_at":datetime.utcnow()})
             checksums,sizes = calculate_checksums(doc["text"])
-            doc["created_at"] = datetime.utcnow()
             doc["text_size"] = len(doc["text"])
             doc["paragraph_checksums"] = checksums
             doc["paragraph_sizes"] = sizes
+            if len(checksums) < 1:
+                state = "trash"
+        if state == "good":
+            htdoc = get_link_doc(link,state)
+            htdoc["html"] = html
+            htdoc["html_size"] = len(html)
+            htmlcol.insert_one(htdoc)
+            doc.update(get_link_doc(link,"good"))
             # todo extract links
             print(doc)
             contentcol.insert_one(doc)
+        linkcol.update_one({"url":original_link},{"$set":{"status":state}})
 
 
 def extract_links(link_batch,responses,domain,rules,default_status="frontlink"):
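
The reworked index_pages records a "redirect" document for the original URL, classifies the page, stores HTML and content only for "good" pages, and updates the link status last. A hypothetical condensation of the classification (status names taken from the diff; the helper itself is not part of the commit):

def classify_page(html, doc, checksums):
    # Order matches index_pages: fetch failure, extraction failure,
    # empty text, then good.
    if html is None:
        return "html_error"
    if doc is None:
        return "content_error"
    if len(checksums) < 1:
        return "trash"
    return "good"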
@@ -244,15 +254,32 @@ def process_links(db,domain,status,links=[],rules=None,batch_size=BATCHSIZE):
 def link_summary(db,domain):
     linkcol = db["links"]
     #res = linkcol.distinct("domain",{"hostname":domain})
 
+    # count links
     res = linkcol.aggregate([
         {"$match":{"host":domain}},
         {"$group":{"_id":"$status","count":{"$sum":1}}},
     ])
     for item in res:
         print(item)
+    contentcol = db["content"]
+    res = contentcol.aggregate([
+        {"$match":{"hostname":domain}},
+        {"$group":{"_id":None,"text_size_sum":{"$sum":"text_size"}}},
+    ])
+    for item in res:
+        print(item)
 
-def create_indices(db):
+global DB
+
+@click.group()
+def cli():
+    pass
+
+@cli.command()
+def dropdb():
+    myclient = pymongo.MongoClient(CONNECTION)
+    db=myclient[DBNAME]
     linkcol = db["links"]
     linkcol.create_index({"url":1},{"name":"url"})
     linkcol.create_index({"host":1,"status":1},{"name":"hostname_status"})
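
One note on the new content aggregation, assuming the intent is to sum the stored text_size field: in a MongoDB $group stage a bare string like "text_size" is a literal, so {"$sum":"text_size"} evaluates to 0; referencing the field requires a "$" prefix. A sketch with the same collection and variable names as the diff:

res = contentcol.aggregate([
    {"$match": {"hostname": domain}},
    {"$group": {"_id": None, "text_size_sum": {"$sum": "$text_size"}}},  # note the "$"
])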
@@ -263,17 +290,14 @@ def create_indices(db):
     htmlcol = db["html"]
     htmlcol.create_index({"url":1})
 
-@click.group()
-def cli():
-    pass
-
-@click.command()
+@cli.command()
 @click.argument("start_link")
-def simple_visit(start_link):
+def visit(start_link):
+    myclient = pymongo.MongoClient(CONNECTION)
+    db=myclient[DBNAME]
     start_link,domain = courlan.check_url(start_link)
-    myclient = pymongo.MongoClient("mongodb://root:example@localhost:27017/")
-    db=myclient["crawler"]
     rules = fetch_robot(domain)
+    print(rules)
     batch_size = BATCHSIZE
     navigation_links = get_links(db,domain,"navigation",batch_size)
     if start_link is not None:
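
With visit and dropdb registered on the cli group, the script becomes a small click application. A sketch of the entry point and invocation, assuming the module is named crawler.py and calls cli() when run directly (neither is shown in this diff):

if __name__ == "__main__":
    cli()

# Illustrative shell usage:
#   python crawler.py dropdb
#   SUCKER_CONNECTION=mongodb://root:example@localhost:27017/ python crawler.py visit https://example.sk/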