import pymongo
import trafilatura
import trafilatura.feeds
import trafilatura.sitemaps
import trafilatura.spider
import trafilatura.utils
import sys
import courlan
import urllib.robotparser

LANGUAGE = "sk"
DOMAIN = "sk"
BATCHSIZE = 10
MINFILESIZE = 300
MAXFILESIZE = 1000000

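# Overview: the crawler keeps its state in MongoDB ("links", "html" and
# "content" collections), fetches and extracts pages with trafilatura,
# filters URLs with courlan, and stores per-paragraph checksums with every
# extracted document.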
def calculate_checksums(text):
    """
    @return checksums of the paragraphs in text and their sizes. Paragraphs are separated by a blank line.
    """
    checksums = []
    sizes = []
    hval = 0
    hsz = 0
    sz = 0
    for c in text:
        cv = ord(c)
        sz += 1
        if cv > 64:
            # simple rolling hash over non-whitespace, non-punctuation characters
            hval += (hval << 3) + cv
            zv = hval >> 31
            hval &= 0x7fffffff
            hval += zv
            hsz += 1
        if c == "\n" and hsz > 0:
            if hsz > 100:
                # only paragraphs with more than 100 hashed characters are fingerprinted
                checksums.append(hval)
                sizes.append(sz)
            sz = 0
            hsz = 0
    if hsz > 100:
        checksums.append(hval)
        sizes.append(sz)
    return checksums, sizes

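# URL filtering helpers: robots.txt rules, the DOMAIN suffix and courlan's
# crawlability check decide which links are kept.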
def is_robot_good(link, rules):
    # check robots.txt rules; no rules means everything is allowed
    if rules is not None and not rules.can_fetch("*", link):
        return False
    return True

def is_link_good(link):
    r = courlan.check_url(link, strict=True, language=LANGUAGE)
    if r is None:
        print(link)
        return None
    llink, ldomain = r
    print(llink, ldomain)
    # domain rules
    if not ldomain.endswith(DOMAIN):
        print("bad domain")
        return None
    if courlan.is_not_crawlable(llink):
        print("not crawlable")
        return None
    return llink

def filter_links(links, rules=None):
    out = set()
    for link in links:
        r = is_link_good(link)
        if r is None:
            continue
        # check robots.txt rules
        if rules is not None and not rules.can_fetch("*", r):
            continue
        out.add(r)
    return out

def get_link_doc(link, status="frontlink"):
    r = courlan.check_url(link)
    assert r is not None
    link, host = r
    domain = courlan.extract_domain(link)
    return {"url": link, "host": host, "domain": domain, "status": status}

def generic_visit(domain):
    known_links = set(get_visited_links(domain))
    visit_links = []
    visit_links = trafilatura.feeds.find_feed_urls(domain)
    if visit_links is None:
        visit_links = trafilatura.sitemaps.sitemap_search(domain)
    if visit_links is None:
        visit_links = trafilatura.spider.focused_crawler(domain, known_links=known_links)

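# Download a batch of URLs; a response is kept only if it is an HTTP 200,
# within the configured size limits, and survives meta-refresh detection.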
def fetch_pages(link_batch):
    htmls = []
    #print(link_batch)
    for link in link_batch:
        print("fetching:::::")
        print(link)
        final_link = link
        response = trafilatura.fetch_url(link, decode=False)
        html = None
        if response is not None:
            good = True
            if response.status != 200:
                good = False
                #LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
            elif response.data is None or len(response.data) < MINFILESIZE:
                #LOGGER.error('too small/incorrect for URL %s', url)
                good = False
            # raise error instead?
            elif len(response.data) > MAXFILESIZE:
                #LOGGER.error('too large: length %s for URL %s', len(response.data), url)
                good = False
            if good:
                html = trafilatura.utils.decode_response(response)
                final_link = response.url
            if html is not None:
                # is there a meta-refresh on the page?
                html, final_link = trafilatura.spider.refresh_detection(html, final_link)
                if final_link is None:  # malformed or malicious content
                    html = None
        htmls.append((final_link, html))
    return htmls

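# robots.txt handling: if the file cannot be read, rules stays None and
# is_robot_good treats that as "everything allowed".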
def fetch_robot(base_url):
    rules = urllib.robotparser.RobotFileParser()
    # base_url is a bare domain here, so a scheme has to be added (https assumed)
    rules.set_url("https://" + base_url + "/robots.txt")
    # exceptions happening here
    try:
        rules.read()
    except Exception as exc:
        #LOGGER.error('cannot read robots.txt: %s', exc)
        rules = None
    return rules

def extract_pages(link_batch, responses):
    out = []
    for original_link, (final_link, html) in zip(link_batch, responses):
        doc = None
        assert original_link is not None
        if html is not None:
            doc = trafilatura.bare_extraction(html, url=final_link, with_metadata=True, include_formatting=True, target_language=LANGUAGE)
        out.append((original_link, final_link, html, doc))
    return out

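# Persist one batch: each original link gets a status ("good", "html_error",
# "content_error" or "redirect"); raw HTML and extracted content go into
# their own collections.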
def index_pages(db, domain, extracted_pages):
    linkcol = db["links"]
    htmlcol = db["html"]
    contentcol = db["content"]
    for original_link, final_link, html, doc in extracted_pages:
        state = "good"
        if html is None:
            state = "html_error"
        elif doc is None:
            state = "content_error"
        if original_link != final_link:
            linkcol.insert_one(get_link_doc(final_link, state))
            state = "redirect"
        linkcol.update_one({"url": original_link}, {"$set": {"status": state}})
        if html is not None:
            htmlcol.insert_one({"url": final_link, "html": html})
        if doc is not None:
            checksums, sizes = calculate_checksums(doc["text"])
            doc["paragraph_checksums"] = checksums
            doc["paragraph_sizes"] = sizes
            # todo extract links
            print(doc)
            contentcol.insert_one(doc)

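# Collect outgoing links from the fetched pages: external links are queued as
# "frontlink", in-domain navigation pages as "navigation", everything else
# keeps the default status.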
def extract_links(link_batch, responses, domain, rules, default_status="frontlink"):
    links = {}
    for original_link, (final_link, html) in zip(link_batch, responses):
        if html is None:
            continue
        extracted_links = courlan.extract_links(html, final_link, False, language=LANGUAGE)
        #print(extracted_links)
        for link in extracted_links:
            status = default_status
            if courlan.is_external(link, domain):
                status = "frontlink"
            elif courlan.is_navigation_page(link):
                status = "navigation"
            #print(link,status)
            links[link] = status
    outlinks = []
    for link, status in links.items():
        if not is_robot_good(link, rules):
            continue
        link = is_link_good(link)
        if link is None:
            continue
        outlinks.append((link, status))
    return outlinks

def index_links(db, extracted_links):
    linkcol = db["links"]
    for link, status in extracted_links:
        doc = get_link_doc(link, status)
        linkcol.insert_one(doc)

def get_links(db, domain, status, batch_size=BATCHSIZE):
    linkcol = db["links"]
    res = linkcol.find({"status": status, "host": domain}, {"url": 1}, limit=batch_size)
    links = []
    for doc in res:
        print(">>>>>" + status)
        print(doc)
        links.append(doc["url"])
    return links

def process_links(db, domain, status, links=None, rules=None, batch_size=BATCHSIZE):
    # a mutable default argument would keep links between calls, so build the list here
    links = list(links) if links else []
    links += get_links(db, domain, status, batch_size)
    #print(links)
    responses = fetch_pages(links)
    #print(responses)
    extracted_pages = extract_pages(links, responses)
    #print(extracted_pages)
    extracted_links = extract_links(links, responses, domain, rules, status)
    print(extracted_links)
    index_links(db, extracted_links)
    index_pages(db, domain, extracted_pages)

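# One crawl pass: seed the "navigation" queue with the start URL, then work
# through "frontlink" and "backlink" batches stored in the database.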
def simple_visit(start_link):
    start_link, domain = courlan.check_url(start_link)
    myclient = pymongo.MongoClient("mongodb://root:example@localhost:27017/")
    db = myclient["crawler"]
    rules = fetch_robot(domain)
    navigation_links = [start_link]
    print(navigation_links)
    process_links(db, domain, "navigation", navigation_links, rules)
    process_links(db, domain, "frontlink", rules=rules)
    process_links(db, domain, "backlink", rules=rules)

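# Maintenance helpers: create_indices prepares the MongoDB indexes,
# link_summary prints per-status link counts for a domain.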
def create_indices(db):
    linkcol = db["links"]
    linkcol.create_index([("url", 1)], name="url")
    linkcol.create_index([("host", 1), ("status", 1)], name="host_status")
    contentcol = db["content"]
    contentcol.create_index([("url", 1)])
    contentcol.create_index([("paragraph_checksums", 1)])
    contentcol.create_index([("domain", 1)])
    htmlcol = db["html"]
    htmlcol.create_index([("url", 1)])

def link_summary(db, domain):
    linkcol = db["links"]
    res = linkcol.aggregate([
        {"$match": {"host": domain}},
        {"$group": {"_id": "$status", "count": {"$sum": 1}}},
    ])
    for item in res:
        print(item)

if __name__ == "__main__":
    simple_visit(sys.argv[1])
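# Usage (the script name is illustrative; the MongoDB connection string above
# must point at a running instance):
#   python crawler.py https://www.example.sk/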