wip
parent c333e080f0
commit 3d351d5d50
@@ -3,9 +3,12 @@ import trafilatura
 import trafilatura.feeds
 import trafilatura.sitemaps
 import trafilatura.spider
+import trafilatura.utils
 import sys
 import courlan
 
+LANGUAGE="sk"
+BATCHSIZE=10
 
 def calculate_checksums(text):
     """
@@ -36,7 +39,7 @@ def calculate_checksums(text):
         sizes.append(sz)
     return checksums, sizes
 
-def filter_links(links,language="sk"):
+def filter_links(links,language=LANGUAGE,rules=None):
     out = set()
     for link in links:
         r = courlan.check_url(link,strict=True,language=language)
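filter_links now takes its default language from the module-level LANGUAGE constant and accepts an optional robots.txt rules object (used a few lines further down). For context, courlan.check_url returns a (normalized URL, domain) pair when a link passes its filters and None otherwise; a quick illustration with made-up URLs:

    import courlan

    # Hypothetical URLs, just to show the shape of the return value.
    print(courlan.check_url("https://example.sk/clanok", strict=True, language="sk"))
    # -> ("https://example.sk/clanok", "example.sk"), or None if the URL is rejected
    print(courlan.check_url("mailto:niekto@example.sk", strict=True, language="sk"))
    # -> None (not a crawlable HTTP(S) URL)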
@@ -51,9 +54,21 @@ def filter_links(links,language="sk"):
         if courlan.is_not_crawlable(llink):
             print("not crawlable")
             continue
+        # check robots.txt rules
+        if rules is not None and not rules.can_fetch("*", llink):
+            continue
         out.add(llink)
     return out
 
+def sort_links(links,domain):
+    for llink in filtered_links:
+        doc = get_link_doc(link,"backlink")
+        if courlan.is_external(link,domain):
+            doc["status"]= "frontlink"
+        elif courlan.is_navigation(link):
+            doc["status"] = "navigation"
+        linkcol.insert_one(doc)
+
 def get_link_doc(link,status="frontlink"):
     r  = courlan.check_url(link)
     assert r is not None
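The robots.txt gate expects rules to be a standard-library urllib.robotparser.RobotFileParser (or None when robots.txt could not be read). A minimal, self-contained sketch of the same check, against a hypothetical site:

    import urllib.robotparser

    rules = urllib.robotparser.RobotFileParser()
    rules.set_url("https://example.sk/robots.txt")  # hypothetical site
    rules.read()  # fetch and parse robots.txt

    # The same test filter_links applies to every candidate link:
    if rules.can_fetch("*", "https://example.sk/private/page"):
        print("allowed for generic crawlers")
    else:
        print("disallowed by robots.txt")

The new sort_links helper mirrors the backlink/frontlink/navigation classification that index_pages performs further down; as committed it still refers to filtered_links, link and linkcol from that other scope rather than to its own parameters.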
@@ -78,14 +93,37 @@ def fetch_pages(link_batch):
     for link in link_batch:
         print("fetching:::::")
         print(link)
-        htmls.append(trafilatura.fetch_url(link))
+        htmls.append(trafilatura.fetch_url(link,decode=False))
     return htmls
 
 def fetch_front_links(navigation_links):
     start_link = navigation_links[0]
-    known_links = navigation_links[1:]
-    visit_links,known_links = trafilatura.spider.focused_crawler(start_link,known_links=known_links)
-    filtered_links = filter_links(visit_links)
+    known_links =navigation_links[1:]
+    rules = urllib.robotparser.RobotFileParser()
+    rules.set_url(base_url + '/robots.txt')
+    # exceptions happening here
+    try:
+        rules.read()
+    except Exception as exc:
+        LOGGER.error('cannot read robots.txt: %s', exc)
+        rules = None
+    response = trafilatura.fetch_url(homepage, decode=False)
+    if response is None or response == '':
+        return None, None, None
+    # get redirected URL here?
+    if response.url != homepage:
+        logging.info('followed redirect: %s', response.url)
+        homepage = response.url
+    # decode response
+    htmlstring = trafilatura.utils.decode_response(response.data)
+    # is there a meta-refresh on the page?
+    htmlstring, homepage = trafilatura.spider.refresh_detection(htmlstring, homepage)
+    if homepage is None:  # malformed or malicious content
+        return None, None, None
+    visit_links = courlan.extract_links(htmlstring)
+    visit_links,known_links = trafilatura.spider.focused_crawler(start_link,lang=LANGUAGE)
+    print(visit_links,known_links)
+    filtered_links = filter_links(visit_links,LANGUAGE,rules=rules)
     return filtered_links
 
 def extract_pages(link_batch,responses):
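Both fetch_pages and the reworked fetch_front_links now request the raw, undecoded response from trafilatura.fetch_url (decode=False) and decode it separately with trafilatura.utils.decode_response, which is what makes the redirect check on response.url possible. A minimal sketch of that fetch-then-decode pattern, using the same calls as the hunk above (the URL is hypothetical, and the helpers assume a trafilatura version that still exposes them):

    import trafilatura
    import trafilatura.utils

    url = "https://example.sk/"  # hypothetical seed page
    response = trafilatura.fetch_url(url, decode=False)  # raw response object
    if response is not None and response != '':
        if response.url != url:
            print("followed redirect:", response.url)
        htmlstring = trafilatura.utils.decode_response(response.data)
        print(len(htmlstring), "characters of HTML")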
@@ -93,39 +131,54 @@ def extract_pages(link_batch,responses):
     for link,response in zip(link_batch,responses):
         doc = None
         assert link is not None
-        html = trafilatura.util.decode_response(response.data)
+        html = None
+        response_link = None
+        if response is not None:
+            # is reponse link good?
+            # filter and normalize
+            rl = list(filter_links([response.url]))
+            if len(rl) == 1:
+                response_link = rl[0]
+                html = trafilatura.utils.decode_response(response)
         if html is not None:
-            doc = trafilatura.bare_extraction(html,url=link,with_metadata=True,include_formatting=True,target_language="sk")
-        out.append((link,response.url,html,doc))
+            doc = trafilatura.bare_extraction(html,url=link,with_metadata=True,include_formatting=True,target_language=LANGUAGE)
+        out.append((link,response_link,html,doc))
     return out
 
 
 
-def index_pages(db,domain,extracted_pages):
+def index_pages(db,domain,extracted_pages,rules):
     extracted_links = set()
     linkcol = db["links"]
     htmlcol = db["html"]
     contentcol = db["content"]
-    for link,html,doc in extracted_pages:
+    for original_link,final_link,html,doc in extracted_pages:
         state = "good"
         if html is None:
             state = "html_error"
         elif doc is None:
             state = "content_error"
         if html is not None:
-            htmlcol.insert_one({"url":link,"html":html})
+            htmlcol.insert_one({"url":final_link,"html":html})
         if doc is not None:
-            print(doc)
             checksums,sizes = calculate_checksums(doc["text"])
             doc["paragraph_checksums"] = checksums
             doc["paragraph_sizes"] = sizes
+            # todo extract links
             if "links" in doc:
                 extracted_links.union(doc["links"])
                 del doc["links"]
+            print(doc)
             contentcol.insert_one(doc)
-        doc = get_link_doc(link,state)
-        linkcol.replace_one({"url":link},doc,upsert=True)
-        filtered_links = filter_links(extracted_links)
+            # todo extract links
+        doc = get_link_doc(original_link,state)
+        linkcol.replace_one({"url":final_link},doc,upsert=True)
+        if original_link != final_link:
+            rl = list(filter_links([final_link],rules))
+            if len(rl) == 1:
+                doc = get_link_doc(rl[0],"redirect")
+                linkcol.replace_one({"url":rl[0]},doc,upsert=True)
+        filtered_links = filter_links(extracted_links,rules)
         for llink in filtered_links:
             doc = get_link_doc(link,"backlink")
             if courlan.is_external(link,domain):
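extract_pages now returns (original_link, final_link, html, doc) tuples, and index_pages stores the crawl state under the original URL while recording a separate "redirect" entry when the final URL differs. The replace_one(..., upsert=True) calls are pymongo's insert-or-replace idiom; a small self-contained illustration (document values hypothetical):

    import pymongo

    client = pymongo.MongoClient("mongodb://root:example@localhost:27017/")
    linkcol = client["crawler"]["links"]

    doc = {"url": "https://example.sk/clanok", "status": "good", "domain": "example.sk"}
    # Insert the document if no record with this URL exists yet,
    # otherwise replace the existing record in place.
    linkcol.replace_one({"url": doc["url"]}, doc, upsert=True)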
@@ -134,11 +187,13 @@ def index_pages(db,domain,extracted_pages):
                 doc["status"] = "navigation"
             linkcol.insert_one(doc)
 
-def get_links(db,domain,status,batch_size=100):
+def get_links(db,domain,status,batch_size=BATCHSIZE):
     linkcol = db["links"]
-    res  = linkcol.find({"status":status,"hostname":domain},limit=batch_size)
+    res  = linkcol.find({"status":status,"domain":domain},{"url":1},limit=batch_size)
+    print(res,domain,status)
     front_links = []
     for doc in res:
+        print(doc)
         front_links.append(doc["url"])
     return filter_links(front_links)
 
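get_links now filters on the domain field, projects only the url field, and caps the batch at BATCHSIZE. The second positional argument to pymongo's find() is the projection; a short illustration (collection contents hypothetical, continuing the setup from the previous sketch):

    cursor = linkcol.find(
        {"status": "frontlink", "domain": "example.sk"},  # filter
        {"url": 1},                                       # projection: url (plus _id) only
        limit=10,
    )
    print([doc["url"] for doc in cursor])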
@@ -162,12 +217,13 @@ def simple_visit(start_link):
     myclient = pymongo.MongoClient("mongodb://root:example@localhost:27017/")
     db=myclient["crawler"]
     navigation_links =[start_link]
-    navigation_links += get_links(db,"navigation",domain)
-    new_front_links = fetch_front_links(navigation_links)
+    navigation_links += get_links(db,domain,"navigation")
+    print(navigation_links)
+    #new_front_links = fetch_front_links(navigation_links)
     print("NEW FRONT LINKS")
-    print(new_front_links)
-    index_front_links(db,new_front_links)
-    front_links = get_links(db,"frontlink",domain)
+    #print(new_front_links)
+    #index_front_links(db,new_front_links)
+    front_links = get_links(db,domain,"frontlink")
     print("NEW VISIT LINKS")
     visit_links = front_links
     print(visit_links)