commit 0accadf633
parent a05a3372af

    zz
@@ -4,6 +4,7 @@ import trafilatura.feeds
 import trafilatura.sitemaps
 import trafilatura.spider
 import trafilatura.utils
+import trafilatura.external
 import sys
 import courlan
 import urllib
@@ -11,6 +12,7 @@ from datetime import datetime
 import click
 import logging as LOGGER
 import os
+import pprint
 
 LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
 DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
@@ -157,10 +159,13 @@ def extract_pages(link_batch,responses):
         assert original_link is not None
         if html is not None:
             doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=True,target_language=LANGUAGE)
+            print("html2doc")
+            print(text)
             if doc is not None:
                 if not "text" in doc or len(doc["text"]) < MINTEXTSIZE:
                     # text too small
                     doc = None
+
         out.append((original_link,final_link,html,doc))
     return out
 
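Note on the new debug prints: `print(text)` references a name that, as far as this hunk shows, is not defined anywhere in `extract_pages` (the extracted text lives in `doc["text"]`), so this line would likely raise a NameError on the first page processed. A minimal sketch of what the debug output presumably intends, assuming `doc` is the dict returned by `trafilatura.bare_extraction`:

    # hypothetical fix: print the extracted text rather than an undefined name
    print("html2doc")
    if doc is not None:
        print(doc.get("text", ""))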
@@ -188,7 +193,7 @@ def index_pages(db,domain,extracted_pages):
             doc["paragraph_sizes"] = sizes
             goodsz = sum(sizes)
             if len(text) < 200 or goodsz/len(text) < 0.4:
-                state = "trash"
+                stat = "trash"
         if state == "good":
             htdoc = get_link_doc(link,state)
             htdoc["html"] = html
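Note on this hunk: the quality threshold itself is unchanged. For scale, a page with 500 characters of text whose paragraph sizes sum to 150 gives goodsz/len(text) = 0.3, below the 0.4 cutoff, so it would be demoted. What does change is the assignment target, from `state` to `stat`: since the very next line still tests `state == "good"`, the demotion no longer takes effect, and `stat` appears to be written but never read, at least within the context shown here.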
@@ -306,6 +311,20 @@ def createdb():
     htmlcol = db["html"]
     htmlcol.create_index("url",unique=True)
 
+@cli.command()
+@click.argument("start_link")
+def parseurl(start_link):
+    link,domain = courlan.check_url(start_link)
+    links = [link]
+    responses = fetch_pages(links)
+    #pprint.pprint(responses)
+    extracted_pages = extract_pages(links,responses)
+    for ol,bl,html,doc in extracted_pages:
+        pprint.pprint(doc)
+    extracted_links = extract_links(links,responses,domain,None,"backlink")
+    pprint.pprint(extracted_links)
+
+
 @cli.command()
 @click.argument("start_link")
 def visit(start_link):
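The new `parseurl` command fetches a single URL, pretty-prints the extracted document for each response, and then pretty-prints the links gathered with the "backlink" status, which makes it handy for debugging extraction on one page without writing to the database. One caveat: `courlan.check_url` returns None for URLs it rejects, so the tuple unpacking `link,domain = ...` would raise a TypeError on invalid input. A hypothetical invocation (the actual script name is not shown in this diff; `sucker.py` is a guess based on the SUCKER_* environment variables):

    python sucker.py parseurl https://example.com/some-article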