commit 0accadf633
parent a05a3372af
@@ -4,6 +4,7 @@ import trafilatura.feeds
 import trafilatura.sitemaps
 import trafilatura.spider
 import trafilatura.utils
+import trafilatura.external
 import sys
 import courlan
 import urllib
@@ -11,6 +12,7 @@ from datetime import datetime
 import click
 import logging as LOGGER
 import os
+import pprint
 
 LANGUAGE= os.getenv("SUCKER_LANGUAGE","sk")
 DOMAIN = os.getenv("SUCKER_DOMAIN","sk")
@@ -157,10 +159,13 @@ def extract_pages(link_batch,responses):
         assert original_link is not None
         if html is not None:
             doc = trafilatura.bare_extraction(html,url=final_link,with_metadata=True,include_formatting=True,target_language=LANGUAGE)
+            print("html2doc")
+            print(text)
             if doc is not None:
                 if not "text" in doc or len(doc["text"]) < MINTEXTSIZE:
+                    # text too small
                     doc = None
 
         out.append((original_link,final_link,html,doc))
     return out
 
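
For reference, the gate in this hunk keeps only extractions whose main text reaches MINTEXTSIZE characters. A minimal standalone sketch of that check, assuming a MINTEXTSIZE value and sample HTML (both placeholders, not taken from the commit):

    import trafilatura

    MINTEXTSIZE = 200  # assumption: the script configures the real value elsewhere

    html = "<html><body><p>Too short to keep.</p></body></html>"
    # bare_extraction returns a dict with a "text" key on success, None on failure
    doc = trafilatura.bare_extraction(html, url="https://example.sk/clanok", with_metadata=True)
    if doc is not None:
        if "text" not in doc or len(doc["text"]) < MINTEXTSIZE:
            doc = None  # text too small, drop the document
    print(doc)
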
@@ -188,7 +193,7 @@ def index_pages(db,domain,extracted_pages):
             doc["paragraph_sizes"] = sizes
             goodsz = sum(sizes)
             if len(text) < 200 or goodsz/len(text) < 0.4:
-                state = "trash"
+                stat = "trash"
         if state == "good":
             htdoc = get_link_doc(link,state)
             htdoc["html"] = html
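
The changed line sits inside a quality heuristic: a page is trashed when its text is under 200 characters or when less than 40% of it lies in the paragraphs counted into sizes. A standalone sketch of that ratio, assuming paragraphs split on blank lines and a per-paragraph minimum (both assumptions about the surrounding code):

    text = "Short.\n\nA much longer paragraph carrying the actual article content of the page."
    MIN_PARAGRAPH_SIZE = 40  # assumption

    sizes = [len(p) for p in text.split("\n\n") if len(p) >= MIN_PARAGRAPH_SIZE]
    goodsz = sum(sizes)
    state = "good"
    if len(text) < 200 or goodsz / len(text) < 0.4:
        state = "trash"  # this sample trips the length check, so it prints "trash"
    print(state, sizes, goodsz, len(text))
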
@@ -306,6 +311,20 @@ def createdb():
     htmlcol = db["html"]
     htmlcol.create_index("url",unique=True)
 
+@cli.command()
+@click.argument("start_link")
+def parseurl(start_link):
+    link,domain = courlan.check_url(start_link)
+    links = [link]
+    responses = fetch_pages(links)
+    #pprint.pprint(responses)
+    extracted_pages = extract_pages(links,responses)
+    for ol,bl,html,doc in extracted_pages:
+        pprint.pprint(doc)
+    extracted_links = extract_links(links,responses,domain,None,"backlink")
+    pprint.pprint(extracted_links)
+
+
 @cli.command()
 @click.argument("start_link")
 def visit(start_link):
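
The new parseurl subcommand is a one-shot debug path: it normalizes a single URL, fetches it, pretty-prints the extracted document, and then the extracted outgoing links. It leans on courlan.check_url for the normalization; a small sketch of that call (the sample URL is a placeholder), noting that it returns None for invalid input, which the direct tuple unpacking in parseurl assumes never happens:

    import courlan

    # check_url returns a (normalized_url, domain) tuple, or None for an invalid URL
    result = courlan.check_url("https://example.sk/clanok")
    if result is not None:
        link, domain = result
        print(link, domain)

Assuming the script keeps the usual click entry point, the command would be invoked as `python <script>.py parseurl <start_link>`.
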