Compare commits
2 Commits
123191b0f4
...
75840f6d21
Author | SHA1 | Date | |
---|---|---|---|
75840f6d21 | |||
0d477b1ab3 |
@ -93,15 +93,6 @@ def get_link_doc(link,status="frontlink"):
|
|||||||
domain = courlan.extract_domain(link)
|
domain = courlan.extract_domain(link)
|
||||||
return {"url":link,"host":host,"domain":domain,"status":status,"created_at":datetime.utcnow()}
|
return {"url":link,"host":host,"domain":domain,"status":status,"created_at":datetime.utcnow()}
|
||||||
|
|
||||||
def generic_visit(hostname):
    """Discover links to visit for ``hostname``.

    Tries three discovery strategies in order and falls through to the next
    one whenever the previous one produced nothing: feed discovery, sitemap
    search, then a focused crawl seeded with the links already visited.

    Args:
        hostname: Site to discover links for; passed unchanged to the
            trafilatura discovery helpers.
    """
    known_links = set(get_visited_links(hostname))
    visit_links = trafilatura.find_feed_urls(hostname)
    # Use truthiness rather than ``is None`` so that an empty result also
    # triggers the fallback (``not`` still covers a ``None`` result).
    if not visit_links:
        visit_links = trafilatura.sitemap_search(hostname)
    if not visit_links:
        # The original passed the undefined name ``dommain`` here, which
        # raised NameError as soon as this branch was reached.
        visit_links = trafilatura.focused_crawler(hostname, known_links=known_links)
    # NOTE(review): the discovered links are neither returned nor stored, so
    # callers currently get None -- confirm what this function should hand
    # back before relying on it.
|
|
||||||
|
|
||||||
|
|
||||||
def fetch_pages(link_batch):
|
def fetch_pages(link_batch):
|
||||||
htmls = []
|
htmls = []
|
||||||
@ -119,12 +110,12 @@ def fetch_pages(link_batch):
|
|||||||
good = False
|
good = False
|
||||||
LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
|
LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
|
||||||
elif response.data is None or len(response.data) < MINFILESIZE:
|
elif response.data is None or len(response.data) < MINFILESIZE:
|
||||||
LOGGER.error('too small/incorrect for URL %s', url)
|
LOGGER.error('too small/incorrect for URL %s', link)
|
||||||
good = False
|
good = False
|
||||||
# raise error instead?
|
# raise error instead?
|
||||||
elif len(response.data) > MAXFILESIZE:
|
elif len(response.data) > MAXFILESIZE:
|
||||||
good = False
|
good = False
|
||||||
LOGGER.error('too large: length %s for URL %s', len(response.data), url)
|
LOGGER.error('too large: length %s for URL %s', len(response.data), link)
|
||||||
if good:
|
if good:
|
||||||
html = trafilatura.utils.decode_response(response)
|
html = trafilatura.utils.decode_response(response)
|
||||||
final_link = response.url
|
final_link = response.url
|
||||||
@ -247,7 +238,6 @@ def index_links(db,extracted_links):
|
|||||||
except pymongo.errors.DuplicateKeyError as ex:
|
except pymongo.errors.DuplicateKeyError as ex:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
def get_links(db,hostname,status,batch_size=BATCHSIZE):
|
def get_links(db,hostname,status,batch_size=BATCHSIZE):
|
||||||
linkcol = db["links"]
|
linkcol = db["links"]
|
||||||
res = linkcol.find({"status":status,"host":hostname},{"url":1},limit=batch_size)
|
res = linkcol.find({"status":status,"host":hostname},{"url":1},limit=batch_size)
|
||||||
@ -310,6 +300,29 @@ def createdb():
|
|||||||
htmlcol = db["html"]
|
htmlcol = db["html"]
|
||||||
htmlcol.create_index("url",unique=True)
|
htmlcol.create_index("url",unique=True)
|
||||||
|
|
||||||
|
# NOTE(review): another ``@cli.command()`` function named ``parseurl`` (taking
# ``start_link``) is defined further down in this file -- confirm which of the
# two is meant to be registered under that command name.
@cli.command()
@click.argument("link")
def parseurl(link):
    """Download LINK, run trafilatura extraction on it and pretty-print the result."""
    import pprint

    html = trafilatura.fetch_url(link, decode=True)
    if html is None:
        # A failed download would otherwise be handed straight to the
        # extractor; report it as a regular CLI error instead.
        raise click.ClickException("could not download %s" % link)
    doc = trafilatura.bare_extraction(html)
    pprint.pprint(doc)
|
||||||
|
|
||||||
|
@cli.command()
@click.argument("link")
def externaldomains(link):
    """Print the domains of all external links found on the page at LINK."""
    html = trafilatura.fetch_url(link, decode=True)
    if html is None:
        # Nothing to extract links from; fail with a clear CLI error.
        raise click.ClickException("could not download %s" % link)
    external_links = courlan.extract_links(html, link, external_bool=True, language=LANGUAGE)
    domains = set()
    for candidate in external_links:
        checked = courlan.check_url(candidate)
        if checked is None:
            # The original used ``pass`` here and then tried to unpack None,
            # raising TypeError on the first rejected URL.
            continue
        # Only the domain is needed; do not rebind the ``link`` argument.
        _, domain = checked
        domains.add(domain)
    # Set iteration order is arbitrary, so sort for reproducible output.
    for domain in sorted(domains):
        print(domain)
|
||||||
|
|
||||||
@cli.command()
|
@cli.command()
|
||||||
@click.argument("start_link")
|
@click.argument("start_link")
|
||||||
def parseurl(start_link):
|
def parseurl(start_link):
|
||||||
|
Loading…
Reference in New Issue
Block a user