zz
This commit is contained in:
parent
3993a61899
commit
44fbf6b755
@ -9,10 +9,14 @@ import courlan
|
||||
import urllib
|
||||
from datetime import datetime
|
||||
import click
|
||||
import logging as LOGGER
|
||||
import os
|
||||
|
||||
# Crawler configuration. Each SUCKER_* environment variable, when set,
# overrides the default given here.
LANGUAGE = os.getenv("SUCKER_LANGUAGE", "sk")
DOMAIN = os.getenv("SUCKER_DOMAIN", "sk")
# os.getenv returns a string whenever the variable is set, so coerce the
# value to int to keep BATCHSIZE the same type in both cases.
BATCHSIZE = int(os.getenv("SUCKER_BATCHSIZE", "10"))
# NOTE(review): the fallback URI embeds a user name and password; confirm it
# is only meant for a local development database.
CONNECTION = os.getenv("SUCKER_CONNECTION", "mongodb://root:example@localhost:27017/")
DBNAME = os.getenv("SUCKER_DBNAME", "crawler")
# Responses whose body is shorter than MINFILESIZE or longer than
# MAXFILESIZE are rejected by the fetch code.
MINFILESIZE = 300
MAXFILESIZE = 10000000
MINTEXTSIZE = 200
|
||||
@ -48,7 +52,7 @@ def calculate_checksums(text):
|
||||
|
||||
def is_robot_good(link, rules):
    """Return whether the robots.txt rules allow fetching *link*.

    Args:
        link: URL to check.
        rules: Object exposing ``can_fetch(useragent, url)`` (for example
            ``urllib.robotparser.RobotFileParser``), or ``None``.

    Returns:
        ``True`` when *rules* is ``None`` or permits the ``"*"`` user agent
        to fetch *link*; ``False`` otherwise.
    """
    # No rules available: nothing forbids the fetch.
    if rules is None:
        return True
    # Check the ``link`` parameter itself; ``llink`` is not defined in this
    # function and would raise NameError.
    return bool(rules.can_fetch("*", link))
|
||||
|
||||
@ -61,10 +65,10 @@ def is_link_good(link):
|
||||
print(llink,ldomain)
|
||||
# domain rules
|
||||
if not ldomain.endswith(DOMAIN):
|
||||
print("bad domain")
|
||||
LOGGER.debug("bad domain")
|
||||
return None
|
||||
if courlan.is_not_crawlable(llink):
|
||||
print("not crawlable")
|
||||
LOGGER.debug("not crawlable")
|
||||
return None
|
||||
return llink
|
||||
|
||||
@ -112,14 +116,14 @@ def fetch_pages(link_batch):
|
||||
good = True
|
||||
if response.status != 200:
|
||||
good = False
|
||||
#LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
|
||||
LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
|
||||
elif response.data is None or len(response.data) < MINFILESIZE:
|
||||
#LOGGER.error('too small/incorrect for URL %s', url)
|
||||
LOGGER.error('too small/incorrect for URL %s', url)
|
||||
good = False
|
||||
# raise error instead?
|
||||
elif len(response.data) > MAXFILESIZE:
|
||||
good = False
|
||||
#LOGGER.error('too large: length %s for URL %s', len(response.data), url)
|
||||
LOGGER.error('too large: length %s for URL %s', len(response.data), url)
|
||||
if good:
|
||||
html = trafilatura.utils.decode_response(response)
|
||||
final_link = response.url
|
||||
@ -133,12 +137,12 @@ def fetch_pages(link_batch):
|
||||
|
||||
def fetch_robot(base_url):
    """Download and parse the robots.txt file of a site.

    Args:
        base_url: Host name of the site (``https://`` is prepended), or a
            URL that already carries a scheme, which is then used as given.

    Returns:
        A ``urllib.robotparser.RobotFileParser`` that has read
        ``<base_url>/robots.txt``, or ``None`` when reading raised an
        exception (the error is logged).
    """
    # ``import urllib`` alone does not load the ``robotparser`` submodule.
    import urllib.robotparser

    # Keep an explicit scheme if the caller supplied one; otherwise assume
    # https. Strip a trailing slash so the path does not become
    # "//robots.txt".
    root = base_url if "://" in base_url else "https://" + base_url
    rules = urllib.robotparser.RobotFileParser()
    rules.set_url(root.rstrip("/") + '/robots.txt')
    try:
        rules.read()
    except Exception as exc:
        # Best effort: any failure while reading means "no rules known".
        LOGGER.error('cannot read robots.txt: %s', exc)
        return None
    return rules
|
||||
|
||||
@ -166,25 +170,31 @@ def index_pages(db,domain,extracted_pages):
|
||||
links = []
|
||||
for original_link,final_link,html,doc in extracted_pages:
|
||||
state = "good"
|
||||
link = original_link
|
||||
if original_link != final_link:
|
||||
linkcol.insert_one(get_link_doc(original_link,"redirect"))
|
||||
link = final_link
|
||||
if html is None:
|
||||
state = "html_error"
|
||||
elif doc is None:
|
||||
state = "content_error"
|
||||
if original_link != final_link:
|
||||
linkcol.insert_one(get_link_doc(final_link,state))
|
||||
state = "redirect"
|
||||
linkcol.update_one({"url":original_link},{"$set":{"status":state}})
|
||||
if doc is not None:
|
||||
if html is not None:
|
||||
htmlcol.insert_one({"url":final_link,"html":html,"html_size":len(html),"created_at":datetime.utcnow()})
|
||||
checksums,sizes = calculate_checksums(doc["text"])
|
||||
doc["created_at"] = datetime.utcnow()
|
||||
doc["text_size"] = len(doc["text"])
|
||||
doc["paragraph_checksums"] = checksums
|
||||
doc["paragraph_sizes"] = sizes
|
||||
if len(checksums) < 1:
|
||||
state = "trash"
|
||||
if state == "good":
|
||||
htdoc = get_link_doc(link,state)
|
||||
htdoc["html"] = html
|
||||
htdoc["html_size"] = len(html)
|
||||
htmlcol.insert_one(htdoc)
|
||||
doc.update(get_link_doc(link,"good"))
|
||||
# todo extract links
|
||||
print(doc)
|
||||
contentcol.insert_one(doc)
|
||||
linkcol.update_one({"url":original_link},{"$set":{"status":state}})
|
||||
|
||||
|
||||
def extract_links(link_batch,responses,domain,rules,default_status="frontlink"):
|
||||
@ -245,14 +255,31 @@ def link_summary(db,domain):
|
||||
linkcol = db["links"]
|
||||
#res = linkcol.distinct("domain",{"hostname":domain})
|
||||
|
||||
# count links
|
||||
res = linkcol.aggregate([
|
||||
{"$match":{"host":domain}},
|
||||
{"$group":{"_id":"$status","count":{"$sum":1}}},
|
||||
])
|
||||
for item in res:
|
||||
print(item)
|
||||
contentcol = db["content"]
|
||||
res = contentcol.aggregate([
|
||||
{"$match":{"hostname":domain}},
|
||||
{"$group":{"_id":None,"text_size_sum":{"$sum":"text_size"}}},
|
||||
])
|
||||
for item in res:
|
||||
print(item)
|
||||
|
||||
def create_indices(db):
|
||||
global DB
|
||||
|
||||
@click.group()
|
||||
def cli():
|
||||
pass
|
||||
|
||||
@cli.command()
|
||||
def dropdb():
|
||||
myclient = pymongo.MongoClient(CONNECTION)
|
||||
db=myclient[DBNAME]
|
||||
linkcol = db["links"]
|
||||
linkcol.create_index({"url":1},{"name":"url"})
|
||||
linkcol.create_index({"host":1,"status":1},{"name":"hostname_status"})
|
||||
@ -263,17 +290,14 @@ def create_indices(db):
|
||||
htmlcol = db["html"]
|
||||
htmlcol.create_index({"url":1})
|
||||
|
||||
@click.group()
|
||||
def cli():
|
||||
pass
|
||||
|
||||
@click.command()
|
||||
@cli.command()
|
||||
@click.argument("start_link")
|
||||
def simple_visit(start_link):
|
||||
def visit(start_link):
|
||||
myclient = pymongo.MongoClient(CONNECTION)
|
||||
db=myclient[DBNAME]
|
||||
start_link,domain = courlan.check_url(start_link)
|
||||
myclient = pymongo.MongoClient("mongodb://root:example@localhost:27017/")
|
||||
db=myclient["crawler"]
|
||||
rules = fetch_robot(domain)
|
||||
print(rules)
|
||||
batch_size = BATCHSIZE
|
||||
navigation_links = get_links(db,domain,"navigation",batch_size)
|
||||
if start_link is not None:
|
||||
|
Loading…
Reference in New Issue
Block a user