zz
This commit is contained in:
parent 44dc4be8c3
commit 4a42078bef
@@ -47,11 +47,16 @@ def sampledomains():
 
 @cli.command(help="Enqueue a list of links into redis queue for crawling")
 def enqueue():
+    # TODO: select queues
     q = rq.Queue(connection=redis.from_url(REDIS_URL))
     for l in sys.stdin:
         print(l.strip())
         r = q.enqueue(mongocrawler.visit, l.strip())
         print(r)
 
+@cli.command()
+def importhtml():
+    mongocrawler.import_html()
+
 if __name__ == "__main__":
     cli()
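For reference, a minimal standalone sketch of the enqueue pattern this hunk relies on: read one URL per line from stdin and push an rq job onto a redis-backed queue. The REDIS_URL value and the visit() stub are assumptions standing in for the project's configuration and mongocrawler.visit; they are not part of the commit.

import sys

import redis
import rq

REDIS_URL = "redis://localhost:6379/0"  # assumed default; the CLI takes this from its config

def visit(url):
    # stand-in for mongocrawler.visit(url); a real worker needs an importable function,
    # which is why the commit enqueues mongocrawler.visit rather than a local helper
    print("visiting", url)

def enqueue_stdin():
    # one job per non-empty stdin line; rq stores the function reference and its argument
    q = rq.Queue(connection=redis.from_url(REDIS_URL))
    for line in sys.stdin:
        url = line.strip()
        if not url:
            continue
        job = q.enqueue(visit, url)
        print(job)

if __name__ == "__main__":
    enqueue_stdin()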
@@ -250,6 +250,7 @@ def index_page(db,original_link,final_link,html,doc):
     origsz = 0
     for chs,paragraph_size in zip(doc["paragraph_checksums"],doc["paragraph_sizes"]):
         # index paragraph checksums
+        print(checkcol)
         nd = checkcol.find_one({"_id":chs})
         if nd is None:
             origsz += paragraph_size
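The new print(checkcol) is debug output inside the paragraph-checksum loop. A hedged sketch of that deduplication step, assuming checkcol is a pymongo collection keyed by paragraph checksum; the connection string, database and collection names below are placeholders, only the loop mirrors the diff.

import pymongo

CONNECTION = "mongodb://localhost:27017"  # assumed; the real value comes from the module config
DBNAME = "crawler"                        # assumed database name

def original_size(doc, checkcol):
    # Sum the sizes of paragraphs whose checksum is not yet indexed,
    # i.e. an estimate of how much of the page is original text.
    origsz = 0
    for chs, paragraph_size in zip(doc["paragraph_checksums"], doc["paragraph_sizes"]):
        nd = checkcol.find_one({"_id": chs})
        if nd is None:
            origsz += paragraph_size
    return origsz

if __name__ == "__main__":
    client = pymongo.MongoClient(CONNECTION)
    db = client[DBNAME]
    checkcol = db["check"]  # hypothetical collection name
    sample = {"paragraph_checksums": [1, 2, 3], "paragraph_sizes": [120, 80, 40]}
    print(original_size(sample, checkcol))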
@@ -736,15 +737,20 @@ def crawl_summary():
         print("\t".join(values))
 
 import binascii
+import json
 
 def import_html():
-    myclient = pymongo.MongoClient(CONNECTION)
+    myclient= pymongo.MongoClient(CONNECTION)
+    db=myclient[DBNAME]
     for l in sys.stdin:
         hdoc = json.loads(l)
         url = hdoc["url"]
-        html = bs4.BeautifulSoup(binascii.b2a_qp(hdoc["quoted_html"])).prettify()
-        doc = extract_pages(url,html)
-        index_page(db,url,url,html,doc)
+        html = BeautifulSoup(binascii.a2b_qp(hdoc["quoted_html"])).prettify()
+        doc = extract_page(url,html)
+        if doc is not None:
+            print(doc)
+            status = index_page(db,url,url,html,doc)
+            print(status)
 
 def sample_domains():
     myclient = pymongo.MongoClient(CONNECTION)
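The corrected call decodes quoted-printable HTML (binascii.a2b_qp) instead of encoding it (b2a_qp). A self-contained sketch of that round trip, with a made-up record shaped like the JSON lines import_html() reads from stdin:

import binascii
import json

from bs4 import BeautifulSoup

# A made-up input record; real records come from the crawler's export.
raw_html = "<html><body><p>Ahoj svet</p></body></html>".encode("utf-8")
record = {
    "url": "http://example.com/",
    "quoted_html": binascii.b2a_qp(raw_html).decode("ascii"),
}
line = json.dumps(record)

hdoc = json.loads(line)
url = hdoc["url"]
# a2b_qp turns the quoted-printable payload back into raw HTML bytes for parsing
html = BeautifulSoup(binascii.a2b_qp(hdoc["quoted_html"]), "html.parser").prettify()
print(url)
print(html)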