diff --git a/mongo/cli.py b/mongo/cli.py
index 89538ef..52089ad 100644
--- a/mongo/cli.py
+++ b/mongo/cli.py
@@ -47,11 +47,16 @@ def sampledomains():
 
 @cli.command(help="Enqueue a list of links into redis queue for crawling")
 def enqueue():
+    # TODO: select queues
     q = rq.Queue(connection=redis.from_url(REDIS_URL))
     for l in sys.stdin:
         print(l.strip())
         r = q.enqueue(mongocrawler.visit, l.strip())
         print(r)
 
+@cli.command(help="Import HTML documents from quoted-printable JSON lines on stdin")
+def importhtml():
+    mongocrawler.import_html()
+
 if __name__ == "__main__":
     cli()
diff --git a/mongo/mongocrawler.py b/mongo/mongocrawler.py
index 29f6d20..e05bee8 100644
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@@ -250,6 +250,7 @@ def index_page(db,original_link,final_link,html,doc):
     origsz = 0
     for chs,paragraph_size in zip(doc["paragraph_checksums"],doc["paragraph_sizes"]):
         # index paragraph checksums
+        print(checkcol)
         nd = checkcol.find_one({"_id":chs})
         if nd is None:
             origsz += paragraph_size
@@ -736,15 +737,21 @@ def crawl_summary():
     print("\t".join(values))
 
 import binascii
+import json
 
 def import_html():
     myclient = pymongo.MongoClient(CONNECTION)
+    db = myclient[DBNAME]
+    # each stdin line is a JSON document: {"url": ..., "quoted_html": ...}
     for l in sys.stdin:
         hdoc = json.loads(l)
         url = hdoc["url"]
-        html = bs4.BeautifulSoup(binascii.b2a_qp(hdoc["quoted_html"])).prettify()
-        doc = extract_pages(url,html)
-        index_page(db,url,url,html,doc)
+        html = bs4.BeautifulSoup(binascii.a2b_qp(hdoc["quoted_html"]),"html.parser").prettify()
+        doc = extract_page(url,html)
+        if doc is not None:
+            print(doc)
+            status = index_page(db,url,url,html,doc)
+            print(status)
 
 def sample_domains():
     myclient = pymongo.MongoClient(CONNECTION)
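
For reference, a minimal sketch of a producer for the input that the new `importhtml` command consumes. It assumes only what the diff shows: `import_html()` reads one JSON object per stdin line with `url` and `quoted_html` fields and decodes the HTML with `binascii.a2b_qp`, so the writer side uses the inverse, `binascii.b2a_qp`. The `emit_record` helper and the example URL are hypothetical.

    # hypothetical producer for the JSON-lines input read by import_html()
    import binascii
    import json
    import sys

    def emit_record(url, html):
        # quoted-printable encoding turns the raw HTML bytes into JSON-safe
        # ASCII; import_html() reverses this with binascii.a2b_qp
        quoted = binascii.b2a_qp(html.encode("utf-8")).decode("ascii")
        sys.stdout.write(json.dumps({"url": url, "quoted_html": quoted}) + "\n")

    if __name__ == "__main__":
        emit_record("https://example.com/", "<html><body><p>Hello</p></body></html>")

Its output can then be piped into the new command, e.g. `python mongo/cli.py importhtml < records.jsonl` (the exact invocation depends on how the `cli` entry point is installed).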