zz

2023-04-13 16:37:32 +02:00 · 2023-04-13 16:37:32 +02:00 · 4a42078bef
commit 4a42078bef
parent 44dc4be8c3
2 changed files with 15 additions and 4 deletions
--- a/mongo/cli.py
+++ b/mongo/cli.py
@@ -47,11 +47,16 @@ def sampledomains():

@cli.command(help="Enqueue a list of links into redis queue for crawling")
 def enqueue():
+    # TODO: select queues
    q = rq.Queue(connection=redis.from_url(REDIS_URL))
    for l in sys.stdin:
        print(l.strip())
        r = q.enqueue(mongocrawler.visit, l.strip())
        print(r)

+@cli.command()
+def importhtml():
+    mongocrawler.import_html()
+
 if __name__ == "__main__":
    cli()
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@@ -250,6 +250,7 @@ def index_page(db,original_link,final_link,html,doc):
        origsz = 0
        for chs,paragraph_size in zip(doc["paragraph_checksums"],doc["paragraph_sizes"]):
            # index paragraph checksums
+            print(checkcol)
            nd = checkcol.find_one({"_id":chs})
            if nd is None:
                origsz += paragraph_size
@@ -736,15 +737,20 @@ def crawl_summary():
        print("\t".join(values))

 import binascii
+import json

 def import_html():
-    myclient = pymongo.MongoClient(CONNECTION)
+    myclient= pymongo.MongoClient(CONNECTION)
+    db=myclient[DBNAME]
    for l in sys.stdin:
        hdoc = json.loads(l)
        url = hdoc["url"]
-        html = bs4.BeautifulSoup(binascii.b2a_qp(hdoc["quoted_html"])).prettify()
-        doc = extract_pages(url,html)
-        index_page(db,url,url,html,doc)
+        html = BeautifulSoup(binascii.a2b_qp(hdoc["quoted_html"])).prettify()
+        doc = extract_page(url,html)
+        if doc is not None:
+            print(doc)
+            status = index_page(db,url,url,html,doc)
+            print(status)

 def sample_domains():
    myclient = pymongo.MongoClient(CONNECTION)