zz

2024-03-21 17:31:48 +01:00 · 2024-03-21 17:31:48 +01:00 · 5d45569651
commit 5d45569651
parent ed1d4701b8
2 changed files with 10 additions and 8 deletions
--- a/mongo/cli.py
+++ b/mongo/cli.py
@ -2,6 +2,7 @@ import click
 import mongocrawler
 import rq
 import redis
+import json
 import sys
 import os
 import pymongo
@ -69,7 +70,8 @@ def sample(domain):
    myclient = pymongo.MongoClient(CONNECTION)
    db=myclient[DBNAME]
    links = mongocrawler.sample_links(db,domain,"frontlink",BATCH_SIZE)
-    print(links)
+    for link in links:
+        print(link)

@cli.command()
@click.argument("start_link")
@ -86,17 +88,18 @@ def fetchlinks(start_link):


@cli.command()
-@click.argument(hostname)
-def process_links():
+@click.argument("hostname")
+def processlinks(hostname):
    rules = mongocrawler.fetch_robot(hostname)
    outfile = "data.jsonl"
    links = []
    for line in sys.stdin:
        links.append(line.rstrip())
-    extracted_pages, extracted_links = fetch_and_extract(links,rules)
-    for page in extracted_pages:
-        print(page)
-    pass
+    extracted_pages, extracted_links = mongocrawler.fetch_and_extract(links,rules)
+    with open(outfile,"w") as of:
+        for page in extracted_pages:
+            doc = json.dumps(page)  # serialize so each output line is valid JSON
+            print(doc,file=of)  # write the JSON text, not the Python repr of page


@cli.command(help="Enqueue a list of links into redis queue for crawling")
--- a/mongo/mongocrawler.py
+++ b/mongo/mongocrawler.py
@ -530,7 +530,6 @@ def link_summary(db,hostname):
    print(res)

 def sample_links(db,hostname,status,batch_size):
-    print("Sampling links")
    linkcol = db["links"]
    res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink"]}}})
    cl = LinkClassifier()