Compare commits

..

5 Commits

SHA1        Message                                                           Date
5d45569651  zz                                                                2024-03-21 17:31:48 +01:00
ed1d4701b8  Merge branch 'master' of git.kemt.fei.tuke.sk:dano/websucker-pip  2024-03-21 17:01:42 +01:00
bd32d32557  zz                                                                2024-03-21 17:01:39 +01:00
3bdac3642b  zz                                                                2024-03-06 18:44:12 +01:00
87f84b8eb8  zz                                                                2024-03-06 18:42:17 +01:00
2 changed files with 21 additions and 8 deletions

View File

@@ -2,6 +2,7 @@ import click
 import mongocrawler
 import rq
 import redis
+import json
 import sys
 import os
 import pymongo
@@ -69,7 +70,8 @@ def sample(domain):
     myclient = pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
     links = mongocrawler.sample_links(db,domain,"frontlink",BATCH_SIZE)
-    print(links)
+    for link in links:
+        print(link)
 
 @cli.command()
 @click.argument("start_link")
@@ -86,17 +88,18 @@ def fetchlinks(start_link):
 
 @cli.command()
-@click.argument(hostname)
-def process_links():
+@click.argument("hostname")
+def processlinks(hostname):
     rules = mongocrawler.fetch_robot(hostname)
+    outfile = "data.jsonl"
     links = []
     for line in sys.stdin:
         links.append(line.rstrip())
-    extracted_pages, extracted_links = fetch_and_extract(links,rules)
-    for page in extracted_pages:
-        print(page)
-        pass
+    extracted_pages, extracted_links = mongocrawler.fetch_and_extract(links,rules)
+    with open(outfile,"w") as of:
+        for page in extracted_pages:
+            doc = json.dumps(page)
+            print(page,file=of)
 
 @cli.command(help="Enqueue a list of links into redis queue for crawling")
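
The reworked processlinks command reads one URL per line from stdin and writes each extracted page to data.jsonl. Note that the new code builds doc = json.dumps(page) but then prints page itself rather than doc. The JSON Lines pattern the change is aiming at, as a minimal standalone sketch (the record contents here are illustrative):

import json

def write_jsonl(records, path):
    # One JSON object per line: the JSON Lines format behind data.jsonl.
    with open(path, "w") as of:
        for rec in records:
            print(json.dumps(rec), file=of)

if __name__ == "__main__":
    # Stand-ins for extracted pages; in the CLI they come from
    # mongocrawler.fetch_and_extract(links, rules).
    pages = [{"url": "https://example.com/a", "text": "hello"}]
    write_jsonl(pages, "data.jsonl")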

View File

@@ -530,7 +530,6 @@ def link_summary(db,hostname):
     print(res)
 
 def sample_links(db,hostname,status,batch_size):
-    print("Sampling links")
     linkcol = db["links"]
     res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink"]}}})
     cl = LinkClassifier()
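
The query in sample_links above selects link documents for a host whose status is anything but "frontlink". The same filter in isolation, against a local MongoDB (the connection string, database name, and host value are placeholders; $nin is the usual shorthand for this $not/$in combination):

import pymongo

client = pymongo.MongoClient("mongodb://localhost:27017/")
linkcol = client["crawlerdb"]["links"]

# Same selection as the diff's {"$not": {"$in": ["frontlink"]}} filter.
for link in linkcol.find({"host": "example.com",
                          "status": {"$nin": ["frontlink"]}}):
    print(link)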
@@ -663,6 +662,7 @@ def classify(start_link):
     cl.train(trainset)
     cl.test(testset)
 
 def index_pages(db,hostname,extracted_pages,filter_content):
+    final_states = []
     docs = []
@@ -707,6 +707,16 @@ def visit(hostname,filter_content=True):
     print("Processing links")
+    extracted_pages, extracted_links = fetch_and_extract(links,rules)
     extracted_links = extract_links(links,responses,hostname,rules,"frontlink")
     index_links(db,extracted_links)
+    final_states = []
+    docs = []
+    for original_link,final_link,html,doc in extracted_pages:
+        status = index_page(db,original_link,final_link,html,doc,filter_content)
+        final_states.append(status)
+        docs.append(doc)
+    save_batch_info(db,hostname,final_states,docs)
+    index_pages(db,hostname,extracted_pages,filter_content)
+    index_links(db, extracted_links)
     link_summary(db,hostname)
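
The visit hunk ends by delegating batch indexing to index_pages, which wraps a collect-then-summarize pattern: index every page in the batch, keep the per-page status and document, then record one summary for the whole batch. A self-contained sketch of that pattern (index_page and save_batch_info are stubbed here; in the repo they write to MongoDB):

def index_page(db, original_link, final_link, html, doc, filter_content):
    # Stub: the repo's version stores the page in MongoDB and returns a status.
    return "good" if doc else "empty"

def save_batch_info(db, hostname, final_states, docs):
    # Stub: the repo's version records one summary document per batch.
    print(hostname, len(docs), final_states)

def index_pages(db, hostname, extracted_pages, filter_content):
    final_states = []
    docs = []
    for original_link, final_link, html, doc in extracted_pages:
        status = index_page(db, original_link, final_link, html, doc, filter_content)
        final_states.append(status)
        docs.append(doc)
    save_batch_info(db, hostname, final_states, docs)

# Example batch: (original_link, final_link, html, extracted_doc) tuples.
index_pages(None, "example.com",
            [("http://example.com/a", "http://example.com/a", "<html/>", {"title": "a"})],
            filter_content=True)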