Compare commits
5 commits: 2f8ead3964...5d45569651

| SHA1 |
|---|
| 5d45569651 |
| ed1d4701b8 |
| bd32d32557 |
| 3bdac3642b |
| 87f84b8eb8 |
mongo/cli.py (17 changes)
@@ -2,6 +2,7 @@ import click
 import mongocrawler
 import rq
 import redis
+import json
 import sys
 import os
 import pymongo
@@ -69,7 +70,8 @@ def sample(domain):
     myclient = pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
     links = mongocrawler.sample_links(db,domain,"frontlink",BATCH_SIZE)
-    print(links)
+    for link in links:
+        print(link)
 
 @cli.command()
 @click.argument("start_link")
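With `sample` now printing one link per line instead of the whole list repr, its output can be piped straight into `processlinks`, which reads links from `sys.stdin`. A minimal sketch of that reading side, assuming plain newline-separated links on stdin (the helper name is hypothetical):

```python
import sys

def read_links_from_stdin():
    # One link per line, matching the updated sample output format.
    return [line.rstrip() for line in sys.stdin]
```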
@@ -86,17 +88,18 @@ def fetchlinks(start_link):
 
 
 @cli.command()
-@click.argument(hostname)
-def process_links():
+@click.argument("hostname")
+def processlinks(hostname):
     rules = mongocrawler.fetch_robot(hostname)
     outfile = "data.jsonl"
     links = []
     for line in sys.stdin:
         links.append(line.rstrip())
-    extracted_pages, extracted_links = fetch_and_extract(links,rules)
-    for page in extracted_pages:
-        print(page)
-        pass
+    extracted_pages, extracted_links = mongocrawler.fetch_and_extract(links,rules)
+    with open(outfile,"w") as of:
+        for page in extracted_pages:
+            doc = json.dumps(page)
+            print(page,file=of)
 
 
 @cli.command(help="Enqueue a list of links into redis queue for crawling")
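The rewritten loop serializes each page with `json.dumps` into `doc` but then prints `page` itself to the file; writing the serialized string is the usual JSON Lines pattern. A minimal sketch of that pattern, assuming each `page` is JSON-serializable (the `write_jsonl` helper name is hypothetical):

```python
import json

def write_jsonl(pages, outfile="data.jsonl"):
    # JSON Lines: one serialized document per line.
    with open(outfile, "w") as of:
        for page in pages:
            print(json.dumps(page), file=of)
```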
mongo/mongocrawler.py

@@ -530,7 +530,6 @@ def link_summary(db,hostname):
     print(res)
 
 def sample_links(db,hostname,status,batch_size):
-    print("Sampling links")
     linkcol = db["links"]
     res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink"]}}})
     cl = LinkClassifier()
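The `sample_links` filter wraps `$in` in `$not` to keep links whose status is anything but `frontlink`. A sketch of the equivalent, more idiomatic pymongo query using `$nin`, with the same collection and field names as in the diff (the helper name is hypothetical):

```python
import pymongo

def non_frontlink_links(db, hostname):
    # Equivalent to {"status": {"$not": {"$in": ["frontlink"]}}}:
    # matches documents whose status is absent or not in the list.
    return db["links"].find({"host": hostname, "status": {"$nin": ["frontlink"]}})
```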
@@ -663,6 +662,7 @@ def classify(start_link):
     cl.train(trainset)
     cl.test(testset)
 
+
 def index_pages(db,hostname,extracted_pages,filter_content):
     final_states = []
     docs = []
@@ -707,6 +707,16 @@ def visit(hostname,filter_content=True):
     print("Processing links")
     extracted_pages, extracted_links = fetch_and_extract(links,rules)
 
+    extracted_links = extract_links(links,responses,hostname,rules,"frontlink")
+
+    index_links(db,extracted_links)
+    final_states = []
+    docs = []
+    for original_link,final_link,html,doc in extracted_pages:
+        status = index_page(db,original_link,final_link,html,doc,filter_content)
+        final_states.append(status)
+        docs.append(doc)
+    save_batch_info(db,hostname,final_states,docs)
     index_pages(db,hostname,extracted_pages,filter_content)
     index_links(db, extracted_links)
     link_summary(db,hostname)
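The block added to `visit` indexes each fetched page, collects its final status, and persists batch statistics before the existing `index_pages` call. A condensed sketch of that flow using the names from the diff (`index_page`, `save_batch_info`; `responses` is assumed to be defined earlier in `visit`, and the wrapper name here is hypothetical):

```python
def index_batch(db, hostname, extracted_pages, filter_content=True):
    # Mirrors the loop added to visit(): index each page,
    # track its resulting status, then save batch-level info.
    final_states = []
    docs = []
    for original_link, final_link, html, doc in extracted_pages:
        status = index_page(db, original_link, final_link, html, doc, filter_content)
        final_states.append(status)
        docs.append(doc)
    save_batch_info(db, hostname, final_states, docs)
```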