Compare commits

..

5 Commits

Author SHA1 Message Date
5d45569651 zz 2024-03-21 17:31:48 +01:00
ed1d4701b8 Merge branch 'master' of git.kemt.fei.tuke.sk:dano/websucker-pip 2024-03-21 17:01:42 +01:00
bd32d32557 zz 2024-03-21 17:01:39 +01:00
3bdac3642b zz 2024-03-06 18:44:12 +01:00
87f84b8eb8 zz 2024-03-06 18:42:17 +01:00
2 changed files with 21 additions and 8 deletions

View File

@@ -2,6 +2,7 @@ import click
 import mongocrawler
 import rq
 import redis
+import json
 import sys
 import os
 import pymongo
@@ -69,7 +70,8 @@ def sample(domain):
     myclient = pymongo.MongoClient(CONNECTION)
     db=myclient[DBNAME]
     links = mongocrawler.sample_links(db,domain,"frontlink",BATCH_SIZE)
-    print(links)
+    for link in links:
+        print(link)
 @cli.command()
 @click.argument("start_link")
@@ -86,17 +88,18 @@ def fetchlinks(start_link):
 @cli.command()
-@click.argument(hostname)
-def process_links():
+@click.argument("hostname")
+def processlinks(hostname):
     rules = mongocrawler.fetch_robot(hostname)
     outfile = "data.jsonl"
     links = []
     for line in sys.stdin:
         links.append(line.rstrip())
-    extracted_pages, extracted_links = fetch_and_extract(links,rules)
-    for page in extracted_pages:
-        print(page)
-    pass
+    extracted_pages, extracted_links = mongocrawler.fetch_and_extract(links,rules)
+    with open(outfile,"w") as of:
+        for page in extracted_pages:
+            doc = json.dumps(page)
+            print(page,file=of)
 @cli.command(help="Enqueue a list of links into redis queue for crawling")
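For context, a minimal standalone sketch of what the reworked processlinks command does after this change, assuming mongocrawler.fetch_robot returns a robots.txt rules object and mongocrawler.fetch_and_extract returns (extracted_pages, extracted_links) with JSON-serialisable pages; writing the serialised doc rather than the raw page object is an assumption about the intent, since the committed line still prints page:

import json
import sys

import click
import mongocrawler

@click.command()
@click.argument("hostname")
def processlinks(hostname):
    # Read newline-separated links from stdin (e.g. piped from the sample command).
    links = [line.rstrip() for line in sys.stdin]
    # Respect the host's robots.txt rules when fetching.
    rules = mongocrawler.fetch_robot(hostname)
    extracted_pages, extracted_links = mongocrawler.fetch_and_extract(links, rules)
    # Write one JSON document per extracted page (JSONL).
    with open("data.jsonl", "w") as of:
        for page in extracted_pages:
            doc = json.dumps(page)
            print(doc, file=of)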

View File

@@ -530,7 +530,6 @@ def link_summary(db,hostname):
     print(res)
 def sample_links(db,hostname,status,batch_size):
-    print("Sampling links")
     linkcol = db["links"]
     res = linkcol.find({"host":hostname,"status": {"$not":{"$in":["frontlink"]}}})
     cl = LinkClassifier()
@@ -663,6 +662,7 @@ def classify(start_link):
     cl.train(trainset)
     cl.test(testset)
+
 def index_pages(db,hostname,extracted_pages,filter_content):
     final_states = []
     docs = []
@@ -707,6 +707,16 @@ def visit(hostname,filter_content=True):
     print("Processing links")
     extracted_pages, extracted_links = fetch_and_extract(links,rules)
+    extracted_links = extract_links(links,responses,hostname,rules,"frontlink")
+    index_links(db,extracted_links)
+    final_states = []
+    docs = []
+    for original_link,final_link,html,doc in extracted_pages:
+        status = index_page(db,original_link,final_link,html,doc,filter_content)
+        final_states.append(status)
+        docs.append(doc)
+    save_batch_info(db,hostname,final_states,docs)
+
     index_pages(db,hostname,extracted_pages,filter_content)
     index_links(db, extracted_links)
     link_summary(db,hostname)
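The inline loop added to visit() mirrors what the existing index_pages helper (second hunk above) appears to do; as a hedged sketch of that batching logic, with the index_page and save_batch_info signatures assumed from the calls visible in the diff:

def index_pages(db, hostname, extracted_pages, filter_content):
    # Index each fetched page and collect the per-page outcome.
    final_states = []
    docs = []
    for original_link, final_link, html, doc in extracted_pages:
        status = index_page(db, original_link, final_link, html, doc, filter_content)
        final_states.append(status)
        docs.append(doc)
    # Persist aggregate information about this batch of pages.
    save_batch_info(db, hostname, final_states, docs)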