"""Article ingestion API.

Fetches a web page, extracts its main text with trafilatura, asks an external
summarizer service for a summary, and stores the result in PostgreSQL.
"""

import asyncio
import os
from contextlib import asynccontextmanager
from urllib.parse import quote

import asyncpg
import httpx
import trafilatura
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

# User and password are percent-encoded so that reserved URI characters
# ("@", ":", "/", "#", "%", ...) in the credentials cannot corrupt the DSN.
DB_DSN = "postgresql://{user}:{password}@{host}:{port}/{dbname}".format(
    user=quote(os.environ["DB_USER"], safe=""),
    password=quote(os.environ["DB_PASSWORD"], safe=""),
    host=os.environ["DB_HOST"],
    port=os.environ.get("DB_PORT", "5432"),
    dbname=os.environ["DB_NAME"],
)
SUMMARIZER_URL = os.environ.get("SUMMARIZER_URL", "http://summarizer:8000")

# Connection pool shared by all handlers; created in `lifespan` at startup.
pool = None


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Create the connection pool and the `articles` table, then clean up.

    The pool is closed on shutdown even if schema creation fails or an
    exception is propagated into the context manager.
    """
    global pool
    pool = await asyncpg.create_pool(DB_DSN)
    try:
        await pool.execute(
            """
            CREATE TABLE IF NOT EXISTS articles (
                id SERIAL PRIMARY KEY,
                url TEXT NOT NULL,
                title TEXT DEFAULT '',
                full_text TEXT DEFAULT '',
                summary TEXT DEFAULT '',
                created_at TIMESTAMPTZ DEFAULT NOW()
            )
            """
        )
        yield
    finally:
        await pool.close()


app = FastAPI(lifespan=lifespan)


class URLRequest(BaseModel):
    """Request body for article creation."""

    # Address of the page to fetch and summarize.
    url: str


@app.get("/api/health")
async def health():
    """Return a static liveness payload."""
    return {"status": "ok"}


@app.get("/api/articles")
async def list_articles():
    """Return the 50 most recently created articles, newest first.

    The stored full text is not included in the listing.
    """
    rows = await pool.fetch(
        "SELECT id, url, title, summary, created_at "
        "FROM articles ORDER BY created_at DESC LIMIT 50"
    )
    return [dict(r) for r in rows]


async def _summarize(text: str) -> str:
    """Request a summary of *text* from the summarizer service.

    Raises:
        HTTPException: 502 if the service cannot be reached, answers with a
            non-200 status, or returns a body without a usable "summary".
    """
    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            resp = await client.post(
                f"{SUMMARIZER_URL}/summarize", json={"text": text}
            )
    except httpx.RequestError as exc:
        # Connection failures and timeouts are upstream problems, not ours.
        raise HTTPException(502, "Summarizer service unavailable") from exc

    if resp.status_code != 200:
        raise HTTPException(502, "Summarizer service unavailable")

    try:
        return resp.json()["summary"]
    except (ValueError, KeyError, TypeError) as exc:
        # Body was not JSON, or was JSON without a "summary" entry.
        raise HTTPException(
            502, "Summarizer service returned an invalid response"
        ) from exc


@app.post("/api/articles")
async def create_article(req: URLRequest):
    """Fetch `req.url`, extract and summarize its text, and store the article.

    Returns:
        The inserted row as a dict with id, url, title, summary and
        created_at.

    Raises:
        HTTPException: 400 if the page cannot be fetched or no text can be
            extracted; 502 if the summarizer service fails.
    """
    # trafilatura is synchronous; run it in a worker thread so the event loop
    # stays responsive.
    downloaded = await asyncio.to_thread(trafilatura.fetch_url, req.url)
    if not downloaded:
        raise HTTPException(400, "Failed to fetch URL")

    result = await asyncio.to_thread(
        trafilatura.bare_extraction, downloaded, include_comments=False
    )
    # NOTE(review): this relies on bare_extraction returning a dict-like
    # object; confirm against the installed trafilatura version.
    if not result or not result.get("text"):
        raise HTTPException(400, "Failed to extract text")

    text = result["text"]
    # The key may be present with a None value; store '' rather than NULL.
    title = result.get("title") or ""

    summary = await _summarize(text)

    row = await pool.fetchrow(
        "INSERT INTO articles (url, title, full_text, summary) "
        "VALUES ($1, $2, $3, $4) "
        "RETURNING id, url, title, summary, created_at",
        req.url,
        title,
        text,
        summary,
    )
    return dict(row)