This commit is contained in:
Oxy8
2026-03-02 14:32:42 -03:00
parent 022da71e6a
commit bf03d333f9
29 changed files with 200764 additions and 200011 deletions

1
backend/app/__init__.py Normal file
View File

@@ -0,0 +1 @@

247
backend/app/main.py Normal file
View File

@@ -0,0 +1,247 @@
from __future__ import annotations
from contextlib import asynccontextmanager
from fastapi import FastAPI, HTTPException, Query
from fastapi.middleware.cors import CORSMiddleware
from .models import EdgesResponse, GraphResponse, NodesResponse, SparqlQueryRequest, StatsResponse
from .rdf_store import RDFStore
from .sparql_engine import AnzoGraphEngine, RdflibEngine, SparqlEngine, create_sparql_engine
from .settings import Settings
settings = Settings()
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan: start the SPARQL engine, and in rdflib mode also
    build the in-memory node/edge store; shut the engine down on exit.

    Exposes the engine as ``app.state.sparql`` and (rdflib only) the store as
    ``app.state.store``.
    """
    sparql: SparqlEngine = create_sparql_engine(settings)
    await sparql.startup()
    app.state.sparql = sparql
    # Only build node/edge tables when running in rdflib mode.
    if settings.graph_backend == "rdflib":
        assert isinstance(sparql, RdflibEngine)
        if sparql.graph is None:
            raise RuntimeError("rdflib graph failed to load")
        store = RDFStore(
            ttl_path=settings.ttl_path,
            include_bnodes=settings.include_bnodes,
            max_triples=settings.max_triples,
        )
        # Reuse the graph the engine already parsed instead of re-reading the TTL file.
        store.load(sparql.graph)
        app.state.store = store
    yield
    await sparql.shutdown()
# FastAPI application; `lifespan` wires the SPARQL engine up on startup/shutdown.
app = FastAPI(title="visualizador_instanciados backend", lifespan=lifespan)
# CORS origins come from the CORS_ORIGINS setting ("*" by default).
cors_origins = settings.cors_origin_list()
app.add_middleware(
    CORSMiddleware,
    allow_origins=cors_origins,
    allow_credentials=False,  # credentials disabled so wildcard origins stay valid
    allow_methods=["*"],
    allow_headers=["*"],
)
@app.get("/api/health")
def health() -> dict[str, str]:
    """Liveness probe: always reports a healthy status."""
    return dict(status="ok")
@app.get("/api/stats", response_model=StatsResponse)
async def stats() -> StatsResponse:
    """Report backend identity plus triple/node/edge counts.

    rdflib mode reads the precomputed in-memory store; AnzoGraph mode issues
    three COUNT queries against the remote endpoint.
    """
    sparql: SparqlEngine = app.state.sparql
    if settings.graph_backend == "rdflib":
        store: RDFStore = app.state.store
        return StatsResponse(
            backend=sparql.name,
            ttl_path=settings.ttl_path,
            sparql_endpoint=None,
            parsed_triples=store.parsed_triples,
            nodes=store.node_count,
            edges=store.edge_count,
        )
    # AnzoGraph: compute basic counts via SPARQL.
    assert isinstance(sparql, AnzoGraphEngine)

    def _count_from(result: dict, *, var: str = "count") -> int:
        # Extract the first binding's value for `var` from a SPARQL-results-JSON
        # dict; any missing or non-integer data degrades to 0 rather than raising.
        bindings = (((result.get("results") or {}).get("bindings")) or [])
        if not bindings:
            return 0
        raw = bindings[0].get(var, {}).get("value")
        try:
            return int(raw)
        except Exception:
            return 0

    bnode_filter = "" if settings.include_bnodes else "FILTER(!isBlank(?n))"
    # Count every distinct non-literal term appearing as subject or object.
    nodes_q = f"""
    SELECT (COUNT(DISTINCT ?n) AS ?count)
    WHERE {{
      {{ ?n ?p ?o }} UNION {{ ?s ?p ?n }}
      FILTER(!isLiteral(?n))
      {bnode_filter}
    }}
    """
    triples_q = "SELECT (COUNT(*) AS ?count) WHERE { ?s ?p ?o }"
    # Approximate "edges" similarly to our rdflib export: non-literal object, and skip label predicates.
    edges_bnode_filter = "" if settings.include_bnodes else "FILTER(!isBlank(?s) && !isBlank(?o))"
    edges_q = f"""
    SELECT (COUNT(*) AS ?count)
    WHERE {{
      ?s ?p ?o .
      FILTER(!isLiteral(?o))
      FILTER(?p NOT IN (
        <http://www.w3.org/2000/01/rdf-schema#label>,
        <http://www.w3.org/2004/02/skos/core#prefLabel>,
        <http://www.w3.org/2004/02/skos/core#altLabel>
      ))
      {edges_bnode_filter}
    }}
    """
    triples_res = await sparql.query_json(triples_q)
    nodes_res = await sparql.query_json(nodes_q)
    edges_res = await sparql.query_json(edges_q)
    return StatsResponse(
        backend=sparql.name,
        ttl_path=settings.ttl_path,
        sparql_endpoint=settings.effective_sparql_endpoint(),
        parsed_triples=_count_from(triples_res),
        nodes=_count_from(nodes_res),
        edges=_count_from(edges_res),
    )
@app.post("/api/sparql")
async def sparql_query(req: SparqlQueryRequest) -> dict:
    """Run a raw SPARQL query on the active engine and return the JSON results."""
    engine: SparqlEngine = app.state.sparql
    return await engine.query_json(req.query)
@app.get("/api/nodes", response_model=NodesResponse)
def nodes(
    limit: int = Query(default=10_000, ge=1, le=200_000),
    offset: int = Query(default=0, ge=0),
) -> NodesResponse:
    """Page through the precomputed node table (rdflib mode only)."""
    if settings.graph_backend != "rdflib":
        raise HTTPException(status_code=501, detail="GET /api/nodes is only supported in GRAPH_BACKEND=rdflib mode")
    table: RDFStore = app.state.store
    page = table.node_slice(offset=offset, limit=limit)
    return NodesResponse(total=table.node_count, nodes=page)
@app.get("/api/edges", response_model=EdgesResponse)
def edges(
    limit: int = Query(default=50_000, ge=1, le=500_000),
    offset: int = Query(default=0, ge=0),
) -> EdgesResponse:
    """Page through the precomputed edge table (rdflib mode only)."""
    if settings.graph_backend != "rdflib":
        raise HTTPException(status_code=501, detail="GET /api/edges is only supported in GRAPH_BACKEND=rdflib mode")
    table: RDFStore = app.state.store
    page = table.edge_slice(offset=offset, limit=limit)
    return EdgesResponse(total=table.edge_count, edges=page)
@app.get("/api/graph", response_model=GraphResponse)
async def graph(
    node_limit: int = Query(default=50_000, ge=1, le=200_000),
    edge_limit: int = Query(default=100_000, ge=1, le=500_000),
) -> GraphResponse:
    """Return a (possibly truncated) node/edge view of the dataset.

    rdflib mode slices the in-memory store; AnzoGraph mode fetches up to
    `edge_limit` triples and assigns dense integer node ids on the fly.
    """
    sparql: SparqlEngine = app.state.sparql
    if settings.graph_backend == "rdflib":
        store: RDFStore = app.state.store
        return GraphResponse(
            nodes=store.node_slice(offset=0, limit=node_limit),
            edges=store.edge_slice(offset=0, limit=edge_limit),
        )
    # AnzoGraph mode: return a simple subgraph by pulling the first N triples.
    assert isinstance(sparql, AnzoGraphEngine)
    edges_bnode_filter = "" if settings.include_bnodes else "FILTER(!isBlank(?s) && !isBlank(?o))"
    # Skip literal objects and label predicates, mirroring the rdflib export.
    edges_q = f"""
    SELECT ?s ?p ?o
    WHERE {{
      ?s ?p ?o .
      FILTER(!isLiteral(?o))
      FILTER(?p NOT IN (
        <http://www.w3.org/2000/01/rdf-schema#label>,
        <http://www.w3.org/2004/02/skos/core#prefLabel>,
        <http://www.w3.org/2004/02/skos/core#altLabel>
      ))
      {edges_bnode_filter}
    }}
    LIMIT {edge_limit}
    """
    res = await sparql.query_json(edges_q)
    bindings = (((res.get("results") or {}).get("bindings")) or [])
    # Node ids are assigned densely in first-seen order, keyed by (type, value).
    node_id_by_key: dict[tuple[str, str], int] = {}
    node_meta: list[tuple[str, str]] = []  # (termType, iri)
    out_edges: list[dict[str, object]] = []

    def _term_to_key_and_iri(term: dict[str, str]) -> tuple[tuple[str, str], tuple[str, str]] | None:
        # Map a SPARQL-results term to (dedup key, (termType, iri)); None when the
        # term cannot become a node (literal, or bnode while bnodes are excluded).
        t = term.get("type")
        v = term.get("value")
        if not t or v is None:
            return None
        if t == "literal":
            return None
        if t == "bnode" and not settings.include_bnodes:
            return None
        if t == "bnode":
            return (("bnode", v), ("bnode", f"_:{v}"))
        # Default to "uri".
        return (("uri", v), ("uri", v))

    def _get_or_add(term: dict[str, str]) -> int | None:
        # Return the id for `term`, assigning a new one unless node_limit is reached.
        out = _term_to_key_and_iri(term)
        if out is None:
            return None
        key, meta = out
        existing = node_id_by_key.get(key)
        if existing is not None:
            return existing
        if len(node_meta) >= node_limit:
            return None
        nid = len(node_meta)
        node_id_by_key[key] = nid
        node_meta.append(meta)
        return nid

    for b in bindings:
        s_term = b.get("s") or {}
        o_term = b.get("o") or {}
        p_term = b.get("p") or {}
        sid = _get_or_add(s_term)
        oid = _get_or_add(o_term)
        # Edges whose endpoints were filtered out (or overflowed the cap) are dropped.
        if sid is None or oid is None:
            continue
        pred = p_term.get("value")
        if not pred:
            continue
        out_edges.append({"source": sid, "target": oid, "predicate": pred})
    out_nodes = [
        {"id": i, "termType": term_type, "iri": iri, "label": None}
        for i, (term_type, iri) in enumerate(node_meta)
    ]
    return GraphResponse(nodes=out_nodes, edges=out_edges)

44
backend/app/models.py Normal file
View File

@@ -0,0 +1,44 @@
from __future__ import annotations
from pydantic import BaseModel
class Node(BaseModel):
    """A graph node exposed to the frontend."""

    id: int
    termType: str  # "uri" | "bnode"
    iri: str  # full IRI, or "_:<label>" for blank nodes
    label: str | None = None  # human-readable label, when one was captured
class Edge(BaseModel):
    """A directed edge between two node ids."""

    source: int  # Node.id of the subject
    target: int  # Node.id of the object
    predicate: str  # predicate IRI as a string
class StatsResponse(BaseModel):
    """Dataset statistics returned by GET /api/stats."""

    backend: str  # engine name, e.g. "rdflib" or "anzograph"
    ttl_path: str
    sparql_endpoint: str | None = None  # None in rdflib mode
    parsed_triples: int
    nodes: int
    edges: int
class NodesResponse(BaseModel):
    """One page of nodes plus the total node count."""

    total: int
    nodes: list[Node]
class EdgesResponse(BaseModel):
    """One page of edges plus the total edge count."""

    total: int
    edges: list[Edge]
class GraphResponse(BaseModel):
    """A combined node/edge view returned by GET /api/graph."""

    nodes: list[Node]
    edges: list[Edge]
class SparqlQueryRequest(BaseModel):
    """Body of POST /api/sparql: a raw SPARQL query string."""

    query: str

134
backend/app/rdf_store.py Normal file
View File

@@ -0,0 +1,134 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import Any
from rdflib import BNode, Graph, Literal, URIRef
from rdflib.namespace import RDFS, SKOS
# Predicates treated as human-readable labels; captured per node, never emitted as edges.
LABEL_PREDICATES = {RDFS.label, SKOS.prefLabel, SKOS.altLabel}


@dataclass(frozen=True)
class EdgeRow:
    """One directed edge: integer node ids plus the predicate IRI as a string."""

    source: int
    target: int
    predicate: str
class RDFStore:
    """In-memory node/edge tables derived from an rdflib Graph.

    Non-literal terms (URIs, and blank nodes when ``include_bnodes`` is set)
    get dense integer ids in first-seen order. Literal values of label
    predicates are captured per node instead of being emitted as edges.
    """

    def __init__(self, *, ttl_path: str, include_bnodes: bool, max_triples: int | None):
        self.ttl_path = ttl_path
        self.include_bnodes = include_bnodes
        self.max_triples = max_triples  # None means "no cap"
        self.graph: Graph | None = None
        self._id_by_term: dict[Any, int] = {}
        self._term_by_id: list[Any] = []
        self._labels_by_id: dict[int, str] = {}
        self._edges: list[EdgeRow] = []
        self._parsed_triples = 0

    def _term_allowed(self, term: Any) -> bool:
        """Return True if *term* may become a node (non-literal; bnodes only when enabled)."""
        if isinstance(term, Literal):
            return False
        if isinstance(term, BNode) and not self.include_bnodes:
            return False
        return isinstance(term, (URIRef, BNode))

    def _get_id(self, term: Any) -> int | None:
        """Return the node id for *term*, assigning the next dense id on first sight."""
        if not self._term_allowed(term):
            return None
        existing = self._id_by_term.get(term)
        if existing is not None:
            return existing
        nid = len(self._term_by_id)
        self._id_by_term[term] = nid
        self._term_by_id.append(term)
        return nid

    def _term_type(self, term: Any) -> str:
        """Classify a term as "bnode" or "uri" for the API payload."""
        if isinstance(term, BNode):
            return "bnode"
        return "uri"

    def _term_iri(self, term: Any) -> str:
        """Render a term as a string; blank nodes use the "_:<label>" convention."""
        if isinstance(term, BNode):
            return f"_:{term}"
        return str(term)

    def load(self, graph: Graph | None = None) -> None:
        """Build node/edge tables from *graph*, or parse ``self.ttl_path`` when None.

        Processes at most ``max_triples`` triples (when set) and resets all
        previously built tables first.
        """
        g = graph or Graph()
        if graph is None:
            g.parse(self.ttl_path, format="turtle")
        self.graph = g
        self._id_by_term.clear()
        self._term_by_id.clear()
        self._labels_by_id.clear()
        self._edges.clear()
        parsed = 0
        for (s, p, o) in g:
            # Check the cap BEFORE counting so `parsed_triples` equals the number
            # of triples actually processed. (Previously the counter was bumped
            # first, over-reporting by one whenever truncation kicked in.)
            if self.max_triples is not None and parsed >= self.max_triples:
                break
            parsed += 1
            # Capture labels but do not emit them as edges.
            if p in LABEL_PREDICATES and isinstance(o, Literal):
                sid = self._get_id(s)
                # Keep only the first label seen per node.
                if sid is not None and sid not in self._labels_by_id:
                    self._labels_by_id[sid] = str(o)
                continue
            sid = self._get_id(s)
            oid = self._get_id(o)
            if sid is None or oid is None:
                continue
            self._edges.append(EdgeRow(source=sid, target=oid, predicate=str(p)))
        self._parsed_triples = parsed

    @property
    def parsed_triples(self) -> int:
        """Number of triples processed by the last ``load`` (capped by max_triples)."""
        return self._parsed_triples

    @property
    def node_count(self) -> int:
        """Total number of distinct nodes."""
        return len(self._term_by_id)

    @property
    def edge_count(self) -> int:
        """Total number of edges."""
        return len(self._edges)

    def node_slice(self, *, offset: int, limit: int) -> list[dict[str, Any]]:
        """Return up to *limit* node dicts starting at *offset* (id order)."""
        end = min(self.node_count, offset + limit)
        out: list[dict[str, Any]] = []
        for nid in range(offset, end):
            term = self._term_by_id[nid]
            out.append(
                {
                    "id": nid,
                    "termType": self._term_type(term),
                    "iri": self._term_iri(term),
                    "label": self._labels_by_id.get(nid),
                }
            )
        return out

    def edge_slice(self, *, offset: int, limit: int) -> list[dict[str, Any]]:
        """Return up to *limit* edge dicts starting at *offset* (insertion order)."""
        end = min(self.edge_count, offset + limit)
        out: list[dict[str, Any]] = []
        for row in self._edges[offset:end]:
            out.append(
                {
                    "source": row.source,
                    "target": row.target,
                    "predicate": row.predicate,
                }
            )
        return out

50
backend/app/settings.py Normal file
View File

@@ -0,0 +1,50 @@
from __future__ import annotations
from typing import Literal
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict
class Settings(BaseSettings):
    """Environment-driven configuration (reads ``.env``; unknown keys ignored)."""

    # Which graph engine executes SPARQL queries.
    # - rdflib: parse TTL locally and query in-memory
    # - anzograph: query a remote AnzoGraph SPARQL endpoint (optionally LOAD on startup)
    graph_backend: Literal["rdflib", "anzograph"] = Field(default="rdflib", alias="GRAPH_BACKEND")
    ttl_path: str = Field(default="/data/o3po.ttl", alias="TTL_PATH")
    include_bnodes: bool = Field(default=False, alias="INCLUDE_BNODES")
    max_triples: int | None = Field(default=None, alias="MAX_TRIPLES")
    # AnzoGraph / SPARQL endpoint configuration
    sparql_host: str = Field(default="http://anzograph:8080", alias="SPARQL_HOST")
    # If not set, the backend uses `${SPARQL_HOST}/sparql`.
    sparql_endpoint: str | None = Field(default=None, alias="SPARQL_ENDPOINT")
    sparql_user: str | None = Field(default=None, alias="SPARQL_USER")
    sparql_pass: str | None = Field(default=None, alias="SPARQL_PASS")
    # File URI as seen by the AnzoGraph container (used with SPARQL `LOAD`).
    # Example: file:///opt/shared-files/o3po.ttl
    sparql_data_file: str | None = Field(default=None, alias="SPARQL_DATA_FILE")
    sparql_graph_iri: str | None = Field(default=None, alias="SPARQL_GRAPH_IRI")
    sparql_load_on_start: bool = Field(default=False, alias="SPARQL_LOAD_ON_START")
    sparql_clear_on_start: bool = Field(default=False, alias="SPARQL_CLEAR_ON_START")
    sparql_timeout_s: float = Field(default=300.0, alias="SPARQL_TIMEOUT_S")
    sparql_ready_retries: int = Field(default=30, alias="SPARQL_READY_RETRIES")
    sparql_ready_delay_s: float = Field(default=4.0, alias="SPARQL_READY_DELAY_S")
    # Comma-separated, or "*" (default).
    cors_origins: str = Field(default="*", alias="CORS_ORIGINS")
    model_config = SettingsConfigDict(env_file=".env", extra="ignore")

    def cors_origin_list(self) -> list[str]:
        """Return the allowed CORS origins; "*" collapses to the wildcard list."""
        if self.cors_origins.strip() == "*":
            return ["*"]
        return [o.strip() for o in self.cors_origins.split(",") if o.strip()]

    def effective_sparql_endpoint(self) -> str:
        """Return the explicit SPARQL endpoint when set, else ``${SPARQL_HOST}/sparql``."""
        if self.sparql_endpoint and self.sparql_endpoint.strip():
            return self.sparql_endpoint.strip()
        return self.sparql_host.rstrip("/") + "/sparql"

155
backend/app/sparql_engine.py Normal file
View File

@@ -0,0 +1,155 @@
from __future__ import annotations
import asyncio
import base64
import json
from typing import Any, Protocol
import httpx
from rdflib import Graph
from .settings import Settings
class SparqlEngine(Protocol):
    """Structural interface every SPARQL backend must satisfy."""

    # Short backend identifier surfaced via the stats endpoint (e.g. "rdflib").
    name: str

    async def startup(self) -> None: ...
    async def shutdown(self) -> None: ...
    async def query_json(self, query: str) -> dict[str, Any]: ...
class RdflibEngine:
    """SPARQL engine that parses a Turtle file into an in-memory rdflib Graph."""

    name = "rdflib"

    def __init__(self, *, ttl_path: str):
        self.ttl_path = ttl_path
        self.graph: Graph | None = None

    async def startup(self) -> None:
        """Parse the Turtle file once; subsequent queries hit the in-memory graph."""
        parsed = Graph()
        parsed.parse(self.ttl_path, format="turtle")
        self.graph = parsed

    async def shutdown(self) -> None:
        # In-memory graph: nothing to release.
        return None

    async def query_json(self, query: str) -> dict[str, Any]:
        """Execute *query* and return SPARQL-results-JSON as a dict.

        Raises RuntimeError when called before ``startup``.
        """
        if self.graph is None:
            raise RuntimeError("RdflibEngine not started")
        raw = self.graph.query(query).serialize(format="json")
        text = raw.decode("utf-8") if isinstance(raw, bytes) else raw
        return json.loads(text)
class AnzoGraphEngine:
    """SPARQL engine backed by a remote AnzoGraph HTTP endpoint.

    On startup it opens an async HTTP client and, depending on settings, may
    CLEAR and/or LOAD the dataset, polling a trivial ASK query between steps
    until the endpoint answers.
    """

    name = "anzograph"

    def __init__(self, *, settings: Settings):
        self.endpoint = settings.effective_sparql_endpoint()
        self.timeout_s = settings.sparql_timeout_s
        self.ready_retries = settings.sparql_ready_retries
        self.ready_delay_s = settings.sparql_ready_delay_s
        self.user = settings.sparql_user
        self.password = settings.sparql_pass
        self.data_file = settings.sparql_data_file
        self.graph_iri = settings.sparql_graph_iri
        self.load_on_start = settings.sparql_load_on_start
        self.clear_on_start = settings.sparql_clear_on_start
        self._client: httpx.AsyncClient | None = None
        # Precomputed "Basic ..." header value, or None when credentials are absent.
        self._auth_header = self._build_auth_header(self.user, self.password)

    @staticmethod
    def _build_auth_header(user: str | None, password: str | None) -> str | None:
        """Return an HTTP Basic Authorization header value, or None without credentials."""
        if not user or not password:
            return None
        token = base64.b64encode(f"{user}:{password}".encode("utf-8")).decode("ascii")
        return f"Basic {token}"

    async def startup(self) -> None:
        """Open the HTTP client, then optionally CLEAR and LOAD the dataset."""
        self._client = httpx.AsyncClient(timeout=self.timeout_s)
        await self._wait_ready()
        if self.clear_on_start:
            await self._update("CLEAR ALL")
            await self._wait_ready()
        if self.load_on_start:
            if not self.data_file:
                raise RuntimeError("SPARQL_LOAD_ON_START=true but SPARQL_DATA_FILE is not set")
            if self.graph_iri:
                await self._update(f"LOAD <{self.data_file}> INTO GRAPH <{self.graph_iri}>")
            else:
                await self._update(f"LOAD <{self.data_file}>")
            # AnzoGraph may still be indexing after LOAD.
            await self._wait_ready()

    async def shutdown(self) -> None:
        """Close the HTTP client; safe to call when never started."""
        if self._client is not None:
            await self._client.aclose()
            self._client = None

    async def query_json(self, query: str) -> dict[str, Any]:
        """POST *query* to the endpoint and return parsed SPARQL-results-JSON.

        Raises RuntimeError before startup; HTTP failures raise via
        ``raise_for_status``.
        """
        if self._client is None:
            raise RuntimeError("AnzoGraphEngine not started")
        headers = {
            "Content-Type": "application/x-www-form-urlencoded",
            "Accept": "application/sparql-results+json",
        }
        if self._auth_header:
            headers["Authorization"] = self._auth_header
        # AnzoGraph expects x-www-form-urlencoded with `query=...`.
        resp = await self._client.post(
            self.endpoint,
            headers=headers,
            data={"query": query},
        )
        resp.raise_for_status()
        return resp.json()

    async def _update(self, update: str) -> None:
        """POST a SPARQL UPDATE as a raw ``application/sparql-update`` body."""
        if self._client is None:
            raise RuntimeError("AnzoGraphEngine not started")
        headers = {
            "Content-Type": "application/sparql-update",
            "Accept": "application/json",
        }
        if self._auth_header:
            headers["Authorization"] = self._auth_header
        resp = await self._client.post(self.endpoint, headers=headers, content=update)
        resp.raise_for_status()

    async def _wait_ready(self) -> None:
        """Poll the endpoint with a trivial ASK until it answers, else raise.

        Retries ``ready_retries`` times with ``ready_delay_s`` between attempts;
        the last failure is chained onto the final RuntimeError.
        """
        last_err: Exception | None = None
        for _ in range(self.ready_retries):
            try:
                # Keep it cheap and JSON-parseable.
                await self.query_json("ASK WHERE { ?s ?p ?o }")
                return
            except Exception as e:
                last_err = e
                await asyncio.sleep(self.ready_delay_s)
        raise RuntimeError(f"AnzoGraph not ready at {self.endpoint}") from last_err
def create_sparql_engine(settings: Settings) -> SparqlEngine:
    """Instantiate the SPARQL engine selected by ``settings.graph_backend``.

    Raises RuntimeError for an unrecognized backend value.
    """
    backend = settings.graph_backend
    if backend == "rdflib":
        return RdflibEngine(ttl_path=settings.ttl_path)
    if backend == "anzograph":
        return AnzoGraphEngine(settings=settings)
    raise RuntimeError(f"Unsupported GRAPH_BACKEND={settings.graph_backend!r}")