This commit is contained in:
Oxy8
2026-03-02 14:32:42 -03:00
parent 022da71e6a
commit bf03d333f9
29 changed files with 200764 additions and 200011 deletions

1
backend/app/__init__.py Normal file
View File

@@ -0,0 +1 @@

247
backend/app/main.py Normal file
View File

@@ -0,0 +1,247 @@
from __future__ import annotations
from contextlib import asynccontextmanager
from fastapi import FastAPI, HTTPException, Query
from fastapi.middleware.cors import CORSMiddleware
from .models import EdgesResponse, GraphResponse, NodesResponse, SparqlQueryRequest, StatsResponse
from .rdf_store import RDFStore
from .sparql_engine import AnzoGraphEngine, RdflibEngine, SparqlEngine, create_sparql_engine
from .settings import Settings
settings = Settings()
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan: start the SPARQL engine, and in rdflib mode also
    build the in-memory node/edge store; shut the engine down on exit.

    Exposes the engine as ``app.state.sparql`` and (rdflib only) the store as
    ``app.state.store``.
    """
    sparql: SparqlEngine = create_sparql_engine(settings)
    await sparql.startup()
    app.state.sparql = sparql
    # Only build node/edge tables when running in rdflib mode.
    if settings.graph_backend == "rdflib":
        assert isinstance(sparql, RdflibEngine)
        if sparql.graph is None:
            raise RuntimeError("rdflib graph failed to load")
        store = RDFStore(
            ttl_path=settings.ttl_path,
            include_bnodes=settings.include_bnodes,
            max_triples=settings.max_triples,
        )
        # Reuse the graph the engine already parsed instead of re-reading the TTL file.
        store.load(sparql.graph)
        app.state.store = store
    yield
    await sparql.shutdown()
# FastAPI application; `lifespan` wires the SPARQL engine up on startup/shutdown.
app = FastAPI(title="visualizador_instanciados backend", lifespan=lifespan)
# CORS origins come from the CORS_ORIGINS setting ("*" by default).
cors_origins = settings.cors_origin_list()
app.add_middleware(
    CORSMiddleware,
    allow_origins=cors_origins,
    allow_credentials=False,  # credentials disabled so wildcard origins stay valid
    allow_methods=["*"],
    allow_headers=["*"],
)
@app.get("/api/health")
def health() -> dict[str, str]:
    """Liveness probe: always reports a healthy status."""
    return dict(status="ok")
@app.get("/api/stats", response_model=StatsResponse)
async def stats() -> StatsResponse:
    """Report backend identity plus triple/node/edge counts.

    rdflib mode reads the precomputed in-memory store; AnzoGraph mode issues
    three COUNT queries against the remote endpoint.
    """
    sparql: SparqlEngine = app.state.sparql
    if settings.graph_backend == "rdflib":
        store: RDFStore = app.state.store
        return StatsResponse(
            backend=sparql.name,
            ttl_path=settings.ttl_path,
            sparql_endpoint=None,
            parsed_triples=store.parsed_triples,
            nodes=store.node_count,
            edges=store.edge_count,
        )
    # AnzoGraph: compute basic counts via SPARQL.
    assert isinstance(sparql, AnzoGraphEngine)

    def _count_from(result: dict, *, var: str = "count") -> int:
        # Extract the first binding's value for `var` from a SPARQL-results-JSON
        # dict; any missing or non-integer data degrades to 0 rather than raising.
        bindings = (((result.get("results") or {}).get("bindings")) or [])
        if not bindings:
            return 0
        raw = bindings[0].get(var, {}).get("value")
        try:
            return int(raw)
        except Exception:
            return 0

    bnode_filter = "" if settings.include_bnodes else "FILTER(!isBlank(?n))"
    # Count every distinct non-literal term appearing as subject or object.
    nodes_q = f"""
    SELECT (COUNT(DISTINCT ?n) AS ?count)
    WHERE {{
      {{ ?n ?p ?o }} UNION {{ ?s ?p ?n }}
      FILTER(!isLiteral(?n))
      {bnode_filter}
    }}
    """
    triples_q = "SELECT (COUNT(*) AS ?count) WHERE { ?s ?p ?o }"
    # Approximate "edges" similarly to our rdflib export: non-literal object, and skip label predicates.
    edges_bnode_filter = "" if settings.include_bnodes else "FILTER(!isBlank(?s) && !isBlank(?o))"
    edges_q = f"""
    SELECT (COUNT(*) AS ?count)
    WHERE {{
      ?s ?p ?o .
      FILTER(!isLiteral(?o))
      FILTER(?p NOT IN (
        <http://www.w3.org/2000/01/rdf-schema#label>,
        <http://www.w3.org/2004/02/skos/core#prefLabel>,
        <http://www.w3.org/2004/02/skos/core#altLabel>
      ))
      {edges_bnode_filter}
    }}
    """
    triples_res = await sparql.query_json(triples_q)
    nodes_res = await sparql.query_json(nodes_q)
    edges_res = await sparql.query_json(edges_q)
    return StatsResponse(
        backend=sparql.name,
        ttl_path=settings.ttl_path,
        sparql_endpoint=settings.effective_sparql_endpoint(),
        parsed_triples=_count_from(triples_res),
        nodes=_count_from(nodes_res),
        edges=_count_from(edges_res),
    )
@app.post("/api/sparql")
async def sparql_query(req: SparqlQueryRequest) -> dict:
    """Run a raw SPARQL query on the active engine and return the JSON results."""
    engine: SparqlEngine = app.state.sparql
    return await engine.query_json(req.query)
@app.get("/api/nodes", response_model=NodesResponse)
def nodes(
    limit: int = Query(default=10_000, ge=1, le=200_000),
    offset: int = Query(default=0, ge=0),
) -> NodesResponse:
    """Page through the precomputed node table (rdflib mode only)."""
    if settings.graph_backend != "rdflib":
        raise HTTPException(status_code=501, detail="GET /api/nodes is only supported in GRAPH_BACKEND=rdflib mode")
    table: RDFStore = app.state.store
    page = table.node_slice(offset=offset, limit=limit)
    return NodesResponse(total=table.node_count, nodes=page)
@app.get("/api/edges", response_model=EdgesResponse)
def edges(
    limit: int = Query(default=50_000, ge=1, le=500_000),
    offset: int = Query(default=0, ge=0),
) -> EdgesResponse:
    """Page through the precomputed edge table (rdflib mode only)."""
    if settings.graph_backend != "rdflib":
        raise HTTPException(status_code=501, detail="GET /api/edges is only supported in GRAPH_BACKEND=rdflib mode")
    table: RDFStore = app.state.store
    page = table.edge_slice(offset=offset, limit=limit)
    return EdgesResponse(total=table.edge_count, edges=page)
@app.get("/api/graph", response_model=GraphResponse)
async def graph(
    node_limit: int = Query(default=50_000, ge=1, le=200_000),
    edge_limit: int = Query(default=100_000, ge=1, le=500_000),
) -> GraphResponse:
    """Return a (possibly truncated) node/edge view of the dataset.

    rdflib mode slices the in-memory store; AnzoGraph mode fetches up to
    `edge_limit` triples and assigns dense integer node ids on the fly.
    """
    sparql: SparqlEngine = app.state.sparql
    if settings.graph_backend == "rdflib":
        store: RDFStore = app.state.store
        return GraphResponse(
            nodes=store.node_slice(offset=0, limit=node_limit),
            edges=store.edge_slice(offset=0, limit=edge_limit),
        )
    # AnzoGraph mode: return a simple subgraph by pulling the first N triples.
    assert isinstance(sparql, AnzoGraphEngine)
    edges_bnode_filter = "" if settings.include_bnodes else "FILTER(!isBlank(?s) && !isBlank(?o))"
    # Skip literal objects and label predicates, mirroring the rdflib export.
    edges_q = f"""
    SELECT ?s ?p ?o
    WHERE {{
      ?s ?p ?o .
      FILTER(!isLiteral(?o))
      FILTER(?p NOT IN (
        <http://www.w3.org/2000/01/rdf-schema#label>,
        <http://www.w3.org/2004/02/skos/core#prefLabel>,
        <http://www.w3.org/2004/02/skos/core#altLabel>
      ))
      {edges_bnode_filter}
    }}
    LIMIT {edge_limit}
    """
    res = await sparql.query_json(edges_q)
    bindings = (((res.get("results") or {}).get("bindings")) or [])
    # Node ids are assigned densely in first-seen order, keyed by (type, value).
    node_id_by_key: dict[tuple[str, str], int] = {}
    node_meta: list[tuple[str, str]] = []  # (termType, iri)
    out_edges: list[dict[str, object]] = []

    def _term_to_key_and_iri(term: dict[str, str]) -> tuple[tuple[str, str], tuple[str, str]] | None:
        # Map a SPARQL-results term to (dedup key, (termType, iri)); None when the
        # term cannot become a node (literal, or bnode while bnodes are excluded).
        t = term.get("type")
        v = term.get("value")
        if not t or v is None:
            return None
        if t == "literal":
            return None
        if t == "bnode" and not settings.include_bnodes:
            return None
        if t == "bnode":
            return (("bnode", v), ("bnode", f"_:{v}"))
        # Default to "uri".
        return (("uri", v), ("uri", v))

    def _get_or_add(term: dict[str, str]) -> int | None:
        # Return the id for `term`, assigning a new one unless node_limit is reached.
        out = _term_to_key_and_iri(term)
        if out is None:
            return None
        key, meta = out
        existing = node_id_by_key.get(key)
        if existing is not None:
            return existing
        if len(node_meta) >= node_limit:
            return None
        nid = len(node_meta)
        node_id_by_key[key] = nid
        node_meta.append(meta)
        return nid

    for b in bindings:
        s_term = b.get("s") or {}
        o_term = b.get("o") or {}
        p_term = b.get("p") or {}
        sid = _get_or_add(s_term)
        oid = _get_or_add(o_term)
        # Edges whose endpoints were filtered out (or overflowed the cap) are dropped.
        if sid is None or oid is None:
            continue
        pred = p_term.get("value")
        if not pred:
            continue
        out_edges.append({"source": sid, "target": oid, "predicate": pred})
    out_nodes = [
        {"id": i, "termType": term_type, "iri": iri, "label": None}
        for i, (term_type, iri) in enumerate(node_meta)
    ]
    return GraphResponse(nodes=out_nodes, edges=out_edges)

44
backend/app/models.py Normal file
View File

@@ -0,0 +1,44 @@
from __future__ import annotations
from pydantic import BaseModel
class Node(BaseModel):
    """A graph node exposed to the frontend."""

    id: int
    termType: str  # "uri" | "bnode"
    iri: str  # full IRI, or "_:<label>" for blank nodes
    label: str | None = None  # human-readable label, when one was captured
class Edge(BaseModel):
    """A directed edge between two node ids."""

    source: int  # Node.id of the subject
    target: int  # Node.id of the object
    predicate: str  # predicate IRI as a string
class StatsResponse(BaseModel):
    """Dataset statistics returned by GET /api/stats."""

    backend: str  # engine name, e.g. "rdflib" or "anzograph"
    ttl_path: str
    sparql_endpoint: str | None = None  # None in rdflib mode
    parsed_triples: int
    nodes: int
    edges: int
class NodesResponse(BaseModel):
    """One page of nodes plus the total node count."""

    total: int
    nodes: list[Node]
class EdgesResponse(BaseModel):
    """One page of edges plus the total edge count."""

    total: int
    edges: list[Edge]
class GraphResponse(BaseModel):
    """A combined node/edge view returned by GET /api/graph."""

    nodes: list[Node]
    edges: list[Edge]
class SparqlQueryRequest(BaseModel):
    """Body of POST /api/sparql: a raw SPARQL query string."""

    query: str

134
backend/app/rdf_store.py Normal file
View File

@@ -0,0 +1,134 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import Any
from rdflib import BNode, Graph, Literal, URIRef
from rdflib.namespace import RDFS, SKOS
# Predicates treated as human-readable labels; captured per node, never emitted as edges.
LABEL_PREDICATES = {RDFS.label, SKOS.prefLabel, SKOS.altLabel}


@dataclass(frozen=True)
class EdgeRow:
    """One directed edge: integer node ids plus the predicate IRI as a string."""

    source: int
    target: int
    predicate: str
class RDFStore:
    """In-memory node/edge tables derived from an rdflib Graph.

    Non-literal terms (URIs, and blank nodes when ``include_bnodes`` is set)
    get dense integer ids in first-seen order. Literal values of label
    predicates are captured per node instead of being emitted as edges.
    """

    def __init__(self, *, ttl_path: str, include_bnodes: bool, max_triples: int | None):
        self.ttl_path = ttl_path
        self.include_bnodes = include_bnodes
        self.max_triples = max_triples  # None means "no cap"
        self.graph: Graph | None = None
        self._id_by_term: dict[Any, int] = {}
        self._term_by_id: list[Any] = []
        self._labels_by_id: dict[int, str] = {}
        self._edges: list[EdgeRow] = []
        self._parsed_triples = 0

    def _term_allowed(self, term: Any) -> bool:
        """Return True if *term* may become a node (non-literal; bnodes only when enabled)."""
        if isinstance(term, Literal):
            return False
        if isinstance(term, BNode) and not self.include_bnodes:
            return False
        return isinstance(term, (URIRef, BNode))

    def _get_id(self, term: Any) -> int | None:
        """Return the node id for *term*, assigning the next dense id on first sight."""
        if not self._term_allowed(term):
            return None
        existing = self._id_by_term.get(term)
        if existing is not None:
            return existing
        nid = len(self._term_by_id)
        self._id_by_term[term] = nid
        self._term_by_id.append(term)
        return nid

    def _term_type(self, term: Any) -> str:
        """Classify a term as "bnode" or "uri" for the API payload."""
        if isinstance(term, BNode):
            return "bnode"
        return "uri"

    def _term_iri(self, term: Any) -> str:
        """Render a term as a string; blank nodes use the "_:<label>" convention."""
        if isinstance(term, BNode):
            return f"_:{term}"
        return str(term)

    def load(self, graph: Graph | None = None) -> None:
        """Build node/edge tables from *graph*, or parse ``self.ttl_path`` when None.

        Processes at most ``max_triples`` triples (when set) and resets all
        previously built tables first.
        """
        g = graph or Graph()
        if graph is None:
            g.parse(self.ttl_path, format="turtle")
        self.graph = g
        self._id_by_term.clear()
        self._term_by_id.clear()
        self._labels_by_id.clear()
        self._edges.clear()
        parsed = 0
        for (s, p, o) in g:
            # Check the cap BEFORE counting so `parsed_triples` equals the number
            # of triples actually processed. (Previously the counter was bumped
            # first, over-reporting by one whenever truncation kicked in.)
            if self.max_triples is not None and parsed >= self.max_triples:
                break
            parsed += 1
            # Capture labels but do not emit them as edges.
            if p in LABEL_PREDICATES and isinstance(o, Literal):
                sid = self._get_id(s)
                # Keep only the first label seen per node.
                if sid is not None and sid not in self._labels_by_id:
                    self._labels_by_id[sid] = str(o)
                continue
            sid = self._get_id(s)
            oid = self._get_id(o)
            if sid is None or oid is None:
                continue
            self._edges.append(EdgeRow(source=sid, target=oid, predicate=str(p)))
        self._parsed_triples = parsed

    @property
    def parsed_triples(self) -> int:
        """Number of triples processed by the last ``load`` (capped by max_triples)."""
        return self._parsed_triples

    @property
    def node_count(self) -> int:
        """Total number of distinct nodes."""
        return len(self._term_by_id)

    @property
    def edge_count(self) -> int:
        """Total number of edges."""
        return len(self._edges)

    def node_slice(self, *, offset: int, limit: int) -> list[dict[str, Any]]:
        """Return up to *limit* node dicts starting at *offset* (id order)."""
        end = min(self.node_count, offset + limit)
        out: list[dict[str, Any]] = []
        for nid in range(offset, end):
            term = self._term_by_id[nid]
            out.append(
                {
                    "id": nid,
                    "termType": self._term_type(term),
                    "iri": self._term_iri(term),
                    "label": self._labels_by_id.get(nid),
                }
            )
        return out

    def edge_slice(self, *, offset: int, limit: int) -> list[dict[str, Any]]:
        """Return up to *limit* edge dicts starting at *offset* (insertion order)."""
        end = min(self.edge_count, offset + limit)
        out: list[dict[str, Any]] = []
        for row in self._edges[offset:end]:
            out.append(
                {
                    "source": row.source,
                    "target": row.target,
                    "predicate": row.predicate,
                }
            )
        return out

50
backend/app/settings.py Normal file
View File

@@ -0,0 +1,50 @@
from __future__ import annotations
from typing import Literal
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict
class Settings(BaseSettings):
    """Environment-driven configuration (reads ``.env``; unknown keys ignored)."""

    # Which graph engine executes SPARQL queries.
    # - rdflib: parse TTL locally and query in-memory
    # - anzograph: query a remote AnzoGraph SPARQL endpoint (optionally LOAD on startup)
    graph_backend: Literal["rdflib", "anzograph"] = Field(default="rdflib", alias="GRAPH_BACKEND")
    ttl_path: str = Field(default="/data/o3po.ttl", alias="TTL_PATH")
    include_bnodes: bool = Field(default=False, alias="INCLUDE_BNODES")
    max_triples: int | None = Field(default=None, alias="MAX_TRIPLES")
    # AnzoGraph / SPARQL endpoint configuration
    sparql_host: str = Field(default="http://anzograph:8080", alias="SPARQL_HOST")
    # If not set, the backend uses `${SPARQL_HOST}/sparql`.
    sparql_endpoint: str | None = Field(default=None, alias="SPARQL_ENDPOINT")
    sparql_user: str | None = Field(default=None, alias="SPARQL_USER")
    sparql_pass: str | None = Field(default=None, alias="SPARQL_PASS")
    # File URI as seen by the AnzoGraph container (used with SPARQL `LOAD`).
    # Example: file:///opt/shared-files/o3po.ttl
    sparql_data_file: str | None = Field(default=None, alias="SPARQL_DATA_FILE")
    sparql_graph_iri: str | None = Field(default=None, alias="SPARQL_GRAPH_IRI")
    sparql_load_on_start: bool = Field(default=False, alias="SPARQL_LOAD_ON_START")
    sparql_clear_on_start: bool = Field(default=False, alias="SPARQL_CLEAR_ON_START")
    sparql_timeout_s: float = Field(default=300.0, alias="SPARQL_TIMEOUT_S")
    sparql_ready_retries: int = Field(default=30, alias="SPARQL_READY_RETRIES")
    sparql_ready_delay_s: float = Field(default=4.0, alias="SPARQL_READY_DELAY_S")
    # Comma-separated, or "*" (default).
    cors_origins: str = Field(default="*", alias="CORS_ORIGINS")
    model_config = SettingsConfigDict(env_file=".env", extra="ignore")

    def cors_origin_list(self) -> list[str]:
        """Return the allowed CORS origins; "*" collapses to the wildcard list."""
        if self.cors_origins.strip() == "*":
            return ["*"]
        return [o.strip() for o in self.cors_origins.split(",") if o.strip()]

    def effective_sparql_endpoint(self) -> str:
        """Return the explicit SPARQL endpoint when set, else ``${SPARQL_HOST}/sparql``."""
        if self.sparql_endpoint and self.sparql_endpoint.strip():
            return self.sparql_endpoint.strip()
        return self.sparql_host.rstrip("/") + "/sparql"

155
backend/app/sparql_engine.py Normal file
View File

@@ -0,0 +1,155 @@
from __future__ import annotations
import asyncio
import base64
import json
from typing import Any, Protocol
import httpx
from rdflib import Graph
from .settings import Settings
class SparqlEngine(Protocol):
    """Structural interface every SPARQL backend must satisfy."""

    # Short backend identifier surfaced via the stats endpoint (e.g. "rdflib").
    name: str

    async def startup(self) -> None: ...
    async def shutdown(self) -> None: ...
    async def query_json(self, query: str) -> dict[str, Any]: ...
class RdflibEngine:
    """SPARQL engine that parses a Turtle file into an in-memory rdflib Graph."""

    name = "rdflib"

    def __init__(self, *, ttl_path: str):
        self.ttl_path = ttl_path
        self.graph: Graph | None = None

    async def startup(self) -> None:
        """Parse the Turtle file once; subsequent queries hit the in-memory graph."""
        parsed = Graph()
        parsed.parse(self.ttl_path, format="turtle")
        self.graph = parsed

    async def shutdown(self) -> None:
        # In-memory graph: nothing to release.
        return None

    async def query_json(self, query: str) -> dict[str, Any]:
        """Execute *query* and return SPARQL-results-JSON as a dict.

        Raises RuntimeError when called before ``startup``.
        """
        if self.graph is None:
            raise RuntimeError("RdflibEngine not started")
        raw = self.graph.query(query).serialize(format="json")
        text = raw.decode("utf-8") if isinstance(raw, bytes) else raw
        return json.loads(text)
class AnzoGraphEngine:
    """SPARQL engine backed by a remote AnzoGraph HTTP endpoint.

    On startup it opens an async HTTP client and, depending on settings, may
    CLEAR and/or LOAD the dataset, polling a trivial ASK query between steps
    until the endpoint answers.
    """

    name = "anzograph"

    def __init__(self, *, settings: Settings):
        self.endpoint = settings.effective_sparql_endpoint()
        self.timeout_s = settings.sparql_timeout_s
        self.ready_retries = settings.sparql_ready_retries
        self.ready_delay_s = settings.sparql_ready_delay_s
        self.user = settings.sparql_user
        self.password = settings.sparql_pass
        self.data_file = settings.sparql_data_file
        self.graph_iri = settings.sparql_graph_iri
        self.load_on_start = settings.sparql_load_on_start
        self.clear_on_start = settings.sparql_clear_on_start
        self._client: httpx.AsyncClient | None = None
        # Precomputed "Basic ..." header value, or None when credentials are absent.
        self._auth_header = self._build_auth_header(self.user, self.password)

    @staticmethod
    def _build_auth_header(user: str | None, password: str | None) -> str | None:
        """Return an HTTP Basic Authorization header value, or None without credentials."""
        if not user or not password:
            return None
        token = base64.b64encode(f"{user}:{password}".encode("utf-8")).decode("ascii")
        return f"Basic {token}"

    async def startup(self) -> None:
        """Open the HTTP client, then optionally CLEAR and LOAD the dataset."""
        self._client = httpx.AsyncClient(timeout=self.timeout_s)
        await self._wait_ready()
        if self.clear_on_start:
            await self._update("CLEAR ALL")
            await self._wait_ready()
        if self.load_on_start:
            if not self.data_file:
                raise RuntimeError("SPARQL_LOAD_ON_START=true but SPARQL_DATA_FILE is not set")
            if self.graph_iri:
                await self._update(f"LOAD <{self.data_file}> INTO GRAPH <{self.graph_iri}>")
            else:
                await self._update(f"LOAD <{self.data_file}>")
            # AnzoGraph may still be indexing after LOAD.
            await self._wait_ready()

    async def shutdown(self) -> None:
        """Close the HTTP client; safe to call when never started."""
        if self._client is not None:
            await self._client.aclose()
            self._client = None

    async def query_json(self, query: str) -> dict[str, Any]:
        """POST *query* to the endpoint and return parsed SPARQL-results-JSON.

        Raises RuntimeError before startup; HTTP failures raise via
        ``raise_for_status``.
        """
        if self._client is None:
            raise RuntimeError("AnzoGraphEngine not started")
        headers = {
            "Content-Type": "application/x-www-form-urlencoded",
            "Accept": "application/sparql-results+json",
        }
        if self._auth_header:
            headers["Authorization"] = self._auth_header
        # AnzoGraph expects x-www-form-urlencoded with `query=...`.
        resp = await self._client.post(
            self.endpoint,
            headers=headers,
            data={"query": query},
        )
        resp.raise_for_status()
        return resp.json()

    async def _update(self, update: str) -> None:
        """POST a SPARQL UPDATE as a raw ``application/sparql-update`` body."""
        if self._client is None:
            raise RuntimeError("AnzoGraphEngine not started")
        headers = {
            "Content-Type": "application/sparql-update",
            "Accept": "application/json",
        }
        if self._auth_header:
            headers["Authorization"] = self._auth_header
        resp = await self._client.post(self.endpoint, headers=headers, content=update)
        resp.raise_for_status()

    async def _wait_ready(self) -> None:
        """Poll the endpoint with a trivial ASK until it answers, else raise.

        Retries ``ready_retries`` times with ``ready_delay_s`` between attempts;
        the last failure is chained onto the final RuntimeError.
        """
        last_err: Exception | None = None
        for _ in range(self.ready_retries):
            try:
                # Keep it cheap and JSON-parseable.
                await self.query_json("ASK WHERE { ?s ?p ?o }")
                return
            except Exception as e:
                last_err = e
                await asyncio.sleep(self.ready_delay_s)
        raise RuntimeError(f"AnzoGraph not ready at {self.endpoint}") from last_err
def create_sparql_engine(settings: Settings) -> SparqlEngine:
    """Instantiate the SPARQL engine selected by ``settings.graph_backend``.

    Raises RuntimeError for an unrecognized backend value.
    """
    backend = settings.graph_backend
    if backend == "rdflib":
        return RdflibEngine(ttl_path=settings.ttl_path)
    if backend == "anzograph":
        return AnzoGraphEngine(settings=settings)
    raise RuntimeError(f"Unsupported GRAPH_BACKEND={settings.graph_backend!r}")