Import Solver + neighbors via sparql query

This commit is contained in:
Oxy8
2026-03-04 13:49:14 -03:00
parent d4bfa5f064
commit a75b5b93da
15 changed files with 747 additions and 463 deletions

View File

@@ -1,30 +0,0 @@
# Choose which engine executes SPARQL:
# - rdflib: parse TTL locally and query in-memory
# - anzograph: query AnzoGraph over HTTP (optionally LOAD the TTL on startup)
GRAPH_BACKEND=rdflib
# Backend (rdflib) file location inside the container.
# The TTL file must exist within the mounted ./data folder if you keep the default volume mount.
TTL_PATH=/data/o3po.ttl
# Backend behavior
INCLUDE_BNODES=false
# MAX_TRIPLES=1000000
# AnzoGraph / SPARQL endpoint settings (used when GRAPH_BACKEND=anzograph)
SPARQL_HOST=http://anzograph:8080
# SPARQL_ENDPOINT=http://anzograph:8080/sparql
SPARQL_USER=admin
SPARQL_PASS=Passw0rd1
# File URI as seen by the AnzoGraph container (used by SPARQL `LOAD`)
SPARQL_DATA_FILE=file:///opt/shared-files/o3po.ttl
# SPARQL_GRAPH_IRI=http://example.org/graph
# Startup behavior for AnzoGraph mode
SPARQL_LOAD_ON_START=false
SPARQL_CLEAR_ON_START=false
# Dev UX
CORS_ORIGINS=http://localhost:5173
VITE_BACKEND_URL=http://backend:8000

View File

@@ -32,6 +32,11 @@ Callers (frontend or other clients) interact with a single API surface (`/api/*`
- Used by `/api/nodes`, `/api/edges`, and `rdflib`-mode `/api/stats`.
- `pipelines/graph_snapshot.py`
- Pipeline used by `/api/graph` to return a `{nodes, edges}` snapshot via SPARQL (works for both RDFLib and AnzoGraph).
- `pipelines/layout_dag_radial.py`
- DAG layout helpers used by `pipelines/graph_snapshot.py`:
- cycle detection
- level-synchronous Kahn layering
- radial (ring-per-layer) positioning.
- `pipelines/snapshot_service.py`
- Snapshot cache layer used by `/api/graph` and `/api/stats` so the backend doesn't run expensive SPARQL twice.
- `pipelines/subclass_labels.py`
@@ -64,6 +69,14 @@ RDFLib mode:
- `TTL_PATH`: path inside the backend container to a `.ttl` file (example: `/data/o3po.ttl`)
- `MAX_TRIPLES`: optional int; if set, stops parsing after this many triples
Optional import-combining step (runs before the SPARQL engine starts):
- `COMBINE_OWL_IMPORTS_ON_START`: `true` to recursively load `TTL_PATH` (or `COMBINE_ENTRY_LOCATION`) plus `owl:imports` and write a combined TTL file.
- `COMBINE_ENTRY_LOCATION`: optional override for the entry file/URL to load (defaults to `TTL_PATH`)
- `COMBINE_OUTPUT_LOCATION`: optional explicit output path (defaults to `${dirname(entry)}/${COMBINE_OUTPUT_NAME}`)
- `COMBINE_OUTPUT_NAME`: output filename when `COMBINE_OUTPUT_LOCATION` is not set (default: `combined_ontology.ttl`)
- `COMBINE_FORCE`: `true` to rebuild even if the output file already exists
AnzoGraph mode:
- `SPARQL_HOST`: base host (example: `http://anzograph:8080`)
@@ -129,8 +142,8 @@ Returned in `nodes[]` (dense IDs; suitable for indexing in typed arrays):
- `id`: integer dense node ID used in edges
- `termType`: `"uri"` or `"bnode"`
- `iri`: URI string; blank nodes are normalized to `_:<id>`
- `label`: currently `null` in `/api/graph` snapshots (pipelines can be used to populate later)
- `x`/`y`: world-space coordinates for rendering (currently a deterministic spiral layout)
- `label`: `rdfs:label` when available (best-effort; prefers English)
- `x`/`y`: world-space coordinates for rendering (currently a radial layered layout derived from `rdfs:subClassOf`)
### Edge
@@ -149,11 +162,10 @@ Returned in `edges[]`:
## Snapshot Query (`/api/graph`)
`/api/graph` uses a SPARQL query that:
`/api/graph` currently uses a SPARQL query that returns only `rdfs:subClassOf` edges:
- selects triples `?s ?p ?o`
- excludes literal objects (`FILTER(!isLiteral(?o))`)
- excludes `rdfs:label`, `skos:prefLabel`, and `skos:altLabel` predicates
- selects bindings as `?s ?p ?o` (with `?p` bound to `rdfs:subClassOf`)
- excludes literal objects (`FILTER(!isLiteral(?o))`) for safety
- optionally excludes blank nodes (unless `INCLUDE_BNODES=true`)
- applies `LIMIT edge_limit`
@@ -161,6 +173,8 @@ The result bindings are mapped to dense node IDs (first-seen order) and returned
`/api/graph` also returns `meta` with snapshot counts and engine info so the frontend doesn't need to call `/api/stats`.
If a cycle is detected in the returned `rdfs:subClassOf` snapshot, `/api/graph` returns HTTP 422 (layout requires a DAG).
## Pipelines
### `pipelines/graph_snapshot.py`

View File

@@ -5,16 +5,25 @@ from typing import Any
def edge_retrieval_query(*, edge_limit: int, include_bnodes: bool) -> str:
bnode_filter = "" if include_bnodes else "FILTER(!isBlank(?s) && !isBlank(?o))"
return f"""
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT ?s ?p ?o
WHERE {{
?s ?p ?o .
{{
VALUES ?p {{ rdf:type }}
?s ?p ?o .
?o rdf:type owl:Class .
}}
UNION
{{
VALUES ?p {{ rdfs:subClassOf }}
?s ?p ?o .
}}
FILTER(!isLiteral(?o))
FILTER(?p NOT IN (
<http://www.w3.org/2000/01/rdf-schema#label>,
<http://www.w3.org/2004/02/skos/core#prefLabel>,
<http://www.w3.org/2004/02/skos/core#altLabel>
))
{bnode_filter}
}}
LIMIT {edge_limit}
@@ -91,4 +100,3 @@ def graph_from_sparql_bindings(
]
return out_nodes, out_edges

View File

@@ -1,11 +1,29 @@
from __future__ import annotations
from contextlib import asynccontextmanager
import logging
import asyncio
from fastapi import FastAPI, HTTPException, Query
from fastapi.middleware.cors import CORSMiddleware
from .models import EdgesResponse, GraphResponse, NodesResponse, SparqlQueryRequest, StatsResponse
from .models import (
EdgesResponse,
GraphResponse,
NeighborsRequest,
NeighborsResponse,
NodesResponse,
SparqlQueryRequest,
StatsResponse,
)
from .pipelines.layout_dag_radial import CycleError
from .pipelines.owl_imports_combiner import (
build_combined_graph,
output_location_to_path,
resolve_output_location,
serialize_graph_to_ttl,
)
from .pipelines.selection_neighbors import fetch_neighbor_ids_for_selection
from .pipelines.snapshot_service import GraphSnapshotService
from .rdf_store import RDFStore
from .sparql_engine import RdflibEngine, SparqlEngine, create_sparql_engine
@@ -13,11 +31,33 @@ from .settings import Settings
settings = Settings()
logger = logging.getLogger(__name__)
@asynccontextmanager
async def lifespan(app: FastAPI):
sparql: SparqlEngine = create_sparql_engine(settings)
rdflib_preloaded_graph = None
if settings.combine_owl_imports_on_start:
entry_location = settings.combine_entry_location or settings.ttl_path
output_location = resolve_output_location(
entry_location,
output_location=settings.combine_output_location,
output_name=settings.combine_output_name,
)
output_path = output_location_to_path(output_location)
if output_path.exists() and not settings.combine_force:
logger.info("Skipping combine step (output exists): %s", output_location)
else:
rdflib_preloaded_graph = await asyncio.to_thread(build_combined_graph, entry_location)
logger.info("Finished combining imports; serializing to: %s", output_location)
await asyncio.to_thread(serialize_graph_to_ttl, rdflib_preloaded_graph, output_location)
if settings.graph_backend == "rdflib":
settings.ttl_path = str(output_path)
sparql: SparqlEngine = create_sparql_engine(settings, rdflib_graph=rdflib_preloaded_graph)
await sparql.startup()
app.state.sparql = sparql
app.state.snapshot_service = GraphSnapshotService(sparql=sparql, settings=settings)
@@ -62,7 +102,10 @@ def health() -> dict[str, str]:
async def stats() -> StatsResponse:
# Stats reflect exactly what we send to the frontend (/api/graph), not global graph size.
svc: GraphSnapshotService = app.state.snapshot_service
snap = await svc.get(node_limit=50_000, edge_limit=100_000)
try:
snap = await svc.get(node_limit=50_000, edge_limit=100_000)
except CycleError as e:
raise HTTPException(status_code=422, detail=str(e)) from None
meta = snap.meta
return StatsResponse(
backend=meta.backend if meta else app.state.sparql.name,
@@ -81,6 +124,20 @@ async def sparql_query(req: SparqlQueryRequest) -> dict:
return data
@app.post("/api/neighbors", response_model=NeighborsResponse)
async def neighbors(req: NeighborsRequest) -> NeighborsResponse:
    """
    Return the snapshot node ids that are neighbors of the selected snapshot nodes.

    The snapshot is obtained from the snapshot service with the limits given in the
    request, then the neighbor lookup is delegated to `fetch_neighbor_ids_for_selection`.

    Raises:
        HTTPException(422): when building the snapshot raises `CycleError`.
    """
    svc: GraphSnapshotService = app.state.snapshot_service
    try:
        snap = await svc.get(node_limit=req.node_limit, edge_limit=req.edge_limit)
    except CycleError as e:
        # Keep the same contract as /api/graph and /api/stats: a cyclic snapshot is a
        # client-visible 422, not an unhandled 500.
        raise HTTPException(status_code=422, detail=str(e)) from None
    sparql: SparqlEngine = app.state.sparql
    neighbor_ids = await fetch_neighbor_ids_for_selection(
        sparql,
        snapshot=snap,
        selected_ids=req.selected_ids,
        include_bnodes=settings.include_bnodes,
    )
    return NeighborsResponse(selected_ids=req.selected_ids, neighbor_ids=neighbor_ids)
@app.get("/api/nodes", response_model=NodesResponse)
def nodes(
limit: int = Query(default=10_000, ge=1, le=200_000),
@@ -109,4 +166,7 @@ async def graph(
edge_limit: int = Query(default=100_000, ge=1, le=500_000),
) -> GraphResponse:
svc: GraphSnapshotService = app.state.snapshot_service
return await svc.get(node_limit=node_limit, edge_limit=edge_limit)
try:
return await svc.get(node_limit=node_limit, edge_limit=edge_limit)
except CycleError as e:
raise HTTPException(status_code=422, detail=str(e)) from None

View File

@@ -56,3 +56,14 @@ class GraphResponse(BaseModel):
class SparqlQueryRequest(BaseModel):
query: str
class NeighborsRequest(BaseModel):
    """Request body for `POST /api/neighbors`."""

    # Ids of the snapshot nodes whose neighbors are requested.
    selected_ids: list[int]
    # Limits forwarded to the snapshot service when the snapshot is fetched.
    # NOTE(review): unlike the `/api/graph` query parameters, these carry no
    # lower/upper bounds here -- confirm whether that is intended.
    node_limit: int = 50_000
    edge_limit: int = 100_000
class NeighborsResponse(BaseModel):
    """Response body for `POST /api/neighbors`."""

    # The `selected_ids` of the request, echoed back.
    selected_ids: list[int]
    # Snapshot node ids found to be neighbors of the selection.
    neighbor_ids: list[int]

View File

@@ -1,10 +1,64 @@
from __future__ import annotations
from typing import Any
from ..graph_export import edge_retrieval_query, graph_from_sparql_bindings
from ..models import GraphResponse
from ..sparql_engine import SparqlEngine
from ..settings import Settings
from .layout_spiral import spiral_positions
from .layout_dag_radial import CycleError, level_synchronous_kahn_layers, radial_positions_from_layers
RDFS_LABEL = "http://www.w3.org/2000/01/rdf-schema#label"
def _bindings(res: dict[str, Any]) -> list[dict[str, Any]]:
return (((res.get("results") or {}).get("bindings")) or [])
def _label_score(label_binding: dict[str, Any]) -> int:
# Prefer English, then no-language, then anything else.
lang = (label_binding.get("xml:lang") or "").lower()
if lang == "en":
return 3
if lang == "":
return 2
return 1
async def _fetch_rdfs_labels_for_iris(
sparql: SparqlEngine,
iris: list[str],
*,
batch_size: int = 500,
) -> dict[str, str]:
best: dict[str, tuple[int, str]] = {}
for i in range(0, len(iris), batch_size):
batch = iris[i : i + batch_size]
values = " ".join(f"<{u}>" for u in batch)
q = f"""
SELECT ?s ?label
WHERE {{
VALUES ?s {{ {values} }}
?s <{RDFS_LABEL}> ?label .
}}
"""
res = await sparql.query_json(q)
for b in _bindings(res):
s = (b.get("s") or {}).get("value")
label_term = b.get("label") or {}
if not s or label_term.get("type") != "literal":
continue
label_value = label_term.get("value")
if label_value is None:
continue
score = _label_score(label_term)
prev = best.get(s)
if prev is None or score > prev[0]:
best[s] = (score, str(label_value))
return {iri: lbl for iri, (_, lbl) in best.items()}
async def fetch_graph_snapshot(
@@ -28,11 +82,59 @@ async def fetch_graph_snapshot(
)
# Add positions so the frontend doesn't need to run a layout.
xs, ys = spiral_positions(len(nodes))
#
# We are exporting only rdfs:subClassOf triples. In the exported edges:
# source = subclass, target = superclass
# For hierarchical layout we invert edges to:
# superclass -> subclass
hier_edges: list[tuple[int, int]] = []
for e in edges:
s = e.get("source")
t = e.get("target")
try:
sid = int(s) # subclass
tid = int(t) # superclass
except Exception:
continue
hier_edges.append((tid, sid))
try:
layers = level_synchronous_kahn_layers(node_count=len(nodes), edges=hier_edges)
except CycleError as e:
# Add a small URI sample to aid debugging.
sample: list[str] = []
for nid in e.remaining_node_ids[:20]:
try:
sample.append(str(nodes[nid].get("iri")))
except Exception:
continue
raise CycleError(
processed=e.processed,
total=e.total,
remaining_node_ids=e.remaining_node_ids,
remaining_iri_sample=sample or None,
) from None
# Deterministic order within each ring/layer for stable layouts.
id_to_iri = [str(n.get("iri", "")) for n in nodes]
for layer in layers:
layer.sort(key=lambda nid: id_to_iri[nid])
xs, ys = radial_positions_from_layers(node_count=len(nodes), layers=layers)
for i, node in enumerate(nodes):
node["x"] = float(xs[i])
node["y"] = float(ys[i])
# Attach labels for URI nodes (blank nodes remain label-less).
uri_nodes = [n for n in nodes if n.get("termType") == "uri"]
if uri_nodes:
iris = [str(n["iri"]) for n in uri_nodes if isinstance(n.get("iri"), str)]
label_by_iri = await _fetch_rdfs_labels_for_iris(sparql, iris)
for n in uri_nodes:
iri = n.get("iri")
if isinstance(iri, str) and iri in label_by_iri:
n["label"] = label_by_iri[iri]
meta = GraphResponse.Meta(
backend=sparql.name,
ttl_path=settings.ttl_path if settings.graph_backend == "rdflib" else None,

View File

@@ -0,0 +1,141 @@
from __future__ import annotations
import math
from collections import deque
from typing import Iterable, Sequence
class CycleError(RuntimeError):
    """
    Signals that a DAG-only layout was requested for a graph that contains a cycle.

    Attributes:
        processed: number of nodes that were processed before getting stuck.
        total: total number of nodes in the graph.
        remaining_node_ids: ids of nodes whose indegree was still > 0 after Kahn.
        remaining_iri_sample: optional IRIs of some remaining nodes, appended to the message.
    """

    def __init__(
        self,
        *,
        processed: int,
        total: int,
        remaining_node_ids: list[int],
        remaining_iri_sample: list[str] | None = None,
    ) -> None:
        self.processed = int(processed)
        self.total = int(total)
        self.remaining_node_ids = remaining_node_ids
        self.remaining_iri_sample = remaining_iri_sample

        parts = [f"Cycle detected in subClassOf graph (processed {self.processed}/{self.total} nodes)."]
        if remaining_iri_sample:
            parts.append(f" Example nodes: {', '.join(remaining_iri_sample)}")
        super().__init__("".join(parts))
def level_synchronous_kahn_layers(
    *,
    node_count: int,
    edges: Iterable[tuple[int, int]],
) -> list[list[int]]:
    """
    Split a DAG into layers with a level-synchronous variant of Kahn's algorithm.

    Every node whose indegree is currently zero forms one layer; only after the
    whole layer has been handled are the nodes it unlocked collected into the next one.

    Args:
        node_count: number of nodes; valid ids are `0 .. node_count - 1`.
        edges: directed pairs `(u, v)` meaning u -> v. Self-loops and edges with
            an endpoint outside the valid id range are ignored.

    Returns:
        The layers in processing order (an empty list when `node_count <= 0`).

    Raises:
        CycleError: if some nodes can never reach indegree zero.
    """
    total = int(node_count)
    if total <= 0:
        return []

    successors: list[list[int]] = [[] for _ in range(total)]
    in_degree = [0] * total
    for src, dst in edges:
        if src == dst:
            # A self-loop carries no layering information and would break DAG-ness.
            continue
        if src < 0 or src >= total or dst < 0 or dst >= total:
            continue
        successors[src].append(dst)
        in_degree[dst] += 1

    frontier = [node for node, degree in enumerate(in_degree) if degree == 0]
    layers: list[list[int]] = []
    done = 0

    while frontier:
        layers.append(frontier)
        done += len(frontier)
        unlocked: list[int] = []
        for src in frontier:
            for dst in successors[src]:
                in_degree[dst] -= 1
                if in_degree[dst] == 0:
                    unlocked.append(dst)
        # The newly unlocked nodes become the next layer as a whole.
        frontier = unlocked

    if done != total:
        stuck = [node for node, degree in enumerate(in_degree) if degree > 0]
        raise CycleError(processed=done, total=total, remaining_node_ids=stuck)

    return layers
def radial_positions_from_layers(
    *,
    node_count: int,
    layers: Sequence[Sequence[int]],
    max_r: float = 5000.0,
) -> tuple[list[float], list[float]]:
    """
    Place nodes on concentric rings, one ring per layer.

    Ring radius grows with the layer index (layer `i` of `L` layers sits at
    `(i + 1) / (L + 1) * max_r`), nodes of a layer are spread evenly by angle,
    and each ring is rotated by a multiple of the golden angle so that nodes of
    consecutive rings do not line up as spokes.

    Args:
        node_count: number of nodes; valid ids are `0 .. node_count - 1`.
        layers: node ids per layer; ids outside the valid range are ignored.
        max_r: upper bound used to scale the ring radii.

    Returns:
        `(xs, ys)` indexed by node id. Nodes that appear in no layer stay at (0.0, 0.0).
    """
    total = int(node_count)
    if total <= 0:
        return ([], [])

    xs = [0.0] * total
    ys = [0.0] * total
    if not layers:
        return (xs, ys)

    full_turn = 2.0 * math.pi
    golden_angle = math.pi * (3.0 - math.sqrt(5.0))
    ring_divisor = float(len(layers) + 1)

    for ring_index, ring in enumerate(layers):
        size = len(ring)
        if size <= 0:
            continue

        # Radii stay inside roughly [-max_r, max_r].
        radius = ((ring_index + 1) / ring_divisor) * max_r
        # Deterministic per-ring rotation.
        rotation = (ring_index * golden_angle) % full_turn
        spacing = full_turn / float(size)

        # A single-node ring needs no special case: slot 0 lands exactly on `rotation`.
        for slot, raw_id in enumerate(ring):
            node_id = int(raw_id)
            if node_id < 0 or node_id >= total:
                continue
            angle = rotation + spacing * float(slot)
            xs[node_id] = radius * math.cos(angle)
            ys[node_id] = radius * math.sin(angle)

    return (xs, ys)

View File

@@ -0,0 +1,96 @@
from __future__ import annotations
import logging
import os
from pathlib import Path
from urllib.parse import unquote, urlparse
from rdflib import Graph
from rdflib.namespace import OWL
logger = logging.getLogger(__name__)
def _is_http_url(location: str) -> bool:
scheme = urlparse(location).scheme.lower()
return scheme in {"http", "https"}
def _is_file_uri(location: str) -> bool:
return urlparse(location).scheme.lower() == "file"
def _file_uri_to_path(location: str) -> Path:
u = urlparse(location)
if u.scheme.lower() != "file":
raise ValueError(f"Not a file:// URI: {location!r}")
return Path(unquote(u.path))
def resolve_output_location(
    entry_location: str,
    *,
    output_location: str | None,
    output_name: str,
) -> str:
    """
    Decide where the combined ontology is written.

    An explicit `output_location` is returned unchanged. Otherwise the result is
    `output_name` placed in the directory of `entry_location`, which may be a
    plain path or a `file://` URI.

    Raises:
        ValueError: if no explicit output is given and the entry is an http(s) URL.
    """
    if output_location:
        return output_location

    if _is_http_url(entry_location):
        raise ValueError(
            "COMBINE_ENTRY_LOCATION points to an http(s) URL; set COMBINE_OUTPUT_LOCATION to a writable file path."
        )

    if _is_file_uri(entry_location):
        entry_path = _file_uri_to_path(entry_location)
    else:
        entry_path = Path(entry_location)
    return str(entry_path.parent / output_name)
def _output_destination_to_path(output_location: str) -> Path:
    """
    Turn an output location (plain path or `file://` URI) into a `Path`.

    Raises:
        ValueError: if `output_location` is an http(s) URL.
    """
    if _is_file_uri(output_location):
        return _file_uri_to_path(output_location)
    if not _is_http_url(output_location):
        return Path(output_location)
    raise ValueError("Output location must be a local file path (or file:// URI), not http(s).")
def output_location_to_path(output_location: str) -> Path:
    """Public wrapper around `_output_destination_to_path`; returns the same `Path` and raises the same errors."""
    resolved = _output_destination_to_path(output_location)
    return resolved
def build_combined_graph(entry_location: str) -> Graph:
    """
    Load `entry_location` and, recursively, every document reachable through
    `owl:imports` into one in-memory graph.

    Each location is attempted at most once. A location that fails to parse is
    logged as a warning and skipped; loading continues with the remaining ones.
    """
    merged = Graph()
    seen: set[str] = set()

    def load_with_imports(location: str) -> None:
        if location in seen:
            return
        seen.add(location)

        logger.info("Loading ontology: %s", location)
        try:
            merged.parse(location=location)
        except Exception as e:
            logger.warning("Failed to load %s (%s)", location, e)
            return

        # Snapshot the import targets first: the graph grows while we recurse.
        targets = [str(obj) for _subj, _pred, obj in merged.triples((None, OWL.imports, None))]
        for target in targets:
            # Already-visited targets return immediately inside the call.
            load_with_imports(target)

    load_with_imports(entry_location)
    return merged
def serialize_graph_to_ttl(graph: Graph, output_location: str) -> None:
    """
    Write `graph` as Turtle to `output_location` (plain path or `file://` URI).

    The data is serialized to a sibling `<name>.tmp` file first and then moved
    over the destination with `os.replace`, so readers never observe a
    half-written output file. Missing parent directories are created.

    Raises:
        ValueError: if `output_location` is an http(s) URL.
        Exception: whatever serialization or the final replace raises; in that
            case the temporary file is removed before the error propagates.
    """
    output_path = _output_destination_to_path(output_location)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = output_path.with_suffix(output_path.suffix + ".tmp")
    try:
        graph.serialize(destination=str(tmp_path), format="turtle")
        os.replace(str(tmp_path), str(output_path))
    except BaseException:
        # Do not leave a stale/partial temp file next to the real output.
        try:
            tmp_path.unlink()
        except OSError:
            pass
        raise

View File

@@ -0,0 +1,137 @@
from __future__ import annotations
from typing import Any, Iterable
from ..models import GraphResponse, Node
from ..sparql_engine import SparqlEngine
def _values_term(node: Node) -> str | None:
iri = node.iri
if node.termType == "uri":
return f"<{iri}>"
if node.termType == "bnode":
if iri.startswith("_:"):
return iri
return f"_:{iri}"
return None
def selection_neighbors_query(*, selected_nodes: Iterable[Node], include_bnodes: bool) -> str:
values_terms: list[str] = []
for n in selected_nodes:
t = _values_term(n)
if t is None:
continue
values_terms.append(t)
if not values_terms:
# Caller should avoid running this query when selection is empty, but keep this safe.
return "SELECT ?nbr WHERE { FILTER(false) }"
bnode_filter = "" if include_bnodes else "FILTER(!isBlank(?nbr))"
values = " ".join(values_terms)
# Neighbors are defined as any node directly connected by rdf:type (to owl:Class)
# or rdfs:subClassOf, in either direction (treating edges as undirected).
return f"""
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT DISTINCT ?nbr
WHERE {{
VALUES ?sel {{ {values} }}
{{
?sel rdf:type ?o .
?o rdf:type owl:Class .
BIND(?o AS ?nbr)
}}
UNION
{{
?s rdf:type ?sel .
?sel rdf:type owl:Class .
BIND(?s AS ?nbr)
}}
UNION
{{
?sel rdfs:subClassOf ?o .
BIND(?o AS ?nbr)
}}
UNION
{{
?s rdfs:subClassOf ?sel .
BIND(?s AS ?nbr)
}}
FILTER(!isLiteral(?nbr))
FILTER(?nbr != ?sel)
{bnode_filter}
}}
"""
def _bindings(res: dict[str, Any]) -> list[dict[str, Any]]:
return (((res.get("results") or {}).get("bindings")) or [])
def _term_key(term: dict[str, Any], *, include_bnodes: bool) -> tuple[str, str] | None:
t = term.get("type")
v = term.get("value")
if not t or v is None:
return None
if t == "literal":
return None
if t == "bnode":
if not include_bnodes:
return None
return ("bnode", f"_:{v}")
return ("uri", str(v))
async def fetch_neighbor_ids_for_selection(
    sparql: SparqlEngine,
    *,
    snapshot: GraphResponse,
    selected_ids: list[int],
    include_bnodes: bool,
) -> list[int]:
    """
    Resolve the neighbors of the selected snapshot nodes to snapshot node ids.

    Selected ids that are not ints, are unknown to the snapshot, or refer to
    blank nodes while `include_bnodes` is false are dropped. If nothing remains,
    no query is run and `[]` is returned. Otherwise one neighbor query is sent
    to `sparql`, and each returned term is mapped back to a snapshot id through
    its `(termType, iri)` key; terms that are not in the snapshot, and nodes
    that are themselves selected, are left out.

    Returns:
        The neighbor ids in ascending order.
    """
    nodes_by_id: dict[int, Node] = {node.id: node for node in snapshot.nodes}

    chosen: list[Node] = []
    chosen_ids: set[int] = set()
    for candidate in selected_ids:
        node = nodes_by_id.get(candidate) if isinstance(candidate, int) else None
        if node is None:
            continue
        if not include_bnodes and node.termType == "bnode":
            continue
        chosen.append(node)
        chosen_ids.add(candidate)

    if not chosen:
        return []

    id_by_key: dict[tuple[str, str], int] = {
        (node.termType, node.iri): node.id for node in snapshot.nodes
    }

    query = selection_neighbors_query(selected_nodes=chosen, include_bnodes=include_bnodes)
    response = await sparql.query_json(query)

    found: set[int] = set()
    for row in _bindings(response):
        key = _term_key(row.get("nbr") or {}, include_bnodes=include_bnodes)
        if key is None:
            continue
        match = id_by_key.get(key)
        if match is None or match in chosen_ids:
            continue
        found.add(match)

    # Sorted so the frontend sees a stable order.
    return sorted(found)

View File

@@ -16,6 +16,13 @@ class Settings(BaseSettings):
include_bnodes: bool = Field(default=False, alias="INCLUDE_BNODES")
max_triples: int | None = Field(default=None, alias="MAX_TRIPLES")
# Optional: Combine owl:imports into a single TTL file on backend startup.
combine_owl_imports_on_start: bool = Field(default=False, alias="COMBINE_OWL_IMPORTS_ON_START")
combine_entry_location: str | None = Field(default=None, alias="COMBINE_ENTRY_LOCATION")
combine_output_location: str | None = Field(default=None, alias="COMBINE_OUTPUT_LOCATION")
combine_output_name: str = Field(default="combined_ontology.ttl", alias="COMBINE_OUTPUT_NAME")
combine_force: bool = Field(default=False, alias="COMBINE_FORCE")
# AnzoGraph / SPARQL endpoint configuration
sparql_host: str = Field(default="http://anzograph:8080", alias="SPARQL_HOST")
# If not set, the backend uses `${SPARQL_HOST}/sparql`.

View File

@@ -24,11 +24,13 @@ class SparqlEngine(Protocol):
class RdflibEngine:
name = "rdflib"
def __init__(self, *, ttl_path: str):
def __init__(self, *, ttl_path: str, graph: Graph | None = None):
self.ttl_path = ttl_path
self.graph: Graph | None = None
self.graph: Graph | None = graph
async def startup(self) -> None:
if self.graph is not None:
return
g = Graph()
g.parse(self.ttl_path, format="turtle")
self.graph = g
@@ -167,9 +169,9 @@ class AnzoGraphEngine:
raise RuntimeError(f"AnzoGraph not ready at {self.endpoint}") from last_err
def create_sparql_engine(settings: Settings) -> SparqlEngine:
def create_sparql_engine(settings: Settings, *, rdflib_graph: Graph | None = None) -> SparqlEngine:
if settings.graph_backend == "rdflib":
return RdflibEngine(ttl_path=settings.ttl_path)
return RdflibEngine(ttl_path=settings.ttl_path, graph=rdflib_graph)
if settings.graph_backend == "anzograph":
return AnzoGraphEngine(settings=settings)
raise RuntimeError(f"Unsupported GRAPH_BACKEND={settings.graph_backend!r}")

View File

@@ -21,9 +21,14 @@ services:
- SPARQL_READY_RETRIES=${SPARQL_READY_RETRIES:-30}
- SPARQL_READY_DELAY_S=${SPARQL_READY_DELAY_S:-4}
- SPARQL_READY_TIMEOUT_S=${SPARQL_READY_TIMEOUT_S:-10}
- COMBINE_OWL_IMPORTS_ON_START=${COMBINE_OWL_IMPORTS_ON_START:-false}
- COMBINE_ENTRY_LOCATION
- COMBINE_OUTPUT_LOCATION
- COMBINE_OUTPUT_NAME
- COMBINE_FORCE=${COMBINE_FORCE:-false}
volumes:
- ./backend:/app
- ./data:/data:ro
- ./data:/data:Z
command: uvicorn app.main:app --host 0.0.0.0 --port 8000 --reload
healthcheck:
test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/api/health').read()"]

View File

@@ -1,371 +0,0 @@
# Waiting for AnzoGraph readiness from Julia (how this repo does it)
This repo runs a Julia pipeline (`julia/main.jl`) against an AnzoGraph SPARQL endpoint. The key problem is that **“container started” ≠ “SPARQL endpoint is ready to accept queries”**.
So, before the Julia code does anything that depends on SPARQL (like `LOAD <...>` or large `SELECT`s), it explicitly **waits until AnzoGraph is actually responding to a real SPARQL POST request with valid JSON results**.
This document explains the exact mechanism used here, why it works, and gives copy/paste-ready patterns you can transfer to another project.
---
## 1) Where the waiting happens (pipeline control flow)
In `julia/main.jl`, the entrypoint calls:
```julia
# Step 1: Wait for AnzoGraph
wait_for_anzograph()
# Step 2: Load TTL file
result = sparql_update("LOAD <$SPARQL_DATA_FILE>")
```
So the “await” is not a Julia `Task`/`async` wait; it is a **blocking retry loop** that only returns when it can successfully execute a small SPARQL query.
Reference: `julia/main.jl` defines `wait_for_anzograph()` and calls it from `main()`.
---
## 2) Why this is needed even with Docker Compose `depends_on`
This repo's `docker-compose.yml` includes an AnzoGraph `healthcheck`:
```yaml
anzograph:
healthcheck:
test: ["CMD-SHELL", "curl -f http://localhost:8080/sparql || exit 1"]
interval: 10s
timeout: 5s
retries: 30
start_period: 60s
```
However, `julia-layout` currently depends on `anzograph` with:
```yaml
depends_on:
anzograph:
condition: service_started
```
Meaning:
- Compose will ensure the **container process has started**.
- Compose does **not** guarantee the AnzoGraph HTTP/SPARQL endpoint is ready (unless you use `service_healthy`, and even then a “healthy GET” is not always equivalent to “SPARQL POST works with auth + JSON”).
So the Julia code includes its own readiness gate to prevent failures like:
- TCP connection refused (port not open yet)
- HTTP endpoint reachable but not fully initialized
- Non-JSON/HTML error responses while the service is still booting
---
## 3) What “ready” means in this repo
In this repo, “AnzoGraph is ready” means:
1. An HTTP `POST` to `${SPARQL_HOST}/sparql` succeeds, with headers:
- `Content-Type: application/x-www-form-urlencoded`
- `Accept: application/sparql-results+json`
- `Authorization: Basic ...`
2. The body parses as SPARQL JSON results (`application/sparql-results+json`)
It does **not** strictly mean:
- Your dataset is already loaded
- The loaded data is fully indexed (that can matter in some systems after `LOAD`)
This repo uses readiness as a **“SPARQL endpoint is alive and speaking the protocol”** check.
---
## 4) The actual Julia implementation (as in `julia/main.jl`)
### 4.1 Configuration (endpoint + auth)
The Julia script builds endpoint and auth from environment variables:
```julia
const SPARQL_HOST = get(ENV, "SPARQL_HOST", "http://localhost:8080")
const SPARQL_ENDPOINT = "$SPARQL_HOST/sparql"
const SPARQL_USER = get(ENV, "SPARQL_USER", "admin")
const SPARQL_PASS = get(ENV, "SPARQL_PASS", "Passw0rd1")
const AUTH_HEADER = "Basic " * base64encode("$SPARQL_USER:$SPARQL_PASS")
```
In Docker Compose for this repo, the Julia container overrides `SPARQL_HOST` to use the service DNS name:
```yaml
environment:
- SPARQL_HOST=http://anzograph:8080
```
### 4.2 The smoke query used for readiness
This is the query used in the wait loop:
```julia
const SMOKE_TEST_QUERY = "SELECT ?s ?p ?o WHERE { ?s ?p ?o } LIMIT 3"
```
Notes:
- It's intentionally small (`LIMIT 3`) to keep the readiness check cheap.
- It returns *some* bindings when data exists, but **even an empty dataset can still return a valid empty result set**. The code treats “valid response” as ready.
If you want a readiness check that does not depend on any data being present, an `ASK` query is also common:
```sparql
ASK WHERE { ?s ?p ?o }
```
### 4.3 SPARQL query function (request + minimal retry)
`sparql_query(query; retries=...)` is a generic helper that makes SPARQL POST requests:
```julia
function sparql_query(query::String; retries::Int=5)::SparqlResult
for attempt in 1:retries
try
response = HTTP.post(
SPARQL_ENDPOINT,
[
"Content-Type" => "application/x-www-form-urlencoded",
"Accept" => "application/sparql-results+json",
"Authorization" => AUTH_HEADER
];
body = "query=" * HTTP.URIs.escapeuri(query)
)
if response.status == 200
json = JSON.parse(String(response.body))
return SparqlResult(json["results"]["bindings"])
elseif response.status >= 500 && attempt < retries
sleep(10)
continue
else
error("SPARQL query failed with status $(response.status)")
end
catch e
if attempt < retries
sleep(10)
continue
end
rethrow(e)
end
end
error("SPARQL query failed after $retries attempts")
end
```
Important behaviors to preserve when transferring:
- It uses **POST** (not GET) to the SPARQL endpoint.
- It requires a **200** response and successfully parses SPARQL JSON results.
- It retries on:
- `>= 500` server errors
- network / protocol / parsing errors (caught exceptions)
### 4.4 The readiness gate: `wait_for_anzograph`
This is the “await until ready” logic:
```julia
function wait_for_anzograph(max_retries::Int=30)::Bool
println("Waiting for AnzoGraph at $SPARQL_ENDPOINT...")
for attempt in 1:max_retries
try
smoke_result = sparql_query(SMOKE_TEST_QUERY; retries=1)
println(" AnzoGraph is ready (attempt $attempt, smoke rows=$(length(smoke_result.bindings)))")
return true
catch e
println(" Attempt $attempt/$max_retries: $(typeof(e))")
sleep(4)
end
end
error("AnzoGraph not available after $max_retries attempts")
end
```
Why it calls `sparql_query(...; retries=1)`:
- It makes each outer “readiness attempt” a **single** request.
- The outer loop controls cadence (`sleep(4)`) and total wait time.
- This avoids “nested retry loops” (inner sleeps + outer sleeps) that can make waits much longer than intended.
Time bound in the current implementation:
- `max_retries = 30`
- `sleep(4)` between attempts
- Roughly ~120 seconds of waiting (plus request time).
---
## 5) What failures cause it to keep waiting
`wait_for_anzograph()` catches any exception thrown by `sparql_query()` and retries. In practice, that includes:
- **Connection errors** (DNS not ready, connection refused, etc.)
- **Timeouts** (if HTTP request takes too long and the library throws)
- **Non-200 HTTP statuses** that cause `error(...)`
- **Non-JSON / unexpected JSON** responses causing `JSON.parse(...)` to throw
That last point is a big reason a “real SPARQL request + parse” is stronger than just “ping the port”.
---
## 6) Transferable, self-contained version (recommended pattern)
If you want to reuse this in another project, it's usually easier to:
- avoid globals,
- make endpoint/auth explicit,
- use a **time-based timeout** instead of `max_retries` (more robust),
- add request timeouts so the wait loop can't hang forever on a single request.
Below is a drop-in module you can copy into your project.
```julia
module AnzoGraphReady

using HTTP
using JSON
using Base64
using Dates

"""
Rows of a SPARQL JSON result: the contents of `results.bindings`.

The vector is empty when the response has no `results.bindings` member.
"""
struct SparqlResult
    bindings::Vector{Dict{String, Any}}
end

"""
Build the value of an HTTP `Authorization` header for Basic authentication.
"""
function basic_auth_header(user::AbstractString, pass::AbstractString)::String
    return "Basic " * base64encode("$(user):$(pass)")
end

"""
POST `query` to `endpoint` as a form-encoded SPARQL query and parse the JSON reply.

Arguments:
- `endpoint`: full URL of the SPARQL endpoint.
- `auth_header`: value sent in the `Authorization` header (see `basic_auth_header`).
- `query`: SPARQL query text.
- `retries`: total number of attempts (must be >= 1).
- `retry_sleep_s`: pause between failed attempts, in seconds.
- `request_timeout_s`: read timeout per attempt, in seconds (rounded up to whole seconds).

Returns a `SparqlResult`. The exception of the last failed attempt is rethrown.
"""
function sparql_query(
    endpoint::AbstractString,
    auth_header::AbstractString,
    query::AbstractString;
    retries::Int = 1,
    retry_sleep_s::Real = 2,
    request_timeout_s::Real = 15,
)::SparqlResult
    # HTTP.jl documents `readtimeout` in whole seconds; round up so that a
    # fractional value such as `10.0` is accepted instead of failing every request.
    read_timeout = ceil(Int, request_timeout_s)

    for attempt in 1:retries
        try
            response = HTTP.post(
                String(endpoint),
                [
                    "Content-Type" => "application/x-www-form-urlencoded",
                    "Accept" => "application/sparql-results+json",
                    "Authorization" => auth_header,
                ];
                body = "query=" * HTTP.URIs.escapeuri(String(query)),
                readtimeout = read_timeout,
            )
            if response.status != 200
                error("SPARQL query failed with status $(response.status)")
            end
            parsed = JSON.parse(String(response.body))
            bindings = get(get(parsed, "results", Dict()), "bindings", Any[])
            return SparqlResult(Vector{Dict{String, Any}}(bindings))
        catch e
            # Never turn Ctrl-C into "just another failed attempt".
            e isa InterruptException && rethrow()
            # Last attempt: surface the real failure to the caller.
            attempt < retries || rethrow()
            sleep(retry_sleep_s)
        end
    end
    # Only reachable when the loop body never ran.
    error("sparql_query: retries must be >= 1 (got $(retries))")
end

"""
Wait until AnzoGraph responds to a real SPARQL POST with parseable JSON.

This is the direct analog of this repo's `wait_for_anzograph()`, but with:
- a time-based timeout (`timeout`)
- a request timeout per attempt (`request_timeout_s`)
- simple exponential backoff (`initial_delay_s`, growing by 1.5x up to `max_delay_s`)

Returns `nothing` on success and throws an error (including the last failure
seen) when the endpoint is still not answering once `timeout` has elapsed.
"""
function wait_for_anzograph(
    endpoint::AbstractString,
    auth_header::AbstractString;
    timeout::Period = Minute(3),
    initial_delay_s::Real = 0.5,
    max_delay_s::Real = 5.0,
    request_timeout_s::Real = 10.0,
    query::AbstractString = "ASK WHERE { ?s ?p ?o }",
)::Nothing
    deadline = now() + timeout
    delay_s = initial_delay_s
    last_error = nothing

    while now() < deadline
        try
            # A single attempt: if it succeeds, we declare "ready".
            sparql_query(
                endpoint,
                auth_header,
                query;
                retries = 1,
                request_timeout_s = request_timeout_s,
            )
            return nothing
        catch e
            e isa InterruptException && rethrow()
            last_error = e
        end

        # Do not sleep past the deadline; the overall wait should honor `timeout`.
        remaining_s = Dates.value(deadline - now()) / 1000
        remaining_s > 0 || break
        sleep(min(delay_s, remaining_s))
        delay_s = min(max_delay_s, delay_s * 1.5)
    end

    detail = last_error === nothing ? "" : " (last error: $(sprint(showerror, last_error)))"
    error("AnzoGraph not available before timeout=$(timeout)$(detail)")
end

end # module
```
Typical usage (matching this repo's environment variables):
```julia
using Dates  # provides `Minute`; AnzoGraphReady does not export it
using .AnzoGraphReady

# Connection settings come from the environment, with local-development fallbacks.
sparql_host = get(ENV, "SPARQL_HOST", "http://localhost:8080")
endpoint = "$(sparql_host)/sparql"
user = get(ENV, "SPARQL_USER", "admin")
pass = get(ENV, "SPARQL_PASS", "Passw0rd1")
auth = AnzoGraphReady.basic_auth_header(user, pass)

# Blocks until the endpoint answers a real SPARQL request, or throws after 5 minutes.
AnzoGraphReady.wait_for_anzograph(endpoint, auth; timeout=Minute(5))
# Now it is safe to LOAD / query.
```
---
## 7) Optional: waiting for “data is ready” after `LOAD`
Some systems accept `LOAD` but need time before results show up reliably (indexing / transaction visibility).
If you run into that in your other project, add a second gate after `LOAD`, for example:
1) load, then
2) poll a query that must be true after load (e.g., “triple count > 0”, or a known IRI exists).
Example “post-load gate”:
```julia
post_load_query = """
SELECT (COUNT(*) AS ?n)
WHERE { ?s ?p ?o }
"""

# Poll until the store reports at least one triple; give up after `attempts` tries.
# Wrapped in a function so the loop does not depend on top-level scoping rules.
function wait_for_data(endpoint, auth; attempts::Int = 30, delay_s::Real = 2)
    for _ in 1:attempts
        res = AnzoGraphReady.sparql_query(endpoint, auth, post_load_query; retries=1)
        # SPARQL JSON results carry every value as a string under "value".
        n = isempty(res.bindings) ? 0 : parse(Int, res.bindings[1]["n"]["value"])
        n > 0 && return n
        sleep(delay_s)
    end
    error("No triples visible after LOAD ($(attempts) attempts)")
end

wait_for_data(endpoint, auth)
```
(This repo does not currently enforce “non-empty”; it only enforces “SPARQL is working”.)
---
## 8) Practical checklist when transferring to another project
- Make readiness checks hit the **real SPARQL POST** path you will use in production.
- Require a **valid JSON parse**, not just “port open”.
- Add **per-request timeouts**, so a single hung request cannot hang the whole pipeline.
- Prefer **time-based overall timeout** for predictable behavior in CI.
- Keep the query **cheap** (`ASK` or `LIMIT 1/3`).
- If you use Docker Compose healthchecks, consider also using `depends_on: condition: service_healthy`, but still keep the in-app wait as a safety net (it's closer to the real contract your code needs).

View File

@@ -5,6 +5,17 @@ function sleep(ms: number): Promise<void> {
return new Promise((r) => setTimeout(r, ms));
}
/**
 * Metadata object read from `graph.meta` on the graph response.
 * Every field is optional, so a partial or missing object is acceptable.
 */
interface GraphMeta {
  /** Name of the backend that served the graph (presumably the SPARQL engine in use). */
  backend?: string;
  ttl_path?: string | null;
  sparql_endpoint?: string | null;
  include_bnodes?: boolean;
  /** Forwarded unchanged as `node_limit` in the `/api/neighbors` request body. */
  node_limit?: number;
  /** Forwarded unchanged as `edge_limit` in the `/api/neighbors` request body. */
  edge_limit?: number;
  nodes?: number;
  edges?: number;
}
export default function App() {
const canvasRef = useRef<HTMLCanvasElement>(null);
const rendererRef = useRef<Renderer | null>(null);
@@ -18,12 +29,15 @@ export default function App() {
ptSize: 0,
});
const [error, setError] = useState("");
const [hoveredNode, setHoveredNode] = useState<{ x: number; y: number; screenX: number; screenY: number } | null>(null);
const [hoveredNode, setHoveredNode] = useState<{ x: number; y: number; screenX: number; screenY: number; label?: string; iri?: string } | null>(null);
const [selectedNodes, setSelectedNodes] = useState<Set<number>>(new Set());
const [backendStats, setBackendStats] = useState<{ nodes: number; edges: number; backend?: string } | null>(null);
const graphMetaRef = useRef<GraphMeta | null>(null);
const neighborsReqIdRef = useRef(0);
// Store mouse position in a ref so it can be accessed in render loop without re-renders
const mousePos = useRef({ x: 0, y: 0 });
const nodesRef = useRef<any[]>([]);
useEffect(() => {
const canvas = canvasRef.current;
@@ -70,6 +84,9 @@ export default function App() {
const meta = graph.meta || null;
const count = nodes.length;
nodesRef.current = nodes;
graphMetaRef.current = meta && typeof meta === "object" ? (meta as GraphMeta) : null;
// Build positions from backend-provided node coordinates.
setStatus("Preparing buffers…");
const xs = new Float32Array(count);
@@ -196,9 +213,18 @@ export default function App() {
frameCount++;
// Find hovered node using quadtree
const node = renderer.findNodeAt(mousePos.current.x, mousePos.current.y);
if (node) {
setHoveredNode({ ...node, screenX: mousePos.current.x, screenY: mousePos.current.y });
const hit = renderer.findNodeIndexAt(mousePos.current.x, mousePos.current.y);
if (hit) {
const origIdx = renderer.sortedIndexToOriginalIndex(hit.index);
const meta = origIdx === null ? null : nodesRef.current[origIdx];
setHoveredNode({
x: hit.x,
y: hit.y,
screenX: mousePos.current.x,
screenY: mousePos.current.y,
label: meta && typeof meta.label === "string" ? meta.label : undefined,
iri: meta && typeof meta.iri === "string" ? meta.iri : undefined,
});
} else {
setHoveredNode(null);
}
@@ -234,9 +260,72 @@ export default function App() {
// Sync selection state to renderer
useEffect(() => {
if (rendererRef.current) {
rendererRef.current.updateSelection(selectedNodes);
const renderer = rendererRef.current;
if (!renderer) return;
// Optimistically reflect selection immediately; neighbors will be filled in by backend.
renderer.updateSelection(selectedNodes, new Set());
// Invalidate any in-flight neighbor request for the previous selection.
const reqId = ++neighborsReqIdRef.current;
// Convert selected sorted indices to backend node IDs (graph-export dense IDs).
const selectedIds: number[] = [];
for (const sortedIdx of selectedNodes) {
const origIdx = renderer.sortedIndexToOriginalIndex(sortedIdx);
if (origIdx === null) continue;
const nodeId = nodesRef.current?.[origIdx]?.id;
if (typeof nodeId === "number") selectedIds.push(nodeId);
}
if (selectedIds.length === 0) {
return;
}
// Always send the full current selection list; backend returns the merged neighbor set.
const ctrl = new AbortController();
(async () => {
try {
const meta = graphMetaRef.current;
const body = {
selected_ids: selectedIds,
node_limit: typeof meta?.node_limit === "number" ? meta.node_limit : undefined,
edge_limit: typeof meta?.edge_limit === "number" ? meta.edge_limit : undefined,
};
const res = await fetch("/api/neighbors", {
method: "POST",
headers: { "content-type": "application/json" },
body: JSON.stringify(body),
signal: ctrl.signal,
});
if (!res.ok) throw new Error(`POST /api/neighbors failed: ${res.status}`);
const data = await res.json();
if (ctrl.signal.aborted) return;
if (reqId !== neighborsReqIdRef.current) return;
const neighborIds: unknown = data?.neighbor_ids;
const neighborSorted = new Set<number>();
if (Array.isArray(neighborIds)) {
for (const id of neighborIds) {
if (typeof id !== "number") continue;
const sorted = renderer.vertexIdToSortedIndexOrNull(id);
if (sorted === null) continue;
if (!selectedNodes.has(sorted)) neighborSorted.add(sorted);
}
}
renderer.updateSelection(selectedNodes, neighborSorted);
} catch (e) {
if (ctrl.signal.aborted) return;
console.warn(e);
// Keep the UI usable even if neighbors fail to load.
renderer.updateSelection(selectedNodes, new Set());
}
})();
return () => ctrl.abort();
}, [selectedNodes]);
return (
@@ -350,7 +439,12 @@ export default function App() {
boxShadow: "0 2px 8px rgba(0,0,0,0.5)",
}}
>
({hoveredNode.x.toFixed(2)}, {hoveredNode.y.toFixed(2)})
<div style={{ color: "#0ff" }}>
{hoveredNode.label || hoveredNode.iri || "(unknown)"}
</div>
<div style={{ color: "#688" }}>
({hoveredNode.x.toFixed(2)}, {hoveredNode.y.toFixed(2)})
</div>
</div>
)}
</>

View File

@@ -80,9 +80,11 @@ export class Renderer {
// Data
private leaves: Leaf[] = [];
private sorted: Float32Array = new Float32Array(0);
// order[sortedIdx] = originalIdx (original ordering matches input arrays)
private sortedToOriginal: Uint32Array = new Uint32Array(0);
private vertexIdToSortedIndex: Map<number, number> = new Map();
private nodeCount = 0;
private edgeCount = 0;
private neighborMap: Map<number, number[]> = new Map();
private leafEdgeStarts: Uint32Array = new Uint32Array(0);
private leafEdgeCounts: Uint32Array = new Uint32Array(0);
private maxPtSize = 256;
@@ -202,6 +204,7 @@ export class Renderer {
const { sorted, leaves, order } = buildSpatialIndex(xs, ys);
this.leaves = leaves;
this.sorted = sorted;
this.sortedToOriginal = order;
// Pre-allocate arrays for render loop (zero-allocation rendering)
this.visibleLeafIndices = new Uint32Array(leaves.length);
@@ -226,6 +229,13 @@ export class Renderer {
originalToSorted[order[i]] = i;
}
// Build vertex ID → sorted index mapping (used by backend-driven neighbor highlighting)
const vertexIdToSortedIndex = new Map<number, number>();
for (let i = 0; i < count; i++) {
vertexIdToSortedIndex.set(vertexIds[i], originalToSorted[i]);
}
this.vertexIdToSortedIndex = vertexIdToSortedIndex;
// Remap edges from vertex IDs to sorted indices
const lineIndices = new Uint32Array(edgeCount * 2);
let validEdges = 0;
@@ -241,18 +251,6 @@ export class Renderer {
}
this.edgeCount = validEdges;
// Build per-node neighbor list from edges for selection queries
const neighborMap = new Map<number, number[]>();
for (let i = 0; i < validEdges; i++) {
const src = lineIndices[i * 2];
const dst = lineIndices[i * 2 + 1];
if (!neighborMap.has(src)) neighborMap.set(src, []);
neighborMap.get(src)!.push(dst);
if (!neighborMap.has(dst)) neighborMap.set(dst, []);
neighborMap.get(dst)!.push(src);
}
this.neighborMap = neighborMap;
// Build per-leaf edge index for efficient visible-only edge drawing
// Find which leaf each sorted index belongs to
const nodeToLeaf = new Uint32Array(count);
@@ -331,6 +329,28 @@ export class Renderer {
return this.nodeCount;
}
/**
 * Map a sorted buffer index (what findNodeIndexAt returns) back to the original
 * index in the input arrays used to initialize the renderer.
 *
 * @param sortedIndex Index into the spatially sorted node buffer.
 * @returns The original input index, or `null` when `sortedIndex` is not an
 *          integer inside the range of the current mapping.
 */
sortedIndexToOriginalIndex(sortedIndex: number): number | null {
  // NaN and fractional values slip through plain bounds comparisons but index
  // a typed array to `undefined`, which would break the `number | null` contract.
  if (!Number.isInteger(sortedIndex)) {
    return null;
  }
  if (sortedIndex < 0 || sortedIndex >= this.sortedToOriginal.length) {
    return null;
  }
  return this.sortedToOriginal[sortedIndex] ?? null;
}
/**
 * Look up the renderer's sorted index for a backend node ID (node.id from /api/graph).
 *
 * @param vertexId Backend node ID to resolve.
 * @returns The sorted index, or `null` when the ID is not in the mapping.
 */
vertexIdToSortedIndexOrNull(vertexId: number): number | null {
  return this.vertexIdToSortedIndex.get(vertexId) ?? null;
}
/**
* Convert screen coordinates (CSS pixels) to world coordinates.
*/
@@ -412,10 +432,10 @@ export class Renderer {
/**
* Update the selection buffer with the given set of node indices.
* Also computes neighbors of selected nodes.
* Call this whenever React's selection state changes.
* Neighbor indices are provided by the backend (SPARQL query) and uploaded separately.
* Call this whenever selection or backend neighbor results change.
*/
updateSelection(selectedIndices: Set<number>): void {
updateSelection(selectedIndices: Set<number>, neighborIndices: Set<number> = new Set()): void {
const gl = this.gl;
// Upload selected indices
@@ -425,23 +445,11 @@ export class Renderer {
gl.bufferData(gl.ELEMENT_ARRAY_BUFFER, indices, gl.DYNAMIC_DRAW);
gl.bindBuffer(gl.ELEMENT_ARRAY_BUFFER, null);
// Compute neighbors of selected nodes (excluding already selected)
const neighborSet = new Set<number>();
for (const nodeIdx of selectedIndices) {
const nodeNeighbors = this.neighborMap.get(nodeIdx);
if (!nodeNeighbors) continue;
for (const n of nodeNeighbors) {
if (!selectedIndices.has(n)) {
neighborSet.add(n);
}
}
}
// Upload neighbor indices
const neighborIndices = new Uint32Array(neighborSet);
this.neighborCount = neighborIndices.length;
const neighborIndexArray = new Uint32Array(neighborIndices);
this.neighborCount = neighborIndexArray.length;
gl.bindBuffer(gl.ELEMENT_ARRAY_BUFFER, this.neighborIbo);
gl.bufferData(gl.ELEMENT_ARRAY_BUFFER, neighborIndices, gl.DYNAMIC_DRAW);
gl.bufferData(gl.ELEMENT_ARRAY_BUFFER, neighborIndexArray, gl.DYNAMIC_DRAW);
gl.bindBuffer(gl.ELEMENT_ARRAY_BUFFER, null);
}