Graph access via SPARQL
This commit is contained in:
94
backend/app/graph_export.py
Normal file
94
backend/app/graph_export.py
Normal file
@@ -0,0 +1,94 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
|
def edge_retrieval_query(*, edge_limit: int, include_bnodes: bool) -> str:
    """Build the SPARQL SELECT used to sample edges from the store.

    The query keeps only triples whose object is a resource (literals are
    filtered out), excludes pure labelling predicates (rdfs:label and the
    SKOS pref/alt labels), optionally drops blank nodes on either end, and
    caps the result at *edge_limit* rows.
    """
    if include_bnodes:
        bnode_filter = ""
    else:
        bnode_filter = "FILTER(!isBlank(?s) && !isBlank(?o))"
    return f"""
SELECT ?s ?p ?o
WHERE {{
    ?s ?p ?o .
    FILTER(!isLiteral(?o))
    FILTER(?p NOT IN (
        <http://www.w3.org/2000/01/rdf-schema#label>,
        <http://www.w3.org/2004/02/skos/core#prefLabel>,
        <http://www.w3.org/2004/02/skos/core#altLabel>
    ))
    {bnode_filter}
}}
LIMIT {edge_limit}
"""
|
||||||
|
|
||||||
|
|
||||||
|
def graph_from_sparql_bindings(
    bindings: list[dict[str, Any]],
    *,
    node_limit: int,
    include_bnodes: bool,
) -> tuple[list[dict[str, object]], list[dict[str, object]]]:
    """
    Convert SPARQL JSON results bindings into:
        nodes: [{id, termType, iri, label}]
        edges: [{source, target, predicate}]

    IDs are assigned densely (0..N-1) based on first occurrence in bindings.

    Bindings are skipped when the predicate is missing/empty, when either
    endpoint is a literal (or a blank node while include_bnodes is False),
    or when registering an endpoint would exceed node_limit.
    """

    node_id_by_key: dict[tuple[str, str], int] = {}
    node_meta: list[tuple[str, str]] = []  # (termType, iri)
    out_edges: list[dict[str, object]] = []

    def term_to_key_and_iri(term: dict[str, Any]) -> tuple[tuple[str, str], tuple[str, str]] | None:
        # Map a SPARQL JSON term to (dedup key, (termType, display iri)),
        # or None when the term cannot become a node.
        t = term.get("type")
        v = term.get("value")
        if not t or v is None:
            return None
        if t == "literal":
            return None
        if t == "bnode":
            if not include_bnodes:
                return None
            # SPARQL JSON uses bnode identifiers without the "_:" prefix; we normalize to "_:id".
            return (("bnode", str(v)), ("bnode", f"_:{v}"))
        # Default to "uri".
        return (("uri", str(v)), ("uri", str(v)))

    def get_or_add(term: dict[str, Any]) -> int | None:
        # Dense ID assignment, hard-capped at node_limit.
        out = term_to_key_and_iri(term)
        if out is None:
            return None
        key, meta = out
        existing = node_id_by_key.get(key)
        if existing is not None:
            return existing
        if len(node_meta) >= node_limit:
            return None
        nid = len(node_meta)
        node_id_by_key[key] = nid
        node_meta.append(meta)
        return nid

    for b in bindings:
        s_term = b.get("s") or {}
        o_term = b.get("o") or {}
        p_term = b.get("p") or {}

        # Validate the predicate BEFORE registering nodes: otherwise a binding
        # that is then discarded for lacking a predicate would still consume
        # node_limit budget and leave orphan nodes in the output.
        pred = p_term.get("value")
        if not pred:
            continue

        sid = get_or_add(s_term)
        oid = get_or_add(o_term)
        if sid is None or oid is None:
            continue

        out_edges.append({"source": sid, "target": oid, "predicate": str(pred)})

    out_nodes = [
        {"id": i, "termType": term_type, "iri": iri, "label": None}
        for i, (term_type, iri) in enumerate(node_meta)
    ]

    return out_nodes, out_edges
|
||||||
|
|
||||||
@@ -5,6 +5,7 @@ from contextlib import asynccontextmanager
|
|||||||
from fastapi import FastAPI, HTTPException, Query
|
from fastapi import FastAPI, HTTPException, Query
|
||||||
from fastapi.middleware.cors import CORSMiddleware
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
|
|
||||||
|
from .graph_export import edge_retrieval_query, graph_from_sparql_bindings
|
||||||
from .models import EdgesResponse, GraphResponse, NodesResponse, SparqlQueryRequest, StatsResponse
|
from .models import EdgesResponse, GraphResponse, NodesResponse, SparqlQueryRequest, StatsResponse
|
||||||
from .rdf_store import RDFStore
|
from .rdf_store import RDFStore
|
||||||
from .sparql_engine import AnzoGraphEngine, RdflibEngine, SparqlEngine, create_sparql_engine
|
from .sparql_engine import AnzoGraphEngine, RdflibEngine, SparqlEngine, create_sparql_engine
|
||||||
@@ -161,87 +162,13 @@ async def graph(
|
|||||||
) -> GraphResponse:
|
) -> GraphResponse:
|
||||||
sparql: SparqlEngine = app.state.sparql
|
sparql: SparqlEngine = app.state.sparql
|
||||||
|
|
||||||
if settings.graph_backend == "rdflib":
|
# Use SPARQL for graph export in BOTH modes so callers don't care which backend is in use.
|
||||||
store: RDFStore = app.state.store
|
edges_q = edge_retrieval_query(edge_limit=edge_limit, include_bnodes=settings.include_bnodes)
|
||||||
return GraphResponse(
|
|
||||||
nodes=store.node_slice(offset=0, limit=node_limit),
|
|
||||||
edges=store.edge_slice(offset=0, limit=edge_limit),
|
|
||||||
)
|
|
||||||
|
|
||||||
# AnzoGraph mode: return a simple subgraph by pulling the first N triples.
|
|
||||||
assert isinstance(sparql, AnzoGraphEngine)
|
|
||||||
|
|
||||||
edges_bnode_filter = "" if settings.include_bnodes else "FILTER(!isBlank(?s) && !isBlank(?o))"
|
|
||||||
edges_q = f"""
|
|
||||||
SELECT ?s ?p ?o
|
|
||||||
WHERE {{
|
|
||||||
?s ?p ?o .
|
|
||||||
FILTER(!isLiteral(?o))
|
|
||||||
FILTER(?p NOT IN (
|
|
||||||
<http://www.w3.org/2000/01/rdf-schema#label>,
|
|
||||||
<http://www.w3.org/2004/02/skos/core#prefLabel>,
|
|
||||||
<http://www.w3.org/2004/02/skos/core#altLabel>
|
|
||||||
))
|
|
||||||
{edges_bnode_filter}
|
|
||||||
}}
|
|
||||||
LIMIT {edge_limit}
|
|
||||||
"""
|
|
||||||
|
|
||||||
res = await sparql.query_json(edges_q)
|
res = await sparql.query_json(edges_q)
|
||||||
bindings = (((res.get("results") or {}).get("bindings")) or [])
|
bindings = (((res.get("results") or {}).get("bindings")) or [])
|
||||||
|
nodes, edges = graph_from_sparql_bindings(
|
||||||
node_id_by_key: dict[tuple[str, str], int] = {}
|
bindings,
|
||||||
node_meta: list[tuple[str, str]] = [] # (termType, iri)
|
node_limit=node_limit,
|
||||||
out_edges: list[dict[str, object]] = []
|
include_bnodes=settings.include_bnodes,
|
||||||
|
)
|
||||||
def _term_to_key_and_iri(term: dict[str, str]) -> tuple[tuple[str, str], tuple[str, str]] | None:
|
return GraphResponse(nodes=nodes, edges=edges)
|
||||||
t = term.get("type")
|
|
||||||
v = term.get("value")
|
|
||||||
if not t or v is None:
|
|
||||||
return None
|
|
||||||
if t == "literal":
|
|
||||||
return None
|
|
||||||
if t == "bnode" and not settings.include_bnodes:
|
|
||||||
return None
|
|
||||||
if t == "bnode":
|
|
||||||
return (("bnode", v), ("bnode", f"_:{v}"))
|
|
||||||
# Default to "uri".
|
|
||||||
return (("uri", v), ("uri", v))
|
|
||||||
|
|
||||||
def _get_or_add(term: dict[str, str]) -> int | None:
|
|
||||||
out = _term_to_key_and_iri(term)
|
|
||||||
if out is None:
|
|
||||||
return None
|
|
||||||
key, meta = out
|
|
||||||
existing = node_id_by_key.get(key)
|
|
||||||
if existing is not None:
|
|
||||||
return existing
|
|
||||||
if len(node_meta) >= node_limit:
|
|
||||||
return None
|
|
||||||
nid = len(node_meta)
|
|
||||||
node_id_by_key[key] = nid
|
|
||||||
node_meta.append(meta)
|
|
||||||
return nid
|
|
||||||
|
|
||||||
for b in bindings:
|
|
||||||
s_term = b.get("s") or {}
|
|
||||||
o_term = b.get("o") or {}
|
|
||||||
p_term = b.get("p") or {}
|
|
||||||
|
|
||||||
sid = _get_or_add(s_term)
|
|
||||||
oid = _get_or_add(o_term)
|
|
||||||
if sid is None or oid is None:
|
|
||||||
continue
|
|
||||||
|
|
||||||
pred = p_term.get("value")
|
|
||||||
if not pred:
|
|
||||||
continue
|
|
||||||
|
|
||||||
out_edges.append({"source": sid, "target": oid, "predicate": pred})
|
|
||||||
|
|
||||||
out_nodes = [
|
|
||||||
{"id": i, "termType": term_type, "iri": iri, "label": None}
|
|
||||||
for i, (term_type, iri) in enumerate(node_meta)
|
|
||||||
]
|
|
||||||
|
|
||||||
return GraphResponse(nodes=out_nodes, edges=out_edges)
|
|
||||||
|
|||||||
1
backend/app/pipelines/__init__.py
Normal file
1
backend/app/pipelines/__init__.py
Normal file
@@ -0,0 +1 @@
|
|||||||
|
|
||||||
153
backend/app/pipelines/subclass_labels.py
Normal file
153
backend/app/pipelines/subclass_labels.py
Normal file
@@ -0,0 +1,153 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from ..sparql_engine import SparqlEngine
|
||||||
|
|
||||||
|
RDFS_SUBCLASS_OF = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
|
||||||
|
RDFS_LABEL = "http://www.w3.org/2000/01/rdf-schema#label"
|
||||||
|
|
||||||
|
|
||||||
|
def _bindings(res: dict[str, Any]) -> list[dict[str, Any]]:
|
||||||
|
return (((res.get("results") or {}).get("bindings")) or [])
|
||||||
|
|
||||||
|
|
||||||
|
def _term_key(term: dict[str, Any]) -> tuple[str, str] | None:
|
||||||
|
t = term.get("type")
|
||||||
|
v = term.get("value")
|
||||||
|
if not t or v is None:
|
||||||
|
return None
|
||||||
|
if t == "literal":
|
||||||
|
return None
|
||||||
|
if t == "bnode":
|
||||||
|
return ("bnode", str(v))
|
||||||
|
return ("uri", str(v))
|
||||||
|
|
||||||
|
|
||||||
|
def _key_to_entity_string(key: tuple[str, str]) -> str:
|
||||||
|
t, v = key
|
||||||
|
if t == "bnode":
|
||||||
|
return f"_:{v}"
|
||||||
|
return v
|
||||||
|
|
||||||
|
|
||||||
|
def _label_score(binding: dict[str, Any]) -> int:
|
||||||
|
"""
|
||||||
|
Higher is better.
|
||||||
|
Prefer English, then no-language, then anything else.
|
||||||
|
"""
|
||||||
|
lang = (binding.get("xml:lang") or "").lower()
|
||||||
|
if lang == "en":
|
||||||
|
return 3
|
||||||
|
if lang == "":
|
||||||
|
return 2
|
||||||
|
return 1
|
||||||
|
|
||||||
|
|
||||||
|
async def extract_subclass_entities_and_labels(
    sparql: SparqlEngine,
    *,
    include_bnodes: bool,
    label_batch_size: int = 500,
) -> tuple[list[str], list[str | None]]:
    """
    Pipeline:
        1) Query all rdfs:subClassOf triples.
        2) Build a unique set of entity terms from subjects+objects, convert to list.
        3) Fetch rdfs:label for those entities and return an aligned labels list.

    Args:
        sparql: engine exposing an async ``query_json(query) -> dict`` in
            SPARQL JSON results format.
        include_bnodes: when False, blank nodes are filtered out both in the
            query and again locally when building the entity set.
        label_batch_size: number of URIs packed into each VALUES clause when
            fetching labels.

    Returns:
        entities: list[str] (IRI or "_:bnodeId")
        labels: list[str|None], aligned with entities
    """

    # Step 1: pull every subclass edge; literal objects are excluded, and
    # blank nodes are optionally excluded via an extra FILTER.
    subclass_q = f"""
SELECT ?s ?o
WHERE {{
    ?s <{RDFS_SUBCLASS_OF}> ?o .
    FILTER(!isLiteral(?o))
    {"FILTER(!isBlank(?s) && !isBlank(?o))" if not include_bnodes else ""}
}}
"""
    res = await sparql.query_json(subclass_q)

    # Step 2: dedupe subjects and objects into a set of (termType, value) keys.
    # The bnode check here is belt-and-braces on top of the query-level FILTER.
    entity_keys: set[tuple[str, str]] = set()
    for b in _bindings(res):
        sk = _term_key(b.get("s") or {})
        ok = _term_key(b.get("o") or {})
        if sk is not None and (include_bnodes or sk[0] != "bnode"):
            entity_keys.add(sk)
        if ok is not None and (include_bnodes or ok[0] != "bnode"):
            entity_keys.add(ok)

    # Deterministic ordering.
    entity_key_list = sorted(entity_keys, key=lambda k: (k[0], k[1]))
    entities = [_key_to_entity_string(k) for k in entity_key_list]

    # Build label map keyed by term key.
    # Value is (score, label); a higher-scoring label replaces a lower one.
    best_label_by_key: dict[tuple[str, str], tuple[int, str]] = {}

    # URIs can be batch-queried via VALUES.
    uri_values = [v for (t, v) in entity_key_list if t == "uri"]
    for i in range(0, len(uri_values), label_batch_size):
        batch = uri_values[i : i + label_batch_size]
        values = " ".join(f"<{u}>" for u in batch)
        labels_q = f"""
SELECT ?s ?label
WHERE {{
    VALUES ?s {{ {values} }}
    ?s <{RDFS_LABEL}> ?label .
}}
"""
        lres = await sparql.query_json(labels_q)
        for b in _bindings(lres):
            sk = _term_key(b.get("s") or {})
            if sk is None or sk[0] != "uri":
                continue
            label_term = b.get("label") or {}
            # Only literal labels count; anything else is ignored.
            if label_term.get("type") != "literal":
                continue
            label_value = label_term.get("value")
            if label_value is None:
                continue

            # Keep the best-scoring label (English > untagged > other).
            score = _label_score(label_term)
            prev = best_label_by_key.get(sk)
            if prev is None or score > prev[0]:
                best_label_by_key[sk] = (score, str(label_value))

    # Blank nodes can't reliably be addressed by ID across queries, but if enabled we can still
    # fetch all bnode labels and filter locally.
    if include_bnodes:
        bnode_keys = {k for k in entity_key_list if k[0] == "bnode"}
        if bnode_keys:
            bnode_labels_q = f"""
SELECT ?s ?label
WHERE {{
    ?s <{RDFS_LABEL}> ?label .
    FILTER(isBlank(?s))
}}
"""
            blres = await sparql.query_json(bnode_labels_q)
            for b in _bindings(blres):
                sk = _term_key(b.get("s") or {})
                # NOTE(review): this assumes the endpoint reuses the same
                # bnode identifiers across queries — confirm for the target
                # store; otherwise labels for bnodes will simply not match.
                if sk is None or sk not in bnode_keys:
                    continue
                label_term = b.get("label") or {}
                if label_term.get("type") != "literal":
                    continue
                label_value = label_term.get("value")
                if label_value is None:
                    continue
                score = _label_score(label_term)
                prev = best_label_by_key.get(sk)
                if prev is None or score > prev[0]:
                    best_label_by_key[sk] = (score, str(label_value))

    # Step 3: emit labels aligned index-for-index with `entities`.
    labels: list[str | None] = []
    for k in entity_key_list:
        item = best_label_by_key.get(k)
        labels.append(item[1] if item else None)

    return entities, labels
|
||||||
|
|
||||||
@@ -132,3 +132,19 @@ class RDFStore:
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
def edges_within_nodes(self, *, max_node_id_exclusive: int, limit: int) -> list[dict[str, Any]]:
    """Return up to *limit* edges whose endpoints both have ids below the bound.

    Edges touching any node id >= max_node_id_exclusive are skipped; order
    follows the internal edge list.

    Args:
        max_node_id_exclusive: exclusive upper bound on both endpoint ids.
        limit: maximum number of edges to return.
    """
    # Guard: the append-then-check loop below would otherwise return one
    # edge even when limit <= 0.
    if limit <= 0:
        return []
    out: list[dict[str, Any]] = []
    for row in self._edges:
        if row.source >= max_node_id_exclusive or row.target >= max_node_id_exclusive:
            continue
        out.append(
            {
                "source": row.source,
                "target": row.target,
                "predicate": row.predicate,
            }
        )
        if len(out) >= limit:
            break
    return out
|
||||||
|
|||||||
@@ -33,6 +33,7 @@ class Settings(BaseSettings):
|
|||||||
sparql_timeout_s: float = Field(default=300.0, alias="SPARQL_TIMEOUT_S")
|
sparql_timeout_s: float = Field(default=300.0, alias="SPARQL_TIMEOUT_S")
|
||||||
sparql_ready_retries: int = Field(default=30, alias="SPARQL_READY_RETRIES")
|
sparql_ready_retries: int = Field(default=30, alias="SPARQL_READY_RETRIES")
|
||||||
sparql_ready_delay_s: float = Field(default=4.0, alias="SPARQL_READY_DELAY_S")
|
sparql_ready_delay_s: float = Field(default=4.0, alias="SPARQL_READY_DELAY_S")
|
||||||
|
sparql_ready_timeout_s: float = Field(default=10.0, alias="SPARQL_READY_TIMEOUT_S")
|
||||||
|
|
||||||
# Comma-separated, or "*" (default).
|
# Comma-separated, or "*" (default).
|
||||||
cors_origins: str = Field(default="*", alias="CORS_ORIGINS")
|
cors_origins: str = Field(default="*", alias="CORS_ORIGINS")
|
||||||
|
|||||||
@@ -56,6 +56,7 @@ class AnzoGraphEngine:
|
|||||||
self.timeout_s = settings.sparql_timeout_s
|
self.timeout_s = settings.sparql_timeout_s
|
||||||
self.ready_retries = settings.sparql_ready_retries
|
self.ready_retries = settings.sparql_ready_retries
|
||||||
self.ready_delay_s = settings.sparql_ready_delay_s
|
self.ready_delay_s = settings.sparql_ready_delay_s
|
||||||
|
self.ready_timeout_s = settings.sparql_ready_timeout_s
|
||||||
|
|
||||||
self.user = settings.sparql_user
|
self.user = settings.sparql_user
|
||||||
self.password = settings.sparql_pass
|
self.password = settings.sparql_pass
|
||||||
@@ -135,15 +136,34 @@ class AnzoGraphEngine:
|
|||||||
resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
|
|
||||||
async def _wait_ready(self) -> None:
    """Block until the AnzoGraph SPARQL endpoint answers a real query with JSON.

    Retries up to ``self.ready_retries`` times, sleeping ``self.ready_delay_s``
    between attempts. Raises RuntimeError if ``start`` was never called, or
    (chained to the last underlying error) if the endpoint never becomes ready.
    """
    if self._client is None:
        raise RuntimeError("AnzoGraphEngine not started")

    # Match the repo's Julia readiness gate: real SPARQL POST + valid JSON parse.
    headers = {
        "Content-Type": "application/x-www-form-urlencoded",
        "Accept": "application/sparql-results+json",
    }
    if self._auth_header:
        headers["Authorization"] = self._auth_header

    last_err: Exception | None = None
    for _ in range(self.ready_retries):
        try:
            # Keep it cheap and JSON-parseable.
            resp = await self._client.post(
                self.endpoint,
                headers=headers,
                data={"query": "ASK WHERE { ?s ?p ?o }"},
                timeout=self.ready_timeout_s,
            )
            resp.raise_for_status()
            # Ensure it's JSON, not HTML/text during boot.
            resp.json()
            return
        except Exception as e:
            last_err = e
            await asyncio.sleep(self.ready_delay_s)

    raise RuntimeError(f"AnzoGraph not ready at {self.endpoint}") from last_err
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
371
docs/anzograph-readiness-julia.md
Normal file
371
docs/anzograph-readiness-julia.md
Normal file
@@ -0,0 +1,371 @@
|
|||||||
|
# Waiting for AnzoGraph readiness from Julia (how this repo does it)
|
||||||
|
|
||||||
|
This repo runs a Julia pipeline (`julia/main.jl`) against an AnzoGraph SPARQL endpoint. The key problem is that **“container started” ≠ “SPARQL endpoint is ready to accept queries”**.
|
||||||
|
|
||||||
|
So, before the Julia code does anything that depends on SPARQL (like `LOAD <...>` or large `SELECT`s), it explicitly **waits until AnzoGraph is actually responding to a real SPARQL POST request with valid JSON results**.
|
||||||
|
|
||||||
|
This document explains the exact mechanism used here, why it works, and gives copy/paste-ready patterns you can transfer to another project.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1) Where the waiting happens (pipeline control flow)
|
||||||
|
|
||||||
|
In `julia/main.jl`, the entrypoint calls:
|
||||||
|
|
||||||
|
```julia
|
||||||
|
# Step 1: Wait for AnzoGraph
|
||||||
|
wait_for_anzograph()
|
||||||
|
|
||||||
|
# Step 2: Load TTL file
|
||||||
|
result = sparql_update("LOAD <$SPARQL_DATA_FILE>")
|
||||||
|
```
|
||||||
|
|
||||||
|
So the “await” is not a Julia `Task`/`async` wait; it is a **blocking retry loop** that only returns when it can successfully execute a small SPARQL query.
|
||||||
|
|
||||||
|
Reference: `julia/main.jl` defines `wait_for_anzograph()` and calls it from `main()`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2) Why this is needed even with Docker Compose `depends_on`
|
||||||
|
|
||||||
|
This repo’s `docker-compose.yml` includes an AnzoGraph `healthcheck`:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
anzograph:
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD-SHELL", "curl -f http://localhost:8080/sparql || exit 1"]
|
||||||
|
interval: 10s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 30
|
||||||
|
start_period: 60s
|
||||||
|
```
|
||||||
|
|
||||||
|
However, `julia-layout` currently depends on `anzograph` with:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
depends_on:
|
||||||
|
anzograph:
|
||||||
|
condition: service_started
|
||||||
|
```
|
||||||
|
|
||||||
|
Meaning:
|
||||||
|
- Compose will ensure the **container process has started**.
|
||||||
|
- Compose does **not** guarantee the AnzoGraph HTTP/SPARQL endpoint is ready (unless you use `service_healthy`, and even then a “healthy GET” is not always equivalent to “SPARQL POST works with auth + JSON”).
|
||||||
|
|
||||||
|
So the Julia code includes its own readiness gate to prevent failures like:
|
||||||
|
- TCP connection refused (port not open yet)
|
||||||
|
- HTTP endpoint reachable but not fully initialized
|
||||||
|
- Non-JSON/HTML error responses while the service is still booting
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3) What “ready” means in this repo
|
||||||
|
|
||||||
|
In this repo, “AnzoGraph is ready” means:
|
||||||
|
|
||||||
|
1. An HTTP `POST` to `${SPARQL_HOST}/sparql` succeeds, with headers:
|
||||||
|
- `Content-Type: application/x-www-form-urlencoded`
|
||||||
|
- `Accept: application/sparql-results+json`
|
||||||
|
- `Authorization: Basic ...`
|
||||||
|
2. The body parses as SPARQL JSON results (`application/sparql-results+json`)
|
||||||
|
|
||||||
|
It does **not** strictly mean:
|
||||||
|
- Your dataset is already loaded
|
||||||
|
- The loaded data is fully indexed (that can matter in some systems after `LOAD`)
|
||||||
|
|
||||||
|
This repo uses readiness as a **“SPARQL endpoint is alive and speaking the protocol”** check.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4) The actual Julia implementation (as in `julia/main.jl`)
|
||||||
|
|
||||||
|
### 4.1 Configuration (endpoint + auth)
|
||||||
|
|
||||||
|
The Julia script builds endpoint and auth from environment variables:
|
||||||
|
|
||||||
|
```julia
|
||||||
|
const SPARQL_HOST = get(ENV, "SPARQL_HOST", "http://localhost:8080")
|
||||||
|
const SPARQL_ENDPOINT = "$SPARQL_HOST/sparql"
|
||||||
|
const SPARQL_USER = get(ENV, "SPARQL_USER", "admin")
|
||||||
|
const SPARQL_PASS = get(ENV, "SPARQL_PASS", "Passw0rd1")
|
||||||
|
const AUTH_HEADER = "Basic " * base64encode("$SPARQL_USER:$SPARQL_PASS")
|
||||||
|
```
|
||||||
|
|
||||||
|
In Docker Compose for this repo, the Julia container overrides `SPARQL_HOST` to use the service DNS name:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
environment:
|
||||||
|
- SPARQL_HOST=http://anzograph:8080
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4.2 The smoke query used for readiness
|
||||||
|
|
||||||
|
This is the query used in the wait loop:
|
||||||
|
|
||||||
|
```julia
|
||||||
|
const SMOKE_TEST_QUERY = "SELECT ?s ?p ?o WHERE { ?s ?p ?o } LIMIT 3"
|
||||||
|
```
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
- It’s intentionally small (`LIMIT 3`) to keep the readiness check cheap.
|
||||||
|
- It returns *some* bindings when data exists, but **even an empty dataset can still return a valid empty result set**. The code treats “valid response” as ready.
|
||||||
|
|
||||||
|
If you want a readiness check that does not depend on any data being present, an `ASK` query is also common:
|
||||||
|
|
||||||
|
```sparql
|
||||||
|
ASK WHERE { ?s ?p ?o }
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4.3 SPARQL query function (request + minimal retry)
|
||||||
|
|
||||||
|
`sparql_query(query; retries=...)` is a generic helper that makes SPARQL POST requests:
|
||||||
|
|
||||||
|
```julia
|
||||||
|
function sparql_query(query::String; retries::Int=5)::SparqlResult
|
||||||
|
for attempt in 1:retries
|
||||||
|
try
|
||||||
|
response = HTTP.post(
|
||||||
|
SPARQL_ENDPOINT,
|
||||||
|
[
|
||||||
|
"Content-Type" => "application/x-www-form-urlencoded",
|
||||||
|
"Accept" => "application/sparql-results+json",
|
||||||
|
"Authorization" => AUTH_HEADER
|
||||||
|
];
|
||||||
|
body = "query=" * HTTP.URIs.escapeuri(query)
|
||||||
|
)
|
||||||
|
|
||||||
|
if response.status == 200
|
||||||
|
json = JSON.parse(String(response.body))
|
||||||
|
return SparqlResult(json["results"]["bindings"])
|
||||||
|
elseif response.status >= 500 && attempt < retries
|
||||||
|
sleep(10)
|
||||||
|
continue
|
||||||
|
else
|
||||||
|
error("SPARQL query failed with status $(response.status)")
|
||||||
|
end
|
||||||
|
catch e
|
||||||
|
if attempt < retries
|
||||||
|
sleep(10)
|
||||||
|
continue
|
||||||
|
end
|
||||||
|
rethrow(e)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
error("SPARQL query failed after $retries attempts")
|
||||||
|
end
|
||||||
|
```
|
||||||
|
|
||||||
|
Important behaviors to preserve when transferring:
|
||||||
|
- It uses **POST** (not GET) to the SPARQL endpoint.
|
||||||
|
- It requires a **200** response and successfully parses SPARQL JSON results.
|
||||||
|
- It retries on:
|
||||||
|
- `>= 500` server errors
|
||||||
|
- network / protocol / parsing errors (caught exceptions)
|
||||||
|
|
||||||
|
### 4.4 The readiness gate: `wait_for_anzograph`
|
||||||
|
|
||||||
|
This is the “await until ready” logic:
|
||||||
|
|
||||||
|
```julia
|
||||||
|
function wait_for_anzograph(max_retries::Int=30)::Bool
|
||||||
|
println("Waiting for AnzoGraph at $SPARQL_ENDPOINT...")
|
||||||
|
|
||||||
|
for attempt in 1:max_retries
|
||||||
|
try
|
||||||
|
smoke_result = sparql_query(SMOKE_TEST_QUERY; retries=1)
|
||||||
|
println(" AnzoGraph is ready (attempt $attempt, smoke rows=$(length(smoke_result.bindings)))")
|
||||||
|
return true
|
||||||
|
catch e
|
||||||
|
println(" Attempt $attempt/$max_retries: $(typeof(e))")
|
||||||
|
sleep(4)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
error("AnzoGraph not available after $max_retries attempts")
|
||||||
|
end
|
||||||
|
```
|
||||||
|
|
||||||
|
Why it calls `sparql_query(...; retries=1)`:
|
||||||
|
- It makes each outer “readiness attempt” a **single** request.
|
||||||
|
- The outer loop controls cadence (`sleep(4)`) and total wait time.
|
||||||
|
- This avoids “nested retry loops” (inner sleeps + outer sleeps) that can make waits much longer than intended.
|
||||||
|
|
||||||
|
Time bound in the current implementation:
|
||||||
|
- `max_retries = 30`
|
||||||
|
- `sleep(4)` between attempts
|
||||||
|
- Roughly ~120 seconds of waiting (plus request time).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5) What failures cause it to keep waiting
|
||||||
|
|
||||||
|
`wait_for_anzograph()` catches any exception thrown by `sparql_query()` and retries. In practice, that includes:
|
||||||
|
|
||||||
|
- **Connection errors** (DNS not ready, connection refused, etc.)
|
||||||
|
- **Timeouts** (if HTTP request takes too long and the library throws)
|
||||||
|
- **Non-200 HTTP statuses** that cause `error(...)`
|
||||||
|
- **Non-JSON / unexpected JSON** responses causing `JSON.parse(...)` to throw
|
||||||
|
|
||||||
|
That last point is a big reason a “real SPARQL request + parse” is stronger than just “ping the port”.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6) Transferable, self-contained version (recommended pattern)
|
||||||
|
|
||||||
|
If you want to reuse this in another project, it’s usually easier to:
|
||||||
|
- avoid globals,
|
||||||
|
- make endpoint/auth explicit,
|
||||||
|
- use a **time-based timeout** instead of `max_retries` (more robust),
|
||||||
|
- add request timeouts so the wait loop can’t hang forever on a single request.
|
||||||
|
|
||||||
|
Below is a drop-in module you can copy into your project.
|
||||||
|
|
||||||
|
```julia
|
||||||
|
module AnzoGraphReady
|
||||||
|
|
||||||
|
using HTTP
|
||||||
|
using JSON
|
||||||
|
using Base64
|
||||||
|
using Dates
|
||||||
|
|
||||||
|
struct SparqlResult
|
||||||
|
bindings::Vector{Dict{String, Any}}
|
||||||
|
end
|
||||||
|
|
||||||
|
function basic_auth_header(user::AbstractString, pass::AbstractString)::String
|
||||||
|
return "Basic " * base64encode("$(user):$(pass)")
|
||||||
|
end
|
||||||
|
|
||||||
|
function sparql_query(
|
||||||
|
endpoint::AbstractString,
|
||||||
|
auth_header::AbstractString,
|
||||||
|
query::AbstractString;
|
||||||
|
retries::Int = 1,
|
||||||
|
retry_sleep_s::Real = 2,
|
||||||
|
request_timeout_s::Real = 15,
|
||||||
|
)::SparqlResult
|
||||||
|
for attempt in 1:retries
|
||||||
|
try
|
||||||
|
response = HTTP.post(
|
||||||
|
String(endpoint),
|
||||||
|
[
|
||||||
|
"Content-Type" => "application/x-www-form-urlencoded",
|
||||||
|
"Accept" => "application/sparql-results+json",
|
||||||
|
"Authorization" => auth_header,
|
||||||
|
];
|
||||||
|
body = "query=" * HTTP.URIs.escapeuri(String(query)),
|
||||||
|
readtimeout = request_timeout_s,
|
||||||
|
)
|
||||||
|
|
||||||
|
if response.status != 200
|
||||||
|
error("SPARQL query failed with status $(response.status)")
|
||||||
|
end
|
||||||
|
|
||||||
|
parsed = JSON.parse(String(response.body))
|
||||||
|
bindings = get(get(parsed, "results", Dict()), "bindings", Any[])
|
||||||
|
return SparqlResult(Vector{Dict{String, Any}}(bindings))
|
||||||
|
catch e
|
||||||
|
if attempt < retries
|
||||||
|
sleep(retry_sleep_s)
|
||||||
|
continue
|
||||||
|
end
|
||||||
|
rethrow(e)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
error("sparql_query: unreachable")
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
Wait until AnzoGraph responds to a real SPARQL POST with parseable JSON.
|
||||||
|
|
||||||
|
This is the direct analog of this repo's `wait_for_anzograph()`, but with:
|
||||||
|
- a time-based timeout (`timeout`)
|
||||||
|
- a request timeout per attempt (`request_timeout_s`)
|
||||||
|
- simple exponential backoff
|
||||||
|
"""
|
||||||
|
function wait_for_anzograph(
|
||||||
|
endpoint::AbstractString,
|
||||||
|
auth_header::AbstractString;
|
||||||
|
timeout::Period = Minute(3),
|
||||||
|
initial_delay_s::Real = 0.5,
|
||||||
|
max_delay_s::Real = 5.0,
|
||||||
|
request_timeout_s::Real = 10.0,
|
||||||
|
query::AbstractString = "ASK WHERE { ?s ?p ?o }",
|
||||||
|
)::Nothing
|
||||||
|
deadline = now() + timeout
|
||||||
|
delay_s = initial_delay_s
|
||||||
|
|
||||||
|
while now() < deadline
|
||||||
|
try
|
||||||
|
# A single attempt: if it succeeds, we declare "ready".
|
||||||
|
sparql_query(
|
||||||
|
endpoint,
|
||||||
|
auth_header,
|
||||||
|
query;
|
||||||
|
retries = 1,
|
||||||
|
request_timeout_s = request_timeout_s,
|
||||||
|
)
|
||||||
|
return
|
||||||
|
catch
|
||||||
|
sleep(delay_s)
|
||||||
|
delay_s = min(max_delay_s, delay_s * 1.5)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
error("AnzoGraph not available before timeout=$(timeout)")
|
||||||
|
end
|
||||||
|
|
||||||
|
end # module
|
||||||
|
```
|
||||||
|
|
||||||
|
Typical usage (matching this repo’s environment variables):
|
||||||
|
|
||||||
|
```julia
|
||||||
|
using .AnzoGraphReady
|
||||||
|
|
||||||
|
sparql_host = get(ENV, "SPARQL_HOST", "http://localhost:8080")
|
||||||
|
endpoint = "$(sparql_host)/sparql"
|
||||||
|
user = get(ENV, "SPARQL_USER", "admin")
|
||||||
|
pass = get(ENV, "SPARQL_PASS", "Passw0rd1")
|
||||||
|
|
||||||
|
auth = AnzoGraphReady.basic_auth_header(user, pass)
|
||||||
|
AnzoGraphReady.wait_for_anzograph(endpoint, auth; timeout=Minute(5))
|
||||||
|
|
||||||
|
# Now it is safe to LOAD / query.
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7) Optional: waiting for “data is ready” after `LOAD`
|
||||||
|
|
||||||
|
Some systems accept `LOAD` but need time before results show up reliably (indexing / transaction visibility).
|
||||||
|
If you run into that in your other project, add a second gate after `LOAD`, for example:
|
||||||
|
|
||||||
|
1) load, then
|
||||||
|
2) poll a query that must be true after load (e.g., “triple count > 0”, or a known IRI exists).
|
||||||
|
|
||||||
|
Example “post-load gate”:
|
||||||
|
|
||||||
|
```julia
|
||||||
|
post_load_query = """
|
||||||
|
SELECT (COUNT(*) AS ?n)
|
||||||
|
WHERE { ?s ?p ?o }
|
||||||
|
"""
|
||||||
|
|
||||||
|
res = AnzoGraphReady.sparql_query(endpoint, auth, post_load_query; retries=1)
|
||||||
|
# Parse `?n` out of bindings and require it to be > 0; retry until it is.
|
||||||
|
```
|
||||||
|
|
||||||
|
(This repo does not currently enforce “non-empty”; it only enforces “SPARQL is working”.)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 8) Practical checklist when transferring to another project
|
||||||
|
|
||||||
|
- Make readiness checks hit the **real SPARQL POST** path you will use in production.
|
||||||
|
- Require a **valid JSON parse**, not just “port open”.
|
||||||
|
- Add **per-request timeouts**, so a single hung request cannot hang the whole pipeline.
|
||||||
|
- Prefer **time-based overall timeout** for predictable behavior in CI.
|
||||||
|
- Keep the query **cheap** (`ASK` or `LIMIT 1/3`).
|
||||||
|
- If you use Docker Compose healthchecks, consider also using `depends_on: condition: service_healthy`, but still keep the in-app wait as a safety net (it’s closer to the real contract your code needs).
|
||||||
|
|
||||||
Reference in New Issue
Block a user