diff --git a/backend/app/graph_export.py b/backend/app/graph_export.py new file mode 100644 index 0000000..b937c9f --- /dev/null +++ b/backend/app/graph_export.py @@ -0,0 +1,94 @@ +from __future__ import annotations + +from typing import Any + + +def edge_retrieval_query(*, edge_limit: int, include_bnodes: bool) -> str: + bnode_filter = "" if include_bnodes else "FILTER(!isBlank(?s) && !isBlank(?o))" + return f""" +SELECT ?s ?p ?o +WHERE {{ + ?s ?p ?o . + FILTER(!isLiteral(?o)) + FILTER(?p NOT IN ( + <http://www.w3.org/1999/02/22-rdf-syntax-ns#type>, + <http://www.w3.org/2002/07/owl#sameAs>, + <http://www.w3.org/2000/01/rdf-schema#seeAlso> + )) + {bnode_filter} +}} +LIMIT {edge_limit} +""" + + +def graph_from_sparql_bindings( + bindings: list[dict[str, Any]], + *, + node_limit: int, + include_bnodes: bool, +) -> tuple[list[dict[str, object]], list[dict[str, object]]]: + """ + Convert SPARQL JSON results bindings into: + nodes: [{id, termType, iri, label}] + edges: [{source, target, predicate}] + + IDs are assigned densely (0..N-1) based on first occurrence in bindings. + """ + + node_id_by_key: dict[tuple[str, str], int] = {} + node_meta: list[tuple[str, str]] = []  # (termType, iri) + out_edges: list[dict[str, object]] = [] + + def term_to_key_and_iri(term: dict[str, Any]) -> tuple[tuple[str, str], tuple[str, str]] | None: + t = term.get("type") + v = term.get("value") + if not t or v is None: + return None + if t == "literal": + return None + if t == "bnode": + if not include_bnodes: + return None + # SPARQL JSON uses bnode identifiers without the "_:" prefix; we normalize to "_:id". + return (("bnode", str(v)), ("bnode", f"_:{v}")) + # Default to "uri". 
+ return (("uri", str(v)), ("uri", str(v))) + + def get_or_add(term: dict[str, Any]) -> int | None: + out = term_to_key_and_iri(term) + if out is None: + return None + key, meta = out + existing = node_id_by_key.get(key) + if existing is not None: + return existing + if len(node_meta) >= node_limit: + return None + nid = len(node_meta) + node_id_by_key[key] = nid + node_meta.append(meta) + return nid + + for b in bindings: + s_term = b.get("s") or {} + o_term = b.get("o") or {} + p_term = b.get("p") or {} + + sid = get_or_add(s_term) + oid = get_or_add(o_term) + if sid is None or oid is None: + continue + + pred = p_term.get("value") + if not pred: + continue + + out_edges.append({"source": sid, "target": oid, "predicate": str(pred)}) + + out_nodes = [ + {"id": i, "termType": term_type, "iri": iri, "label": None} + for i, (term_type, iri) in enumerate(node_meta) + ] + + return out_nodes, out_edges + diff --git a/backend/app/main.py b/backend/app/main.py index c5b3e81..0b6cb68 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -5,6 +5,7 @@ from contextlib import asynccontextmanager from fastapi import FastAPI, HTTPException, Query from fastapi.middleware.cors import CORSMiddleware +from .graph_export import edge_retrieval_query, graph_from_sparql_bindings from .models import EdgesResponse, GraphResponse, NodesResponse, SparqlQueryRequest, StatsResponse from .rdf_store import RDFStore from .sparql_engine import AnzoGraphEngine, RdflibEngine, SparqlEngine, create_sparql_engine @@ -161,87 +162,13 @@ async def graph( ) -> GraphResponse: sparql: SparqlEngine = app.state.sparql - if settings.graph_backend == "rdflib": - store: RDFStore = app.state.store - return GraphResponse( - nodes=store.node_slice(offset=0, limit=node_limit), - edges=store.edge_slice(offset=0, limit=edge_limit), - ) - - # AnzoGraph mode: return a simple subgraph by pulling the first N triples. 
- assert isinstance(sparql, AnzoGraphEngine) - - edges_bnode_filter = "" if settings.include_bnodes else "FILTER(!isBlank(?s) && !isBlank(?o))" - edges_q = f""" -SELECT ?s ?p ?o -WHERE {{ - ?s ?p ?o . - FILTER(!isLiteral(?o)) - FILTER(?p NOT IN ( - <http://www.w3.org/1999/02/22-rdf-syntax-ns#type>, - <http://www.w3.org/2002/07/owl#sameAs>, - <http://www.w3.org/2000/01/rdf-schema#seeAlso> - )) - {edges_bnode_filter} -}} -LIMIT {edge_limit} -""" - + # Use SPARQL for graph export in BOTH modes so callers don't care which backend is in use. + edges_q = edge_retrieval_query(edge_limit=edge_limit, include_bnodes=settings.include_bnodes) res = await sparql.query_json(edges_q) bindings = (((res.get("results") or {}).get("bindings")) or []) - - node_id_by_key: dict[tuple[str, str], int] = {} - node_meta: list[tuple[str, str]] = []  # (termType, iri) - out_edges: list[dict[str, object]] = [] - - def _term_to_key_and_iri(term: dict[str, str]) -> tuple[tuple[str, str], tuple[str, str]] | None: - t = term.get("type") - v = term.get("value") - if not t or v is None: - return None - if t == "literal": - return None - if t == "bnode" and not settings.include_bnodes: - return None - if t == "bnode": - return (("bnode", v), ("bnode", f"_:{v}")) - # Default to "uri". 
- return (("uri", v), ("uri", v)) - - def _get_or_add(term: dict[str, str]) -> int | None: - out = _term_to_key_and_iri(term) - if out is None: - return None - key, meta = out - existing = node_id_by_key.get(key) - if existing is not None: - return existing - if len(node_meta) >= node_limit: - return None - nid = len(node_meta) - node_id_by_key[key] = nid - node_meta.append(meta) - return nid - - for b in bindings: - s_term = b.get("s") or {} - o_term = b.get("o") or {} - p_term = b.get("p") or {} - - sid = _get_or_add(s_term) - oid = _get_or_add(o_term) - if sid is None or oid is None: - continue - - pred = p_term.get("value") - if not pred: - continue - - out_edges.append({"source": sid, "target": oid, "predicate": pred}) - - out_nodes = [ - {"id": i, "termType": term_type, "iri": iri, "label": None} - for i, (term_type, iri) in enumerate(node_meta) - ] - - return GraphResponse(nodes=out_nodes, edges=out_edges) + nodes, edges = graph_from_sparql_bindings( + bindings, + node_limit=node_limit, + include_bnodes=settings.include_bnodes, + ) + return GraphResponse(nodes=nodes, edges=edges) diff --git a/backend/app/pipelines/__init__.py b/backend/app/pipelines/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/backend/app/pipelines/__init__.py @@ -0,0 +1 @@ + diff --git a/backend/app/pipelines/subclass_labels.py b/backend/app/pipelines/subclass_labels.py new file mode 100644 index 0000000..5d6b6b9 --- /dev/null +++ b/backend/app/pipelines/subclass_labels.py @@ -0,0 +1,153 @@ +from __future__ import annotations + +from typing import Any + +from ..sparql_engine import SparqlEngine + +RDFS_SUBCLASS_OF = "http://www.w3.org/2000/01/rdf-schema#subClassOf" +RDFS_LABEL = "http://www.w3.org/2000/01/rdf-schema#label" + + +def _bindings(res: dict[str, Any]) -> list[dict[str, Any]]: + return (((res.get("results") or {}).get("bindings")) or []) + + +def _term_key(term: dict[str, Any]) -> tuple[str, str] | None: + t = term.get("type") + v = term.get("value") 
+ if not t or v is None: + return None + if t == "literal": + return None + if t == "bnode": + return ("bnode", str(v)) + return ("uri", str(v)) + + +def _key_to_entity_string(key: tuple[str, str]) -> str: + t, v = key + if t == "bnode": + return f"_:{v}" + return v + + +def _label_score(binding: dict[str, Any]) -> int: + """ + Higher is better. + Prefer English, then no-language, then anything else. + """ + lang = (binding.get("xml:lang") or "").lower() + if lang == "en": + return 3 + if lang == "": + return 2 + return 1 + + +async def extract_subclass_entities_and_labels( + sparql: SparqlEngine, + *, + include_bnodes: bool, + label_batch_size: int = 500, +) -> tuple[list[str], list[str | None]]: + """ + Pipeline: + 1) Query all rdfs:subClassOf triples. + 2) Build a unique set of entity terms from subjects+objects, convert to list. + 3) Fetch rdfs:label for those entities and return an aligned labels list. + + Returns: + entities: list[str] (IRI or "_:bnodeId") + labels: list[str|None], aligned with entities + """ + + subclass_q = f""" +SELECT ?s ?o +WHERE {{ + ?s <{RDFS_SUBCLASS_OF}> ?o . + FILTER(!isLiteral(?o)) + {"FILTER(!isBlank(?s) && !isBlank(?o))" if not include_bnodes else ""} +}} +""" + res = await sparql.query_json(subclass_q) + + entity_keys: set[tuple[str, str]] = set() + for b in _bindings(res): + sk = _term_key(b.get("s") or {}) + ok = _term_key(b.get("o") or {}) + if sk is not None and (include_bnodes or sk[0] != "bnode"): + entity_keys.add(sk) + if ok is not None and (include_bnodes or ok[0] != "bnode"): + entity_keys.add(ok) + + # Deterministic ordering. + entity_key_list = sorted(entity_keys, key=lambda k: (k[0], k[1])) + entities = [_key_to_entity_string(k) for k in entity_key_list] + + # Build label map keyed by term key. + best_label_by_key: dict[tuple[str, str], tuple[int, str]] = {} + + # URIs can be batch-queried via VALUES. 
+ uri_values = [v for (t, v) in entity_key_list if t == "uri"] + for i in range(0, len(uri_values), label_batch_size): + batch = uri_values[i : i + label_batch_size] + values = " ".join(f"<{u}>" for u in batch) + labels_q = f""" +SELECT ?s ?label +WHERE {{ + VALUES ?s {{ {values} }} + ?s <{RDFS_LABEL}> ?label . +}} +""" + lres = await sparql.query_json(labels_q) + for b in _bindings(lres): + sk = _term_key(b.get("s") or {}) + if sk is None or sk[0] != "uri": + continue + label_term = b.get("label") or {} + if label_term.get("type") != "literal": + continue + label_value = label_term.get("value") + if label_value is None: + continue + + score = _label_score(label_term) + prev = best_label_by_key.get(sk) + if prev is None or score > prev[0]: + best_label_by_key[sk] = (score, str(label_value)) + + # Blank nodes can't reliably be addressed by ID across queries, but if enabled we can still + # fetch all bnode labels and filter locally. + if include_bnodes: + bnode_keys = {k for k in entity_key_list if k[0] == "bnode"} + if bnode_keys: + bnode_labels_q = f""" +SELECT ?s ?label +WHERE {{ + ?s <{RDFS_LABEL}> ?label . 
+ FILTER(isBlank(?s)) +}} +""" + blres = await sparql.query_json(bnode_labels_q) + for b in _bindings(blres): + sk = _term_key(b.get("s") or {}) + if sk is None or sk not in bnode_keys: + continue + label_term = b.get("label") or {} + if label_term.get("type") != "literal": + continue + label_value = label_term.get("value") + if label_value is None: + continue + score = _label_score(label_term) + prev = best_label_by_key.get(sk) + if prev is None or score > prev[0]: + best_label_by_key[sk] = (score, str(label_value)) + + labels: list[str | None] = [] + for k in entity_key_list: + item = best_label_by_key.get(k) + labels.append(item[1] if item else None) + + return entities, labels + diff --git a/backend/app/rdf_store.py b/backend/app/rdf_store.py index 925c72c..0e2fa16 100644 --- a/backend/app/rdf_store.py +++ b/backend/app/rdf_store.py @@ -132,3 +132,19 @@ class RDFStore: } ) return out + + def edges_within_nodes(self, *, max_node_id_exclusive: int, limit: int) -> list[dict[str, Any]]: + out: list[dict[str, Any]] = [] + for row in self._edges: + if row.source >= max_node_id_exclusive or row.target >= max_node_id_exclusive: + continue + out.append( + { + "source": row.source, + "target": row.target, + "predicate": row.predicate, + } + ) + if len(out) >= limit: + break + return out diff --git a/backend/app/settings.py b/backend/app/settings.py index 5c02764..4288572 100644 --- a/backend/app/settings.py +++ b/backend/app/settings.py @@ -33,6 +33,7 @@ class Settings(BaseSettings): sparql_timeout_s: float = Field(default=300.0, alias="SPARQL_TIMEOUT_S") sparql_ready_retries: int = Field(default=30, alias="SPARQL_READY_RETRIES") sparql_ready_delay_s: float = Field(default=4.0, alias="SPARQL_READY_DELAY_S") + sparql_ready_timeout_s: float = Field(default=10.0, alias="SPARQL_READY_TIMEOUT_S") # Comma-separated, or "*" (default). 
cors_origins: str = Field(default="*", alias="CORS_ORIGINS") diff --git a/backend/app/sparql_engine.py b/backend/app/sparql_engine.py index 763a7f5..9f41ac3 100644 --- a/backend/app/sparql_engine.py +++ b/backend/app/sparql_engine.py @@ -56,6 +56,7 @@ class AnzoGraphEngine: self.timeout_s = settings.sparql_timeout_s self.ready_retries = settings.sparql_ready_retries self.ready_delay_s = settings.sparql_ready_delay_s + self.ready_timeout_s = settings.sparql_ready_timeout_s self.user = settings.sparql_user self.password = settings.sparql_pass @@ -135,15 +136,34 @@ class AnzoGraphEngine: resp.raise_for_status() async def _wait_ready(self) -> None: + if self._client is None: + raise RuntimeError("AnzoGraphEngine not started") + + # Match the repo's Julia readiness gate: real SPARQL POST + valid JSON parse. + headers = { + "Content-Type": "application/x-www-form-urlencoded", + "Accept": "application/sparql-results+json", + } + if self._auth_header: + headers["Authorization"] = self._auth_header + last_err: Exception | None = None for _ in range(self.ready_retries): try: - # Keep it cheap and JSON-parseable. - await self.query_json("ASK WHERE { ?s ?p ?o }") + resp = await self._client.post( + self.endpoint, + headers=headers, + data={"query": "ASK WHERE { ?s ?p ?o }"}, + timeout=self.ready_timeout_s, + ) + resp.raise_for_status() + # Ensure it's JSON, not HTML/text during boot. + resp.json() return except Exception as e: last_err = e await asyncio.sleep(self.ready_delay_s) + raise RuntimeError(f"AnzoGraph not ready at {self.endpoint}") from last_err diff --git a/docs/anzograph-readiness-julia.md b/docs/anzograph-readiness-julia.md new file mode 100644 index 0000000..e5d5bbb --- /dev/null +++ b/docs/anzograph-readiness-julia.md @@ -0,0 +1,371 @@ +# Waiting for AnzoGraph readiness from Julia (how this repo does it) + +This repo runs a Julia pipeline (`julia/main.jl`) against an AnzoGraph SPARQL endpoint. 
The key problem is that **“container started” ≠ “SPARQL endpoint is ready to accept queries”**. + +So, before the Julia code does anything that depends on SPARQL (like `LOAD <...>` or large `SELECT`s), it explicitly **waits until AnzoGraph is actually responding to a real SPARQL POST request with valid JSON results**. + +This document explains the exact mechanism used here, why it works, and gives copy/paste-ready patterns you can transfer to another project. + +--- + +## 1) Where the waiting happens (pipeline control flow) + +In `julia/main.jl`, the entrypoint calls: + +```julia +# Step 1: Wait for AnzoGraph +wait_for_anzograph() + +# Step 2: Load TTL file +result = sparql_update("LOAD <$SPARQL_DATA_FILE>") +``` + +So the “await” is not a Julia `Task`/`async` wait; it is a **blocking retry loop** that only returns when it can successfully execute a small SPARQL query. + +Reference: `julia/main.jl` defines `wait_for_anzograph()` and calls it from `main()`. + +--- + +## 2) Why this is needed even with Docker Compose `depends_on` + +This repo’s `docker-compose.yml` includes an AnzoGraph `healthcheck`: + +```yaml +anzograph: + healthcheck: + test: ["CMD-SHELL", "curl -f http://localhost:8080/sparql || exit 1"] + interval: 10s + timeout: 5s + retries: 30 + start_period: 60s +``` + +However, `julia-layout` currently depends on `anzograph` with: + +```yaml +depends_on: + anzograph: + condition: service_started +``` + +Meaning: +- Compose will ensure the **container process has started**. +- Compose does **not** guarantee the AnzoGraph HTTP/SPARQL endpoint is ready (unless you use `service_healthy`, and even then a “healthy GET” is not always equivalent to “SPARQL POST works with auth + JSON”). 
+ +So the Julia code includes its own readiness gate to prevent failures like: +- TCP connection refused (port not open yet) +- HTTP endpoint reachable but not fully initialized +- Non-JSON/HTML error responses while the service is still booting + +--- + +## 3) What “ready” means in this repo + +In this repo, “AnzoGraph is ready” means: + +1. An HTTP `POST` to `${SPARQL_HOST}/sparql` succeeds, with headers: + - `Content-Type: application/x-www-form-urlencoded` + - `Accept: application/sparql-results+json` + - `Authorization: Basic ...` +2. The body parses as SPARQL JSON results (`application/sparql-results+json`) + +It does **not** strictly mean: +- Your dataset is already loaded +- The loaded data is fully indexed (that can matter in some systems after `LOAD`) + +This repo uses readiness as a **“SPARQL endpoint is alive and speaking the protocol”** check. + +--- + +## 4) The actual Julia implementation (as in `julia/main.jl`) + +### 4.1 Configuration (endpoint + auth) + +The Julia script builds endpoint and auth from environment variables: + +```julia +const SPARQL_HOST = get(ENV, "SPARQL_HOST", "http://localhost:8080") +const SPARQL_ENDPOINT = "$SPARQL_HOST/sparql" +const SPARQL_USER = get(ENV, "SPARQL_USER", "admin") +const SPARQL_PASS = get(ENV, "SPARQL_PASS", "Passw0rd1") +const AUTH_HEADER = "Basic " * base64encode("$SPARQL_USER:$SPARQL_PASS") +``` + +In Docker Compose for this repo, the Julia container overrides `SPARQL_HOST` to use the service DNS name: + +```yaml +environment: + - SPARQL_HOST=http://anzograph:8080 +``` + +### 4.2 The smoke query used for readiness + +This is the query used in the wait loop: + +```julia +const SMOKE_TEST_QUERY = "SELECT ?s ?p ?o WHERE { ?s ?p ?o } LIMIT 3" +``` + +Notes: +- It’s intentionally small (`LIMIT 3`) to keep the readiness check cheap. +- It returns *some* bindings when data exists, but **even an empty dataset can still return a valid empty result set**. The code treats “valid response” as ready. 
+ +If you want a readiness check that does not depend on any data being present, an `ASK` query is also common: + +```sparql +ASK WHERE { ?s ?p ?o } +``` + +### 4.3 SPARQL query function (request + minimal retry) + +`sparql_query(query; retries=...)` is a generic helper that makes SPARQL POST requests: + +```julia +function sparql_query(query::String; retries::Int=5)::SparqlResult + for attempt in 1:retries + try + response = HTTP.post( + SPARQL_ENDPOINT, + [ + "Content-Type" => "application/x-www-form-urlencoded", + "Accept" => "application/sparql-results+json", + "Authorization" => AUTH_HEADER + ]; + body = "query=" * HTTP.URIs.escapeuri(query) + ) + + if response.status == 200 + json = JSON.parse(String(response.body)) + return SparqlResult(json["results"]["bindings"]) + elseif response.status >= 500 && attempt < retries + sleep(10) + continue + else + error("SPARQL query failed with status $(response.status)") + end + catch e + if attempt < retries + sleep(10) + continue + end + rethrow(e) + end + end + error("SPARQL query failed after $retries attempts") +end +``` + +Important behaviors to preserve when transferring: +- It uses **POST** (not GET) to the SPARQL endpoint. +- It requires a **200** response and successfully parses SPARQL JSON results. 
+- It retries on: + - `>= 500` server errors + - network / protocol / parsing errors (caught exceptions) + +### 4.4 The readiness gate: `wait_for_anzograph` + +This is the “await until ready” logic: + +```julia +function wait_for_anzograph(max_retries::Int=30)::Bool + println("Waiting for AnzoGraph at $SPARQL_ENDPOINT...") + + for attempt in 1:max_retries + try + smoke_result = sparql_query(SMOKE_TEST_QUERY; retries=1) + println(" AnzoGraph is ready (attempt $attempt, smoke rows=$(length(smoke_result.bindings)))") + return true + catch e + println(" Attempt $attempt/$max_retries: $(typeof(e))") + sleep(4) + end + end + + error("AnzoGraph not available after $max_retries attempts") +end +``` + +Why it calls `sparql_query(...; retries=1)`: +- It makes each outer “readiness attempt” a **single** request. +- The outer loop controls cadence (`sleep(4)`) and total wait time. +- This avoids “nested retry loops” (inner sleeps + outer sleeps) that can make waits much longer than intended. + +Time bound in the current implementation: +- `max_retries = 30` +- `sleep(4)` between attempts +- Roughly ~120 seconds of waiting (plus request time). + +--- + +## 5) What failures cause it to keep waiting + +`wait_for_anzograph()` catches any exception thrown by `sparql_query()` and retries. In practice, that includes: + +- **Connection errors** (DNS not ready, connection refused, etc.) +- **Timeouts** (if HTTP request takes too long and the library throws) +- **Non-200 HTTP statuses** that cause `error(...)` +- **Non-JSON / unexpected JSON** responses causing `JSON.parse(...)` to throw + +That last point is a big reason a “real SPARQL request + parse” is stronger than just “ping the port”. 
+ +--- + +## 6) Transferable, self-contained version (recommended pattern) + +If you want to reuse this in another project, it’s usually easier to: +- avoid globals, +- make endpoint/auth explicit, +- use a **time-based timeout** instead of `max_retries` (more robust), +- add request timeouts so the wait loop can’t hang forever on a single request. + +Below is a drop-in module you can copy into your project. + +```julia +module AnzoGraphReady + +using HTTP +using JSON +using Base64 +using Dates + +struct SparqlResult + bindings::Vector{Dict{String, Any}} +end + +function basic_auth_header(user::AbstractString, pass::AbstractString)::String + return "Basic " * base64encode("$(user):$(pass)") +end + +function sparql_query( + endpoint::AbstractString, + auth_header::AbstractString, + query::AbstractString; + retries::Int = 1, + retry_sleep_s::Real = 2, + request_timeout_s::Real = 15, +)::SparqlResult + for attempt in 1:retries + try + response = HTTP.post( + String(endpoint), + [ + "Content-Type" => "application/x-www-form-urlencoded", + "Accept" => "application/sparql-results+json", + "Authorization" => auth_header, + ]; + body = "query=" * HTTP.URIs.escapeuri(String(query)), + readtimeout = request_timeout_s, + ) + + if response.status != 200 + error("SPARQL query failed with status $(response.status)") + end + + parsed = JSON.parse(String(response.body)) + bindings = get(get(parsed, "results", Dict()), "bindings", Any[]) + return SparqlResult(Vector{Dict{String, Any}}(bindings)) + catch e + if attempt < retries + sleep(retry_sleep_s) + continue + end + rethrow(e) + end + end + error("sparql_query: unreachable") +end + +""" +Wait until AnzoGraph responds to a real SPARQL POST with parseable JSON. 
+ +This is the direct analog of this repo's `wait_for_anzograph()`, but with: +- a time-based timeout (`timeout`) +- a request timeout per attempt (`request_timeout_s`) +- simple exponential backoff +""" +function wait_for_anzograph( + endpoint::AbstractString, + auth_header::AbstractString; + timeout::Period = Minute(3), + initial_delay_s::Real = 0.5, + max_delay_s::Real = 5.0, + request_timeout_s::Real = 10.0, + query::AbstractString = "ASK WHERE { ?s ?p ?o }", +)::Nothing + deadline = now() + timeout + delay_s = initial_delay_s + + while now() < deadline + try + # A single attempt: if it succeeds, we declare "ready". + sparql_query( + endpoint, + auth_header, + query; + retries = 1, + request_timeout_s = request_timeout_s, + ) + return + catch + sleep(delay_s) + delay_s = min(max_delay_s, delay_s * 1.5) + end + end + + error("AnzoGraph not available before timeout=$(timeout)") +end + +end # module +``` + +Typical usage (matching this repo’s environment variables): + +```julia +using .AnzoGraphReady + +sparql_host = get(ENV, "SPARQL_HOST", "http://localhost:8080") +endpoint = "$(sparql_host)/sparql" +user = get(ENV, "SPARQL_USER", "admin") +pass = get(ENV, "SPARQL_PASS", "Passw0rd1") + +auth = AnzoGraphReady.basic_auth_header(user, pass) +AnzoGraphReady.wait_for_anzograph(endpoint, auth; timeout=Minute(5)) + +# Now it is safe to LOAD / query. +``` + +--- + +## 7) Optional: waiting for “data is ready” after `LOAD` + +Some systems accept `LOAD` but need time before results show up reliably (indexing / transaction visibility). +If you run into that in your other project, add a second gate after `LOAD`, for example: + +1) load, then +2) poll a query that must be true after load (e.g., “triple count > 0”, or a known IRI exists). 
+ +Example “post-load gate”: + +```julia +post_load_query = """ +SELECT (COUNT(*) AS ?n) +WHERE { ?s ?p ?o } +""" + +res = AnzoGraphReady.sparql_query(endpoint, auth, post_load_query; retries=1) +# Parse `?n` out of bindings and require it to be > 0; retry until it is. +``` + +(This repo does not currently enforce “non-empty”; it only enforces “SPARQL is working”.) + +--- + +## 8) Practical checklist when transferring to another project + +- Make readiness checks hit the **real SPARQL POST** path you will use in production. +- Require a **valid JSON parse**, not just “port open”. +- Add **per-request timeouts**, so a single hung request cannot hang the whole pipeline. +- Prefer **time-based overall timeout** for predictable behavior in CI. +- Keep the query **cheap** (`ASK` or `LIMIT 1/3`). +- If you use Docker Compose healthchecks, consider also using `depends_on: condition: service_healthy`, but still keep the in-app wait as a safety net (it’s closer to the real contract your code needs). +