Graph access via SPARQL
This commit is contained in:
94
backend/app/graph_export.py
Normal file
94
backend/app/graph_export.py
Normal file
@@ -0,0 +1,94 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
|
||||
def edge_retrieval_query(*, edge_limit: int, include_bnodes: bool) -> str:
|
||||
bnode_filter = "" if include_bnodes else "FILTER(!isBlank(?s) && !isBlank(?o))"
|
||||
return f"""
|
||||
SELECT ?s ?p ?o
|
||||
WHERE {{
|
||||
?s ?p ?o .
|
||||
FILTER(!isLiteral(?o))
|
||||
FILTER(?p NOT IN (
|
||||
<http://www.w3.org/2000/01/rdf-schema#label>,
|
||||
<http://www.w3.org/2004/02/skos/core#prefLabel>,
|
||||
<http://www.w3.org/2004/02/skos/core#altLabel>
|
||||
))
|
||||
{bnode_filter}
|
||||
}}
|
||||
LIMIT {edge_limit}
|
||||
"""
|
||||
|
||||
|
||||
def graph_from_sparql_bindings(
|
||||
bindings: list[dict[str, Any]],
|
||||
*,
|
||||
node_limit: int,
|
||||
include_bnodes: bool,
|
||||
) -> tuple[list[dict[str, object]], list[dict[str, object]]]:
|
||||
"""
|
||||
Convert SPARQL JSON results bindings into:
|
||||
nodes: [{id, termType, iri, label}]
|
||||
edges: [{source, target, predicate}]
|
||||
|
||||
IDs are assigned densely (0..N-1) based on first occurrence in bindings.
|
||||
"""
|
||||
|
||||
node_id_by_key: dict[tuple[str, str], int] = {}
|
||||
node_meta: list[tuple[str, str]] = [] # (termType, iri)
|
||||
out_edges: list[dict[str, object]] = []
|
||||
|
||||
def term_to_key_and_iri(term: dict[str, Any]) -> tuple[tuple[str, str], tuple[str, str]] | None:
|
||||
t = term.get("type")
|
||||
v = term.get("value")
|
||||
if not t or v is None:
|
||||
return None
|
||||
if t == "literal":
|
||||
return None
|
||||
if t == "bnode":
|
||||
if not include_bnodes:
|
||||
return None
|
||||
# SPARQL JSON uses bnode identifiers without the "_:" prefix; we normalize to "_:id".
|
||||
return (("bnode", str(v)), ("bnode", f"_:{v}"))
|
||||
# Default to "uri".
|
||||
return (("uri", str(v)), ("uri", str(v)))
|
||||
|
||||
def get_or_add(term: dict[str, Any]) -> int | None:
|
||||
out = term_to_key_and_iri(term)
|
||||
if out is None:
|
||||
return None
|
||||
key, meta = out
|
||||
existing = node_id_by_key.get(key)
|
||||
if existing is not None:
|
||||
return existing
|
||||
if len(node_meta) >= node_limit:
|
||||
return None
|
||||
nid = len(node_meta)
|
||||
node_id_by_key[key] = nid
|
||||
node_meta.append(meta)
|
||||
return nid
|
||||
|
||||
for b in bindings:
|
||||
s_term = b.get("s") or {}
|
||||
o_term = b.get("o") or {}
|
||||
p_term = b.get("p") or {}
|
||||
|
||||
sid = get_or_add(s_term)
|
||||
oid = get_or_add(o_term)
|
||||
if sid is None or oid is None:
|
||||
continue
|
||||
|
||||
pred = p_term.get("value")
|
||||
if not pred:
|
||||
continue
|
||||
|
||||
out_edges.append({"source": sid, "target": oid, "predicate": str(pred)})
|
||||
|
||||
out_nodes = [
|
||||
{"id": i, "termType": term_type, "iri": iri, "label": None}
|
||||
for i, (term_type, iri) in enumerate(node_meta)
|
||||
]
|
||||
|
||||
return out_nodes, out_edges
|
||||
|
||||
@@ -5,6 +5,7 @@ from contextlib import asynccontextmanager
|
||||
from fastapi import FastAPI, HTTPException, Query
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
|
||||
from .graph_export import edge_retrieval_query, graph_from_sparql_bindings
|
||||
from .models import EdgesResponse, GraphResponse, NodesResponse, SparqlQueryRequest, StatsResponse
|
||||
from .rdf_store import RDFStore
|
||||
from .sparql_engine import AnzoGraphEngine, RdflibEngine, SparqlEngine, create_sparql_engine
|
||||
@@ -161,87 +162,13 @@ async def graph(
|
||||
) -> GraphResponse:
|
||||
sparql: SparqlEngine = app.state.sparql
|
||||
|
||||
if settings.graph_backend == "rdflib":
|
||||
store: RDFStore = app.state.store
|
||||
return GraphResponse(
|
||||
nodes=store.node_slice(offset=0, limit=node_limit),
|
||||
edges=store.edge_slice(offset=0, limit=edge_limit),
|
||||
)
|
||||
|
||||
# AnzoGraph mode: return a simple subgraph by pulling the first N triples.
|
||||
assert isinstance(sparql, AnzoGraphEngine)
|
||||
|
||||
edges_bnode_filter = "" if settings.include_bnodes else "FILTER(!isBlank(?s) && !isBlank(?o))"
|
||||
edges_q = f"""
|
||||
SELECT ?s ?p ?o
|
||||
WHERE {{
|
||||
?s ?p ?o .
|
||||
FILTER(!isLiteral(?o))
|
||||
FILTER(?p NOT IN (
|
||||
<http://www.w3.org/2000/01/rdf-schema#label>,
|
||||
<http://www.w3.org/2004/02/skos/core#prefLabel>,
|
||||
<http://www.w3.org/2004/02/skos/core#altLabel>
|
||||
))
|
||||
{edges_bnode_filter}
|
||||
}}
|
||||
LIMIT {edge_limit}
|
||||
"""
|
||||
|
||||
# Use SPARQL for graph export in BOTH modes so callers don't care which backend is in use.
|
||||
edges_q = edge_retrieval_query(edge_limit=edge_limit, include_bnodes=settings.include_bnodes)
|
||||
res = await sparql.query_json(edges_q)
|
||||
bindings = (((res.get("results") or {}).get("bindings")) or [])
|
||||
|
||||
node_id_by_key: dict[tuple[str, str], int] = {}
|
||||
node_meta: list[tuple[str, str]] = [] # (termType, iri)
|
||||
out_edges: list[dict[str, object]] = []
|
||||
|
||||
def _term_to_key_and_iri(term: dict[str, str]) -> tuple[tuple[str, str], tuple[str, str]] | None:
|
||||
t = term.get("type")
|
||||
v = term.get("value")
|
||||
if not t or v is None:
|
||||
return None
|
||||
if t == "literal":
|
||||
return None
|
||||
if t == "bnode" and not settings.include_bnodes:
|
||||
return None
|
||||
if t == "bnode":
|
||||
return (("bnode", v), ("bnode", f"_:{v}"))
|
||||
# Default to "uri".
|
||||
return (("uri", v), ("uri", v))
|
||||
|
||||
def _get_or_add(term: dict[str, str]) -> int | None:
|
||||
out = _term_to_key_and_iri(term)
|
||||
if out is None:
|
||||
return None
|
||||
key, meta = out
|
||||
existing = node_id_by_key.get(key)
|
||||
if existing is not None:
|
||||
return existing
|
||||
if len(node_meta) >= node_limit:
|
||||
return None
|
||||
nid = len(node_meta)
|
||||
node_id_by_key[key] = nid
|
||||
node_meta.append(meta)
|
||||
return nid
|
||||
|
||||
for b in bindings:
|
||||
s_term = b.get("s") or {}
|
||||
o_term = b.get("o") or {}
|
||||
p_term = b.get("p") or {}
|
||||
|
||||
sid = _get_or_add(s_term)
|
||||
oid = _get_or_add(o_term)
|
||||
if sid is None or oid is None:
|
||||
continue
|
||||
|
||||
pred = p_term.get("value")
|
||||
if not pred:
|
||||
continue
|
||||
|
||||
out_edges.append({"source": sid, "target": oid, "predicate": pred})
|
||||
|
||||
out_nodes = [
|
||||
{"id": i, "termType": term_type, "iri": iri, "label": None}
|
||||
for i, (term_type, iri) in enumerate(node_meta)
|
||||
]
|
||||
|
||||
return GraphResponse(nodes=out_nodes, edges=out_edges)
|
||||
nodes, edges = graph_from_sparql_bindings(
|
||||
bindings,
|
||||
node_limit=node_limit,
|
||||
include_bnodes=settings.include_bnodes,
|
||||
)
|
||||
return GraphResponse(nodes=nodes, edges=edges)
|
||||
|
||||
1
backend/app/pipelines/__init__.py
Normal file
1
backend/app/pipelines/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
|
||||
153
backend/app/pipelines/subclass_labels.py
Normal file
153
backend/app/pipelines/subclass_labels.py
Normal file
@@ -0,0 +1,153 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
from ..sparql_engine import SparqlEngine
|
||||
|
||||
RDFS_SUBCLASS_OF = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
|
||||
RDFS_LABEL = "http://www.w3.org/2000/01/rdf-schema#label"
|
||||
|
||||
|
||||
def _bindings(res: dict[str, Any]) -> list[dict[str, Any]]:
|
||||
return (((res.get("results") or {}).get("bindings")) or [])
|
||||
|
||||
|
||||
def _term_key(term: dict[str, Any]) -> tuple[str, str] | None:
|
||||
t = term.get("type")
|
||||
v = term.get("value")
|
||||
if not t or v is None:
|
||||
return None
|
||||
if t == "literal":
|
||||
return None
|
||||
if t == "bnode":
|
||||
return ("bnode", str(v))
|
||||
return ("uri", str(v))
|
||||
|
||||
|
||||
def _key_to_entity_string(key: tuple[str, str]) -> str:
|
||||
t, v = key
|
||||
if t == "bnode":
|
||||
return f"_:{v}"
|
||||
return v
|
||||
|
||||
|
||||
def _label_score(binding: dict[str, Any]) -> int:
|
||||
"""
|
||||
Higher is better.
|
||||
Prefer English, then no-language, then anything else.
|
||||
"""
|
||||
lang = (binding.get("xml:lang") or "").lower()
|
||||
if lang == "en":
|
||||
return 3
|
||||
if lang == "":
|
||||
return 2
|
||||
return 1
|
||||
|
||||
|
||||
async def extract_subclass_entities_and_labels(
|
||||
sparql: SparqlEngine,
|
||||
*,
|
||||
include_bnodes: bool,
|
||||
label_batch_size: int = 500,
|
||||
) -> tuple[list[str], list[str | None]]:
|
||||
"""
|
||||
Pipeline:
|
||||
1) Query all rdfs:subClassOf triples.
|
||||
2) Build a unique set of entity terms from subjects+objects, convert to list.
|
||||
3) Fetch rdfs:label for those entities and return an aligned labels list.
|
||||
|
||||
Returns:
|
||||
entities: list[str] (IRI or "_:bnodeId")
|
||||
labels: list[str|None], aligned with entities
|
||||
"""
|
||||
|
||||
subclass_q = f"""
|
||||
SELECT ?s ?o
|
||||
WHERE {{
|
||||
?s <{RDFS_SUBCLASS_OF}> ?o .
|
||||
FILTER(!isLiteral(?o))
|
||||
{"FILTER(!isBlank(?s) && !isBlank(?o))" if not include_bnodes else ""}
|
||||
}}
|
||||
"""
|
||||
res = await sparql.query_json(subclass_q)
|
||||
|
||||
entity_keys: set[tuple[str, str]] = set()
|
||||
for b in _bindings(res):
|
||||
sk = _term_key(b.get("s") or {})
|
||||
ok = _term_key(b.get("o") or {})
|
||||
if sk is not None and (include_bnodes or sk[0] != "bnode"):
|
||||
entity_keys.add(sk)
|
||||
if ok is not None and (include_bnodes or ok[0] != "bnode"):
|
||||
entity_keys.add(ok)
|
||||
|
||||
# Deterministic ordering.
|
||||
entity_key_list = sorted(entity_keys, key=lambda k: (k[0], k[1]))
|
||||
entities = [_key_to_entity_string(k) for k in entity_key_list]
|
||||
|
||||
# Build label map keyed by term key.
|
||||
best_label_by_key: dict[tuple[str, str], tuple[int, str]] = {}
|
||||
|
||||
# URIs can be batch-queried via VALUES.
|
||||
uri_values = [v for (t, v) in entity_key_list if t == "uri"]
|
||||
for i in range(0, len(uri_values), label_batch_size):
|
||||
batch = uri_values[i : i + label_batch_size]
|
||||
values = " ".join(f"<{u}>" for u in batch)
|
||||
labels_q = f"""
|
||||
SELECT ?s ?label
|
||||
WHERE {{
|
||||
VALUES ?s {{ {values} }}
|
||||
?s <{RDFS_LABEL}> ?label .
|
||||
}}
|
||||
"""
|
||||
lres = await sparql.query_json(labels_q)
|
||||
for b in _bindings(lres):
|
||||
sk = _term_key(b.get("s") or {})
|
||||
if sk is None or sk[0] != "uri":
|
||||
continue
|
||||
label_term = b.get("label") or {}
|
||||
if label_term.get("type") != "literal":
|
||||
continue
|
||||
label_value = label_term.get("value")
|
||||
if label_value is None:
|
||||
continue
|
||||
|
||||
score = _label_score(label_term)
|
||||
prev = best_label_by_key.get(sk)
|
||||
if prev is None or score > prev[0]:
|
||||
best_label_by_key[sk] = (score, str(label_value))
|
||||
|
||||
# Blank nodes can't reliably be addressed by ID across queries, but if enabled we can still
|
||||
# fetch all bnode labels and filter locally.
|
||||
if include_bnodes:
|
||||
bnode_keys = {k for k in entity_key_list if k[0] == "bnode"}
|
||||
if bnode_keys:
|
||||
bnode_labels_q = f"""
|
||||
SELECT ?s ?label
|
||||
WHERE {{
|
||||
?s <{RDFS_LABEL}> ?label .
|
||||
FILTER(isBlank(?s))
|
||||
}}
|
||||
"""
|
||||
blres = await sparql.query_json(bnode_labels_q)
|
||||
for b in _bindings(blres):
|
||||
sk = _term_key(b.get("s") or {})
|
||||
if sk is None or sk not in bnode_keys:
|
||||
continue
|
||||
label_term = b.get("label") or {}
|
||||
if label_term.get("type") != "literal":
|
||||
continue
|
||||
label_value = label_term.get("value")
|
||||
if label_value is None:
|
||||
continue
|
||||
score = _label_score(label_term)
|
||||
prev = best_label_by_key.get(sk)
|
||||
if prev is None or score > prev[0]:
|
||||
best_label_by_key[sk] = (score, str(label_value))
|
||||
|
||||
labels: list[str | None] = []
|
||||
for k in entity_key_list:
|
||||
item = best_label_by_key.get(k)
|
||||
labels.append(item[1] if item else None)
|
||||
|
||||
return entities, labels
|
||||
|
||||
@@ -132,3 +132,19 @@ class RDFStore:
|
||||
}
|
||||
)
|
||||
return out
|
||||
|
||||
def edges_within_nodes(self, *, max_node_id_exclusive: int, limit: int) -> list[dict[str, Any]]:
|
||||
out: list[dict[str, Any]] = []
|
||||
for row in self._edges:
|
||||
if row.source >= max_node_id_exclusive or row.target >= max_node_id_exclusive:
|
||||
continue
|
||||
out.append(
|
||||
{
|
||||
"source": row.source,
|
||||
"target": row.target,
|
||||
"predicate": row.predicate,
|
||||
}
|
||||
)
|
||||
if len(out) >= limit:
|
||||
break
|
||||
return out
|
||||
|
||||
@@ -33,6 +33,7 @@ class Settings(BaseSettings):
|
||||
sparql_timeout_s: float = Field(default=300.0, alias="SPARQL_TIMEOUT_S")
|
||||
sparql_ready_retries: int = Field(default=30, alias="SPARQL_READY_RETRIES")
|
||||
sparql_ready_delay_s: float = Field(default=4.0, alias="SPARQL_READY_DELAY_S")
|
||||
sparql_ready_timeout_s: float = Field(default=10.0, alias="SPARQL_READY_TIMEOUT_S")
|
||||
|
||||
# Comma-separated, or "*" (default).
|
||||
cors_origins: str = Field(default="*", alias="CORS_ORIGINS")
|
||||
|
||||
@@ -56,6 +56,7 @@ class AnzoGraphEngine:
|
||||
self.timeout_s = settings.sparql_timeout_s
|
||||
self.ready_retries = settings.sparql_ready_retries
|
||||
self.ready_delay_s = settings.sparql_ready_delay_s
|
||||
self.ready_timeout_s = settings.sparql_ready_timeout_s
|
||||
|
||||
self.user = settings.sparql_user
|
||||
self.password = settings.sparql_pass
|
||||
@@ -135,15 +136,34 @@ class AnzoGraphEngine:
|
||||
resp.raise_for_status()
|
||||
|
||||
async def _wait_ready(self) -> None:
|
||||
if self._client is None:
|
||||
raise RuntimeError("AnzoGraphEngine not started")
|
||||
|
||||
# Match the repo's Julia readiness gate: real SPARQL POST + valid JSON parse.
|
||||
headers = {
|
||||
"Content-Type": "application/x-www-form-urlencoded",
|
||||
"Accept": "application/sparql-results+json",
|
||||
}
|
||||
if self._auth_header:
|
||||
headers["Authorization"] = self._auth_header
|
||||
|
||||
last_err: Exception | None = None
|
||||
for _ in range(self.ready_retries):
|
||||
try:
|
||||
# Keep it cheap and JSON-parseable.
|
||||
await self.query_json("ASK WHERE { ?s ?p ?o }")
|
||||
resp = await self._client.post(
|
||||
self.endpoint,
|
||||
headers=headers,
|
||||
data={"query": "ASK WHERE { ?s ?p ?o }"},
|
||||
timeout=self.ready_timeout_s,
|
||||
)
|
||||
resp.raise_for_status()
|
||||
# Ensure it's JSON, not HTML/text during boot.
|
||||
resp.json()
|
||||
return
|
||||
except Exception as e:
|
||||
last_err = e
|
||||
await asyncio.sleep(self.ready_delay_s)
|
||||
|
||||
raise RuntimeError(f"AnzoGraph not ready at {self.endpoint}") from last_err
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user