Import Solver + neighbors via sparql query

2026-03-04 13:49:14 -03:00
parent d4bfa5f064
commit a75b5b93da
15 changed files with 747 additions and 463 deletions
--- a/backend/app/pipelines/graph_snapshot.py
+++ b/backend/app/pipelines/graph_snapshot.py
@@ -1,10 +1,64 @@
 from __future__ import annotations

+from typing import Any
+
 from ..graph_export import edge_retrieval_query, graph_from_sparql_bindings
 from ..models import GraphResponse
 from ..sparql_engine import SparqlEngine
 from ..settings import Settings
-from .layout_spiral import spiral_positions
+from .layout_dag_radial import CycleError, level_synchronous_kahn_layers, radial_positions_from_layers
+
+
+RDFS_LABEL = "http://www.w3.org/2000/01/rdf-schema#label"
+
+
+def _bindings(res: dict[str, Any]) -> list[dict[str, Any]]:
+    return (((res.get("results") or {}).get("bindings")) or [])
+
+
+def _label_score(label_binding: dict[str, Any]) -> int:
+    # Prefer English, then no-language, then anything else.
+    lang = (label_binding.get("xml:lang") or "").lower()
+    if lang == "en":
+        return 3
+    if lang == "":
+        return 2
+    return 1
+
+
+async def _fetch_rdfs_labels_for_iris(
+    sparql: SparqlEngine,
+    iris: list[str],
+    *,
+    batch_size: int = 500,
+) -> dict[str, str]:
+    best: dict[str, tuple[int, str]] = {}
+
+    for i in range(0, len(iris), batch_size):
+        batch = iris[i : i + batch_size]
+        values = " ".join(f"<{u}>" for u in batch)
+        q = f"""
+SELECT ?s ?label
+WHERE {{
+  VALUES ?s {{ {values} }}
+  ?s <{RDFS_LABEL}> ?label .
+}}
+"""
+        res = await sparql.query_json(q)
+        for b in _bindings(res):
+            s = (b.get("s") or {}).get("value")
+            label_term = b.get("label") or {}
+            if not s or label_term.get("type") != "literal":
+                continue
+            label_value = label_term.get("value")
+            if label_value is None:
+                continue
+            score = _label_score(label_term)
+            prev = best.get(s)
+            if prev is None or score > prev[0]:
+                best[s] = (score, str(label_value))
+
+    return {iri: lbl for iri, (_, lbl) in best.items()}


 async def fetch_graph_snapshot(
@@ -28,11 +82,59 @@ async def fetch_graph_snapshot(
    )

    # Add positions so the frontend doesn't need to run a layout.
-    xs, ys = spiral_positions(len(nodes))
+    #
+    # We are exporting only rdfs:subClassOf triples. In the exported edges:
+    #   source = subclass, target = superclass
+    # For hierarchical layout we invert edges to:
+    #   superclass -> subclass
+    hier_edges: list[tuple[int, int]] = []
+    for e in edges:
+        s = e.get("source")
+        t = e.get("target")
+        try:
+            sid = int(s)  # subclass
+            tid = int(t)  # superclass
+        except Exception:
+            continue
+        hier_edges.append((tid, sid))
+
+    try:
+        layers = level_synchronous_kahn_layers(node_count=len(nodes), edges=hier_edges)
+    except CycleError as e:
+        # Add a small URI sample to aid debugging.
+        sample: list[str] = []
+        for nid in e.remaining_node_ids[:20]:
+            try:
+                sample.append(str(nodes[nid].get("iri")))
+            except Exception:
+                continue
+        raise CycleError(
+            processed=e.processed,
+            total=e.total,
+            remaining_node_ids=e.remaining_node_ids,
+            remaining_iri_sample=sample or None,
+        ) from None
+
+    # Deterministic order within each ring/layer for stable layouts.
+    id_to_iri = [str(n.get("iri", "")) for n in nodes]
+    for layer in layers:
+        layer.sort(key=lambda nid: id_to_iri[nid])
+
+    xs, ys = radial_positions_from_layers(node_count=len(nodes), layers=layers)
    for i, node in enumerate(nodes):
        node["x"] = float(xs[i])
        node["y"] = float(ys[i])

+    # Attach labels for URI nodes (blank nodes remain label-less).
+    uri_nodes = [n for n in nodes if n.get("termType") == "uri"]
+    if uri_nodes:
+        iris = [str(n["iri"]) for n in uri_nodes if isinstance(n.get("iri"), str)]
+        label_by_iri = await _fetch_rdfs_labels_for_iris(sparql, iris)
+        for n in uri_nodes:
+            iri = n.get("iri")
+            if isinstance(iri, str) and iri in label_by_iri:
+                n["label"] = label_by_iri[iri]
+
    meta = GraphResponse.Meta(
        backend=sparql.name,
        ttl_path=settings.ttl_path if settings.graph_backend == "rdflib" else None,
--- a/backend/app/pipelines/layout_dag_radial.py
+++ b/backend/app/pipelines/layout_dag_radial.py
@@ -0,0 +1,141 @@
+from __future__ import annotations
+
+import math
+from collections import deque
+from typing import Iterable, Sequence
+
+
+class CycleError(RuntimeError):
+    """
+    Raised when the requested layout requires a DAG, but a cycle is detected.
+
+    `remaining_node_ids` are the node ids that still had indegree > 0 after Kahn.
+    """
+
+    def __init__(
+        self,
+        *,
+        processed: int,
+        total: int,
+        remaining_node_ids: list[int],
+        remaining_iri_sample: list[str] | None = None,
+    ) -> None:
+        self.processed = int(processed)
+        self.total = int(total)
+        self.remaining_node_ids = remaining_node_ids
+        self.remaining_iri_sample = remaining_iri_sample
+
+        msg = f"Cycle detected in subClassOf graph (processed {self.processed}/{self.total} nodes)."
+        if remaining_iri_sample:
+            msg += f" Example nodes: {', '.join(remaining_iri_sample)}"
+        super().__init__(msg)
+
+
+def level_synchronous_kahn_layers(
+    *,
+    node_count: int,
+    edges: Iterable[tuple[int, int]],
+) -> list[list[int]]:
+    """
+    Level-synchronous Kahn's algorithm:
+    - process the entire current queue as one batch (one layer)
+    - only then enqueue newly-unlocked nodes for the next batch
+
+    `edges` are directed (u -> v).
+    """
+    n = int(node_count)
+    if n <= 0:
+        return []
+
+    adj: list[list[int]] = [[] for _ in range(n)]
+    indeg = [0] * n
+
+    for u, v in edges:
+        if u == v:
+            # Self-loops don't help layout and would trivially violate DAG-ness.
+            continue
+        if not (0 <= u < n and 0 <= v < n):
+            continue
+        adj[u].append(v)
+        indeg[v] += 1
+
+    q: deque[int] = deque(i for i, d in enumerate(indeg) if d == 0)
+    layers: list[list[int]] = []
+
+    processed = 0
+    while q:
+        # Consume the full current queue as a single layer.
+        layer = list(q)
+        q.clear()
+        layers.append(layer)
+
+        for u in layer:
+            processed += 1
+            for v in adj[u]:
+                indeg[v] -= 1
+                if indeg[v] == 0:
+                    q.append(v)
+
+    if processed != n:
+        remaining = [i for i, d in enumerate(indeg) if d > 0]
+        raise CycleError(processed=processed, total=n, remaining_node_ids=remaining)
+
+    return layers
+
+
+def radial_positions_from_layers(
+    *,
+    node_count: int,
+    layers: Sequence[Sequence[int]],
+    max_r: float = 5000.0,
+) -> tuple[list[float], list[float]]:
+    """
+    Assign node positions in concentric rings (one ring per layer).
+
+    - radius increases with layer index
+    - nodes within a layer are placed evenly by angle
+    - each ring gets a "golden-angle" rotation to reduce spoke artifacts
+    """
+    n = int(node_count)
+    if n <= 0:
+        return ([], [])
+
+    xs = [0.0] * n
+    ys = [0.0] * n
+    if not layers:
+        return (xs, ys)
+
+    two_pi = 2.0 * math.pi
+    golden = math.pi * (3.0 - math.sqrt(5.0))
+
+    layer_count = len(layers)
+    denom = float(layer_count + 1)
+
+    for li, layer in enumerate(layers):
+        m = len(layer)
+        if m <= 0:
+            continue
+
+        # Keep everything within ~[-max_r, max_r] like the previous spiral layout.
+        r = ((li + 1) / denom) * max_r
+
+        # Rotate each layer deterministically to avoid radial spokes aligning.
+        offset = (li * golden) % two_pi
+
+        if m == 1:
+            nid = int(layer[0])
+            if 0 <= nid < n:
+                xs[nid] = r * math.cos(offset)
+                ys[nid] = r * math.sin(offset)
+            continue
+
+        step = two_pi / float(m)
+        for j, raw_id in enumerate(layer):
+            nid = int(raw_id)
+            if not (0 <= nid < n):
+                continue
+            t = offset + step * float(j)
+            xs[nid] = r * math.cos(t)
+            ys[nid] = r * math.sin(t)
+
+    return (xs, ys)
--- a/backend/app/pipelines/owl_imports_combiner.py
+++ b/backend/app/pipelines/owl_imports_combiner.py
@@ -0,0 +1,96 @@
+from __future__ import annotations
+
+import logging
+import os
+from pathlib import Path
+from urllib.parse import unquote, urlparse
+
+from rdflib import Graph
+from rdflib.namespace import OWL
+
+
+logger = logging.getLogger(__name__)
+
+
+def _is_http_url(location: str) -> bool:
+    scheme = urlparse(location).scheme.lower()
+    return scheme in {"http", "https"}
+
+
+def _is_file_uri(location: str) -> bool:
+    return urlparse(location).scheme.lower() == "file"
+
+
+def _file_uri_to_path(location: str) -> Path:
+    u = urlparse(location)
+    if u.scheme.lower() != "file":
+        raise ValueError(f"Not a file:// URI: {location!r}")
+    return Path(unquote(u.path))
+
+
+def resolve_output_location(
+    entry_location: str,
+    *,
+    output_location: str | None,
+    output_name: str,
+) -> str:
+    if output_location:
+        return output_location
+
+    if _is_http_url(entry_location):
+        raise ValueError(
+            "COMBINE_ENTRY_LOCATION points to an http(s) URL; set COMBINE_OUTPUT_LOCATION to a writable file path."
+        )
+
+    entry_path = _file_uri_to_path(entry_location) if _is_file_uri(entry_location) else Path(entry_location)
+    return str(entry_path.parent / output_name)
+
+
+def _output_destination_to_path(output_location: str) -> Path:
+    if _is_file_uri(output_location):
+        return _file_uri_to_path(output_location)
+    if _is_http_url(output_location):
+        raise ValueError("Output location must be a local file path (or file:// URI), not http(s).")
+    return Path(output_location)
+
+
+def output_location_to_path(output_location: str) -> Path:
+    return _output_destination_to_path(output_location)
+
+
+def build_combined_graph(entry_location: str) -> Graph:
+    """
+    Recursively loads an RDF document (file path, file:// URI, or http(s) URL) and its
+    owl:imports into a single in-memory graph.
+    """
+    combined_graph = Graph()
+    visited_locations: set[str] = set()
+
+    def resolve_imports(location: str) -> None:
+        if location in visited_locations:
+            return
+        visited_locations.add(location)
+
+        logger.info("Loading ontology: %s", location)
+        try:
+            combined_graph.parse(location=location)
+        except Exception as e:
+            logger.warning("Failed to load %s (%s)", location, e)
+            return
+
+        imports = [str(o) for _, _, o in combined_graph.triples((None, OWL.imports, None))]
+        for imported_location in imports:
+            if imported_location not in visited_locations:
+                resolve_imports(imported_location)
+
+    resolve_imports(entry_location)
+    return combined_graph
+
+
+def serialize_graph_to_ttl(graph: Graph, output_location: str) -> None:
+    output_path = _output_destination_to_path(output_location)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    tmp_path = output_path.with_suffix(output_path.suffix + ".tmp")
+    graph.serialize(destination=str(tmp_path), format="turtle")
+    os.replace(str(tmp_path), str(output_path))
--- a/backend/app/pipelines/selection_neighbors.py
+++ b/backend/app/pipelines/selection_neighbors.py
@@ -0,0 +1,137 @@
+from __future__ import annotations
+
+from typing import Any, Iterable
+
+from ..models import GraphResponse, Node
+from ..sparql_engine import SparqlEngine
+
+
+def _values_term(node: Node) -> str | None:
+    iri = node.iri
+    if node.termType == "uri":
+        return f"<{iri}>"
+    if node.termType == "bnode":
+        if iri.startswith("_:"):
+            return iri
+        return f"_:{iri}"
+    return None
+
+
+def selection_neighbors_query(*, selected_nodes: Iterable[Node], include_bnodes: bool) -> str:
+    values_terms: list[str] = []
+    for n in selected_nodes:
+        t = _values_term(n)
+        if t is None:
+            continue
+        values_terms.append(t)
+
+    if not values_terms:
+        # Caller should avoid running this query when selection is empty, but keep this safe.
+        return "SELECT ?nbr WHERE { FILTER(false) }"
+
+    bnode_filter = "" if include_bnodes else "FILTER(!isBlank(?nbr))"
+    values = " ".join(values_terms)
+
+    # Neighbors are defined as any node directly connected by rdf:type (to owl:Class)
+    # or rdfs:subClassOf, in either direction (treating edges as undirected).
+    return f"""
+PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+PREFIX owl: <http://www.w3.org/2002/07/owl#>
+
+SELECT DISTINCT ?nbr
+WHERE {{
+  VALUES ?sel {{ {values} }}
+  {{
+    ?sel rdf:type ?o .
+    ?o rdf:type owl:Class .
+    BIND(?o AS ?nbr)
+  }}
+  UNION
+  {{
+    ?s rdf:type ?sel .
+    ?sel rdf:type owl:Class .
+    BIND(?s AS ?nbr)
+  }}
+  UNION
+  {{
+    ?sel rdfs:subClassOf ?o .
+    BIND(?o AS ?nbr)
+  }}
+  UNION
+  {{
+    ?s rdfs:subClassOf ?sel .
+    BIND(?s AS ?nbr)
+  }}
+  FILTER(!isLiteral(?nbr))
+  FILTER(?nbr != ?sel)
+  {bnode_filter}
+}}
+"""
+
+
+def _bindings(res: dict[str, Any]) -> list[dict[str, Any]]:
+    return (((res.get("results") or {}).get("bindings")) or [])
+
+
+def _term_key(term: dict[str, Any], *, include_bnodes: bool) -> tuple[str, str] | None:
+    t = term.get("type")
+    v = term.get("value")
+    if not t or v is None:
+        return None
+    if t == "literal":
+        return None
+    if t == "bnode":
+        if not include_bnodes:
+            return None
+        return ("bnode", f"_:{v}")
+    return ("uri", str(v))
+
+
+async def fetch_neighbor_ids_for_selection(
+    sparql: SparqlEngine,
+    *,
+    snapshot: GraphResponse,
+    selected_ids: list[int],
+    include_bnodes: bool,
+) -> list[int]:
+    id_to_node: dict[int, Node] = {n.id: n for n in snapshot.nodes}
+
+    selected_nodes: list[Node] = []
+    selected_id_set: set[int] = set()
+    for nid in selected_ids:
+        if not isinstance(nid, int):
+            continue
+        n = id_to_node.get(nid)
+        if n is None:
+            continue
+        if n.termType == "bnode" and not include_bnodes:
+            continue
+        selected_nodes.append(n)
+        selected_id_set.add(nid)
+
+    if not selected_nodes:
+        return []
+
+    key_to_id: dict[tuple[str, str], int] = {}
+    for n in snapshot.nodes:
+        key_to_id[(n.termType, n.iri)] = n.id
+
+    q = selection_neighbors_query(selected_nodes=selected_nodes, include_bnodes=include_bnodes)
+    res = await sparql.query_json(q)
+
+    neighbor_ids: set[int] = set()
+    for b in _bindings(res):
+        nbr_term = b.get("nbr") or {}
+        key = _term_key(nbr_term, include_bnodes=include_bnodes)
+        if key is None:
+            continue
+        nid = key_to_id.get(key)
+        if nid is None:
+            continue
+        if nid in selected_id_set:
+            continue
+        neighbor_ids.add(nid)
+
+    # Stable ordering for consistent frontend behavior.
+    return sorted(neighbor_ids)