# visualizador_instanciados/backend/app/pipelines/graph_snapshot.py

from __future__ import annotations

from typing import Any

from ..graph_export import edge_retrieval_query, graph_from_sparql_bindings
from ..models import GraphResponse
from ..sparql_engine import SparqlEngine
from ..settings import Settings
from .layout_dag_radial import CycleError, level_synchronous_kahn_layers, radial_positions_from_layers


# Full IRI of the rdfs:label property; used as the predicate when querying
# human-readable labels for nodes.
RDFS_LABEL = "http://www.w3.org/2000/01/rdf-schema#label"


def _bindings(res: dict[str, Any]) -> list[dict[str, Any]]:
    return (((res.get("results") or {}).get("bindings")) or [])


def _label_score(label_binding: dict[str, Any]) -> int:
    # Prefer English, then no-language, then anything else.
    lang = (label_binding.get("xml:lang") or "").lower()
    if lang == "en":
        return 3
    if lang == "":
        return 2
    return 1


# Characters the SPARQL grammar forbids inside an IRIREF (`<...>`), in addition
# to every code point <= U+0020 (control characters and space).
_IRIREF_FORBIDDEN_CHARS = frozenset('<>"{}|^`\\')


def _is_embeddable_iri(iri: str) -> bool:
    """Return True when *iri* is non-empty and can be written verbatim inside `<...>`."""
    return bool(iri) and not any(ch <= " " or ch in _IRIREF_FORBIDDEN_CHARS for ch in iri)


async def _fetch_rdfs_labels_for_iris(
    sparql: SparqlEngine,
    iris: list[str],
    *,
    batch_size: int = 500,
) -> dict[str, str]:
    """Look up the preferred rdfs:label for each IRI, querying in batches.

    Args:
        sparql: Engine used to run the label queries via ``query_json``.
        iris: IRIs to look up. Duplicates are queried once; empty strings and
            IRIs containing characters that cannot appear inside ``<...>`` are
            skipped, so one malformed IRI cannot break (or alter) the query
            for the rest of its batch.
        batch_size: Maximum number of IRIs per ``VALUES`` clause.

    Returns:
        Mapping from IRI to its best-scoring literal label (see
        ``_label_score``). IRIs without a literal label are absent. Among
        labels with equal score the first one returned by the engine wins.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # dict.fromkeys de-duplicates while preserving the caller's order.
    candidates = [iri for iri in dict.fromkeys(iris) if _is_embeddable_iri(iri)]

    best: dict[str, tuple[int, str]] = {}

    for start in range(0, len(candidates), batch_size):
        batch = candidates[start : start + batch_size]
        values = " ".join(f"<{iri}>" for iri in batch)
        q = f"""
SELECT ?s ?label
WHERE {{
  VALUES ?s {{ {values} }}
  ?s <{RDFS_LABEL}> ?label .
}}
"""
        res = await sparql.query_json(q)
        for b in _bindings(res):
            s = (b.get("s") or {}).get("value")
            label_term = b.get("label") or {}
            # Only literal labels are usable; skip rows without a subject.
            if not s or label_term.get("type") != "literal":
                continue
            label_value = label_term.get("value")
            if label_value is None:
                continue
            score = _label_score(label_term)
            prev = best.get(s)
            # Strictly greater: an equally-scored later label never replaces
            # the one already chosen.
            if prev is None or score > prev[0]:
                best[s] = (score, str(label_value))

    return {iri: lbl for iri, (_, lbl) in best.items()}


async def fetch_graph_snapshot(
    sparql: SparqlEngine,
    *,
    settings: Settings,
    node_limit: int,
    edge_limit: int,
) -> GraphResponse:
    """
    Fetch a graph snapshot (nodes + edges) via SPARQL, independent of whether the
    underlying engine is RDFLib or AnzoGraph.

    Every node is given ``x``/``y`` coordinates so the frontend does not need
    to run a layout, and URI nodes are given a ``label`` when an rdfs:label
    literal is found for them.

    Args:
        sparql: Engine used for the edge query and the label lookups.
        settings: Supplies ``include_bnodes`` for the query/export and the
            backend details reported in the response metadata.
        node_limit: Passed to ``graph_from_sparql_bindings`` and echoed in the
            metadata.
        edge_limit: Passed to ``edge_retrieval_query`` and echoed in the
            metadata.

    Returns:
        A ``GraphResponse`` with the positioned nodes, the edges and metadata.

    Raises:
        CycleError: Propagated from ``level_synchronous_kahn_layers``; it is
            re-raised with a sample of up to 20 IRIs of the remaining nodes.
    """
    edges_q = edge_retrieval_query(edge_limit=edge_limit, include_bnodes=settings.include_bnodes)
    res = await sparql.query_json(edges_q)
    nodes, edges = graph_from_sparql_bindings(
        _bindings(res),
        node_limit=node_limit,
        include_bnodes=settings.include_bnodes,
    )

    # Add positions so the frontend doesn't need to run a layout.
    #
    # We are exporting only rdfs:subClassOf triples. In the exported edges:
    #   source = subclass, target = superclass
    # For hierarchical layout we invert edges to:
    #   superclass -> subclass
    hier_edges: list[tuple[int, int]] = []
    for edge in edges:
        source = edge.get("source")
        target = edge.get("target")
        try:
            subclass_id = int(source)
            superclass_id = int(target)
        except (TypeError, ValueError):
            # Edge endpoints that are missing or not integer-like cannot be
            # placed in the hierarchy; leave them out of the layout input.
            continue
        hier_edges.append((superclass_id, subclass_id))

    try:
        layers = level_synchronous_kahn_layers(node_count=len(nodes), edges=hier_edges)
    except CycleError as cycle_err:
        # Add a small URI sample to aid debugging. This is best-effort: ids
        # that cannot be resolved to a node are simply left out of the sample.
        sample: list[str] = []
        for nid in cycle_err.remaining_node_ids[:20]:
            try:
                sample.append(str(nodes[nid].get("iri")))
            except (IndexError, KeyError, TypeError, AttributeError):
                continue
        raise CycleError(
            processed=cycle_err.processed,
            total=cycle_err.total,
            remaining_node_ids=cycle_err.remaining_node_ids,
            remaining_iri_sample=sample or None,
        ) from None

    # Deterministic order within each ring/layer for stable layouts.
    id_to_iri = [str(n.get("iri", "")) for n in nodes]
    for layer in layers:
        layer.sort(key=lambda nid: id_to_iri[nid])

    xs, ys = radial_positions_from_layers(node_count=len(nodes), layers=layers)
    for i, node in enumerate(nodes):
        node["x"] = float(xs[i])
        node["y"] = float(ys[i])

    # Attach labels for URI nodes (blank nodes remain label-less).
    uri_nodes = [n for n in nodes if n.get("termType") == "uri"]
    if uri_nodes:
        iris = [str(n["iri"]) for n in uri_nodes if isinstance(n.get("iri"), str)]
        label_by_iri = await _fetch_rdfs_labels_for_iris(sparql, iris)
        for n in uri_nodes:
            iri = n.get("iri")
            if isinstance(iri, str) and iri in label_by_iri:
                n["label"] = label_by_iri[iri]

    # Only the field that matches the configured backend is populated.
    meta = GraphResponse.Meta(
        backend=sparql.name,
        ttl_path=settings.ttl_path if settings.graph_backend == "rdflib" else None,
        sparql_endpoint=settings.effective_sparql_endpoint() if settings.graph_backend == "anzograph" else None,
        include_bnodes=settings.include_bnodes,
        node_limit=node_limit,
        edge_limit=edge_limit,
        nodes=len(nodes),
        edges=len(edges),
    )
    return GraphResponse(nodes=nodes, edges=edges, meta=meta)