nyx-probing/nyx_probing/analysis/readiness_scorer.py
dafit f640dbdd65 feat: complete Phase 1 - vocabulary expansion & DriftProbe infrastructure
- CLI: nyx-probe scan with --summary/--delta/--full flags
- DriftProbe: training safety with Gini coefficient + Angular Drift
- Vocabulary: 54 terms (30 nimmerverse + 24 German philosophical)
- Sentinels: ANCHOR/BRIDGE/CANARY/TARGET monitoring system

Key findings:
- German philosophical terms: 37.5% depth≥2 hit rate (vs 3.3% nimmerverse)
- Super Cluster validated: "heart" cross-lang sim = 1.000
- Isolated Zone confirmed: "being" EN↔DE sim = 0.195
- Gini signature: Philosophy ~0.5 (diffuse), Technical ~0.8 (sparse)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-06 22:39:03 +01:00
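
The DriftProbe metrics named above (Gini coefficient, Angular Drift) are not part of the file below; as a point of reference, here is a minimal sketch of how such quantities are conventionally computed, assuming NumPy. The function names are illustrative, not the nyx-probing API.

import numpy as np

def gini(values: np.ndarray) -> float:
    # Gini coefficient of a non-negative 1-D array: 0.0 for a perfectly uniform
    # distribution, approaching 1.0 as mass concentrates in a few entries.
    x = np.sort(np.abs(values).astype(float))
    n = x.size
    if n == 0 or x.sum() == 0.0:
        return 0.0
    lorenz = np.cumsum(x) / x.sum()
    return float((n + 1 - 2 * lorenz.sum()) / n)

def angular_drift(a: np.ndarray, b: np.ndarray) -> float:
    # Angle in radians between two embedding snapshots of the same term; 0.0 = no drift.
    cos = float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
    return float(np.arccos(np.clip(cos, -1.0, 1.0)))

The cross-language similarity figures in the findings ("heart" = 1.000, "being" = 0.195) read like plain cosine similarities, i.e. the cos value above before the arccos.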


"""
Readiness Scorer: Combines surface and echo probes into curriculum guidance.
Outputs:
- HIGH: Ready for direct training / state machine
- MEDIUM: Needs scaffolding or bridging concepts
- LOW: Requires foundational work first
"""
from typing import Optional, List
from dataclasses import dataclass

from ..core.model import NyxModel
from ..core.probe_result import (
    SurfaceProbeResult,
    EchoProbeResult,
    ReadinessResult,
    ReadinessLevel,
    EchoType,
)
from ..probes.surface_probe import SurfaceProbe, CompletionCategory
from ..probes.echo_probe import EchoProbe

# Recommended actions for each readiness level
ACTIONS = {
    ReadinessLevel.HIGH: "state_machine",    # Direct training
    ReadinessLevel.MEDIUM: "scaffolding",    # Bridge concepts
    ReadinessLevel.LOW: "foundational",      # Build from scratch
}


class ReadinessScorer:
    """
    Combines surface + echo probes to assess curriculum readiness.

    A term is ready for training when:
    1. Surface: Coherent associations (not scattered/random)
    2. Echo: Can expand beyond surface (depth > 0)
    3. Valley: In a productive valley (prose/philosophy, not just code)
    """

    def __init__(
        self,
        model: NyxModel,
        surface_runs: int = 3,
        echo_rounds: int = 3,
        max_new_tokens: int = 50,
    ):
        self.model = model
        self.surface_probe = SurfaceProbe(
            model,
            num_runs=surface_runs,
            max_new_tokens=max_new_tokens,
        )
        self.echo_probe = EchoProbe(
            model,
            max_rounds=echo_rounds,
            max_new_tokens=max_new_tokens,
        )

    def score(self, term: str) -> ReadinessResult:
        """
        Assess readiness of a term for curriculum.

        Args:
            term: Word or phrase to assess

        Returns:
            ReadinessResult with level, action, and supporting evidence
        """
        # Run both probes
        surface = self.surface_probe.probe(term)
        echo = self.echo_probe.probe(term)

        # Classify valley from surface probe
        classification = self.surface_probe.classify_completions(surface)
        dominant_valley = classification['dominant']

        # Calculate composite score
        level, reasoning = self._calculate_level(
            surface=surface,
            echo=echo,
            dominant_valley=dominant_valley,
        )

        return ReadinessResult(
            term=term,
            level=level,
            action=ACTIONS[level],
            surface=surface,
            echo=echo,
            reasoning=reasoning,
        )

    def _calculate_level(
        self,
        surface: SurfaceProbeResult,
        echo: EchoProbeResult,
        dominant_valley: str,
    ) -> tuple[ReadinessLevel, str]:
        """
        Calculate readiness level based on probe results.

        Heuristics:
        - HIGH: (depth >= 2 AND coherence >= 0.4 AND not pure code) OR depth >= 3
        - MEDIUM: depth >= 1, OR coherence >= 0.5 outside the code valley
        - LOW: everything else
        """
        depth = echo.depth
        coherence = surface.coherence_score or 0.0
        # eos_ratio is currently not used by the heuristics below
        eos_ratio = surface.hit_eos_count / len(surface.completions) if surface.completions else 0

        # Count echo types
        expands = sum(1 for t in echo.echo_types if t == EchoType.EXPANDS)
        collapses = sum(1 for t in echo.echo_types if t == EchoType.COLLAPSE)
        circulars = sum(1 for t in echo.echo_types if t == EchoType.CIRCULAR)

        # Build reasoning
        reasons = []

        # HIGH: Good depth + coherence + productive valley
        if depth >= 2 and coherence >= 0.4:
            if dominant_valley not in [CompletionCategory.CODE]:
                reasons.append(f"depth={depth} (strong conceptual expansion)")
                reasons.append(f"coherence={coherence:.2f} (consistent associations)")
                reasons.append(f"valley={dominant_valley} (productive for training)")
                return ReadinessLevel.HIGH, "; ".join(reasons)

        # HIGH: Exceptional depth even with lower coherence
        if depth >= 3:
            reasons.append(f"depth={depth} (exceptional expansion)")
            reasons.append(f"all {expands} echoes expand")
            return ReadinessLevel.HIGH, "; ".join(reasons)

        # MEDIUM: Some depth or good coherence in prose
        if depth >= 1:
            reasons.append(f"depth={depth} (some expansion capability)")
            if dominant_valley in [CompletionCategory.PROSE, 'prose', 'definition']:
                reasons.append(f"valley={dominant_valley} (trainable with scaffolding)")
            return ReadinessLevel.MEDIUM, "; ".join(reasons)

        if coherence >= 0.5 and dominant_valley not in [CompletionCategory.CODE, 'code']:
            reasons.append(f"coherence={coherence:.2f} (consistent surface)")
            reasons.append(f"valley={dominant_valley}")
            reasons.append("but limited depth - needs bridging concepts")
            return ReadinessLevel.MEDIUM, "; ".join(reasons)

        # LOW: Trapped in code, circular, or incoherent
        if dominant_valley in [CompletionCategory.CODE, 'code']:
            reasons.append("valley=CODE (trapped in technical patterns)")
        if circulars >= 2:
            reasons.append(f"{circulars} circular echoes (surface-only knowledge)")
        if collapses >= 1:
            reasons.append(f"{collapses} collapses (unstable representations)")
        if coherence < 0.4:
            reasons.append(f"coherence={coherence:.2f} (scattered associations)")
        return ReadinessLevel.LOW, "; ".join(reasons) if reasons else "insufficient depth and coherence"
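
    # Worked examples of the heuristics above (illustrative numbers, not real probe output):
    #   depth=2, coherence=0.55, valley=prose -> first HIGH branch  -> HIGH   ("state_machine")
    #   depth=1, coherence=0.30, valley=code  -> depth >= 1 branch  -> MEDIUM ("scaffolding")
    #   depth=0, coherence=0.25, valley=code  -> no branch matches  -> LOW    ("foundational")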

    def score_batch(self, terms: List[str]) -> List[ReadinessResult]:
        """Score multiple terms."""
        return [self.score(term) for term in terms]

    def summary(self, result: ReadinessResult) -> str:
        """Generate human-readable summary."""
        symbols = {
            ReadinessLevel.HIGH: "🟢",
            ReadinessLevel.MEDIUM: "🟡",
            ReadinessLevel.LOW: "🔴",
        }
        # Guard against a missing surface result or a None coherence score
        surface_summary = (
            f"coherence={result.surface.coherence_score:.2f}"
            if result.surface and result.surface.coherence_score is not None
            else "N/A"
        )
        echo_summary = f"depth={result.echo.depth}" if result.echo else "N/A"

        lines = [
            f"{symbols[result.level]} {result.term}: {result.level.value}",
            f" Action: {result.action}",
            f" Surface: {surface_summary}",
            f" Echo: {echo_summary}",
            f" Reasoning: {result.reasoning}",
        ]
        return "\n".join(lines)

    def curriculum_report(self, results: List[ReadinessResult]) -> str:
        """Generate curriculum planning report."""
        high = [r for r in results if r.level == ReadinessLevel.HIGH]
        medium = [r for r in results if r.level == ReadinessLevel.MEDIUM]
        low = [r for r in results if r.level == ReadinessLevel.LOW]

        lines = [
            "=" * 60,
            "CURRICULUM READINESS REPORT",
            "=" * 60,
            "",
            f"🟢 HIGH ({len(high)} terms) - Ready for state machine:",
        ]
        for r in high:
            lines.append(f"{r.term}")

        lines.extend([
            "",
            f"🟡 MEDIUM ({len(medium)} terms) - Need scaffolding:",
        ])
        for r in medium:
            lines.append(f"{r.term}: {r.reasoning[:60]}...")

        lines.extend([
            "",
            f"🔴 LOW ({len(low)} terms) - Require foundational work:",
        ])
        for r in low:
            lines.append(f"{r.term}: {r.reasoning[:60]}...")

        lines.extend([
            "",
            "=" * 60,
            f"Summary: {len(high)}/{len(results)} ready, {len(medium)} scaffolding, {len(low)} foundational",
            "=" * 60,
        ])
        return "\n".join(lines)
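
A minimal usage sketch for the scorer above. NyxModel itself lives in nyx_probing/core/model.py and its construction is not shown in this file, so that step is left abstract; the example terms are taken from the commit's findings.

# Illustrative usage (not part of readiness_scorer.py)
model = ...  # a loaded NyxModel; construction lives in nyx_probing.core.model
scorer = ReadinessScorer(model, surface_runs=3, echo_rounds=3)

result = scorer.score("heart")
print(scorer.summary(result))              # 🟢/🟡/🔴 block with action and reasoning

batch = scorer.score_batch(["heart", "being"])
print(scorer.curriculum_report(batch))     # HIGH / MEDIUM / LOW curriculum plan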