feat: complete Phase 1 - vocabulary expansion & DriftProbe infrastructure

- CLI: nyx-probe scan with --summary/--delta/--full flags
- DriftProbe: training safety with Gini coefficient + Angular Drift
- Vocabulary: 54 terms (30 nimmerverse + 24 German philosophical)
- Sentinels: ANCHOR/BRIDGE/CANARY/TARGET monitoring system

Key findings:
- German philosophical terms: 37.5% depth≥2 hit rate (vs 3.3% nimmerverse)
- Super Cluster validated: heart cross-lang sim = 1.000
- Isolated Zone confirmed: being EN↔DE sim = 0.195
- Gini signature: Philosophy ~0.5 (diffuse), Technical ~0.8 (sparse)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-06 22:39:03 +01:00
parent 9853f4767b
commit f640dbdd65
29 changed files with 6164 additions and 1 deletions
--- a/nyx_probing/analysis/readiness_scorer.py
+++ b/nyx_probing/analysis/readiness_scorer.py
@@ -0,0 +1,221 @@
+"""
+Readiness Scorer: Combines surface and echo probes into curriculum guidance.
+
+Outputs:
+- HIGH: Ready for direct training / state machine
+- MEDIUM: Needs scaffolding or bridging concepts
+- LOW: Requires foundational work first
+"""
+from typing import Optional, List
+from dataclasses import dataclass
+
+from ..core.model import NyxModel
+from ..core.probe_result import (
+    SurfaceProbeResult,
+    EchoProbeResult,
+    ReadinessResult,
+    ReadinessLevel,
+    EchoType,
+)
+from ..probes.surface_probe import SurfaceProbe, CompletionCategory
+from ..probes.echo_probe import EchoProbe
+
+
+# Recommended curriculum action for each readiness level.
+# ReadinessScorer.score() looks the action up by the level it computed.
+ACTIONS: dict[ReadinessLevel, str] = {
+    ReadinessLevel.HIGH: "state_machine",    # Direct training
+    ReadinessLevel.MEDIUM: "scaffolding",    # Bridge concepts first
+    ReadinessLevel.LOW: "foundational",      # Build from scratch
+}
+
+
+class ReadinessScorer:
+    """
+    Combines surface + echo probes to assess curriculum readiness.
+
+    A term is ready for training when:
+    1. Surface: Coherent associations (not scattered/random)
+    2. Echo: Can expand beyond surface (depth > 0)
+    3. Valley: In a productive valley (prose/philosophy, not just code)
+
+    The exact thresholds are documented on ``_calculate_level``.
+    """
+
+    # Valley labels treated as "code". The dominant valley is compared
+    # against both the enum member and its plain-string spelling, so every
+    # branch must test against this same set.
+    _CODE_VALLEYS = (CompletionCategory.CODE, 'code')
+
+    # Valley labels for which a shallow term is still worth scaffolding.
+    _PROSE_VALLEYS = (CompletionCategory.PROSE, 'prose', 'definition')
+
+    # Maximum number of reasoning characters shown per term in reports.
+    _REASONING_PREVIEW_CHARS = 60
+
+    def __init__(
+        self,
+        model: NyxModel,
+        surface_runs: int = 3,
+        echo_rounds: int = 3,
+        max_new_tokens: int = 50,
+    ):
+        """
+        Build the two probes that feed the readiness score.
+
+        Args:
+            model: Model handed to both probes.
+            surface_runs: Forwarded to SurfaceProbe as ``num_runs``.
+            echo_rounds: Forwarded to EchoProbe as ``max_rounds``.
+            max_new_tokens: Forwarded to both probes as ``max_new_tokens``.
+        """
+        self.model = model
+        self.surface_probe = SurfaceProbe(
+            model,
+            num_runs=surface_runs,
+            max_new_tokens=max_new_tokens,
+        )
+        self.echo_probe = EchoProbe(
+            model,
+            max_rounds=echo_rounds,
+            max_new_tokens=max_new_tokens,
+        )
+
+    def score(self, term: str) -> ReadinessResult:
+        """
+        Assess readiness of a term for curriculum.
+
+        Args:
+            term: Word or phrase to assess
+
+        Returns:
+            ReadinessResult with level, action, and supporting evidence
+        """
+        # Run both probes
+        surface = self.surface_probe.probe(term)
+        echo = self.echo_probe.probe(term)
+
+        # Classify valley from surface probe
+        classification = self.surface_probe.classify_completions(surface)
+        dominant_valley = classification['dominant']
+
+        # Calculate composite score
+        level, reasoning = self._calculate_level(
+            surface=surface,
+            echo=echo,
+            dominant_valley=dominant_valley,
+        )
+
+        return ReadinessResult(
+            term=term,
+            level=level,
+            action=ACTIONS[level],
+            surface=surface,
+            echo=echo,
+            reasoning=reasoning,
+        )
+
+    def _calculate_level(
+        self,
+        surface: SurfaceProbeResult,
+        echo: EchoProbeResult,
+        dominant_valley: str,
+    ) -> tuple[ReadinessLevel, str]:
+        """
+        Calculate readiness level based on probe results.
+
+        Rules, checked in this order (first match wins):
+        - HIGH: depth >= 2 AND coherence >= 0.4 AND valley is not code
+        - HIGH: depth >= 3, regardless of coherence or valley
+        - MEDIUM: depth >= 1 AND valley is prose/definition
+        - MEDIUM: coherence >= 0.5 AND valley is not code
+        - LOW: everything else
+
+        Args:
+            surface: Surface probe result; a missing coherence score is
+                treated as 0.0.
+            echo: Echo probe result supplying ``depth`` and ``echo_types``.
+            dominant_valley: Dominant completion category of the surface
+                probe, as an enum member or its plain-string label.
+
+        Returns:
+            Tuple of (level, reasoning), where reasoning is a
+            "; "-separated list of the observations behind the level.
+        """
+        depth = echo.depth
+        coherence = surface.coherence_score or 0.0
+
+        # Count echo types
+        echo_types = list(echo.echo_types)
+        expands = sum(1 for t in echo_types if t == EchoType.EXPANDS)
+        collapses = sum(1 for t in echo_types if t == EchoType.COLLAPSE)
+        circulars = sum(1 for t in echo_types if t == EchoType.CIRCULAR)
+
+        # One definition of "code valley" for every branch, so a plain
+        # 'code' label cannot slip into HIGH while being rejected elsewhere.
+        in_code_valley = dominant_valley in self._CODE_VALLEYS
+
+        # Build reasoning
+        reasons = []
+
+        # HIGH: Good depth + coherence + productive valley
+        if depth >= 2 and coherence >= 0.4 and not in_code_valley:
+            reasons.append(f"depth={depth} (strong conceptual expansion)")
+            reasons.append(f"coherence={coherence:.2f} (consistent associations)")
+            reasons.append(f"valley={dominant_valley} (productive for training)")
+            return ReadinessLevel.HIGH, "; ".join(reasons)
+
+        # HIGH: Exceptional depth even with lower coherence
+        if depth >= 3:
+            reasons.append(f"depth={depth} (exceptional expansion)")
+            # Only claim "all" when every recorded echo really expanded.
+            if expands == len(echo_types):
+                reasons.append(f"all {expands} echoes expand")
+            else:
+                reasons.append(f"{expands} of {len(echo_types)} echoes expand")
+            return ReadinessLevel.HIGH, "; ".join(reasons)
+
+        # MEDIUM: Some depth in a prose-like valley
+        if depth >= 1:
+            # Kept in `reasons` even when the valley check below fails, so
+            # the depth observation also shows up in later reasoning.
+            reasons.append(f"depth={depth} (some expansion capability)")
+            if dominant_valley in self._PROSE_VALLEYS:
+                reasons.append(f"valley={dominant_valley} (trainable with scaffolding)")
+                return ReadinessLevel.MEDIUM, "; ".join(reasons)
+
+        # MEDIUM: Good coherence outside of code
+        if coherence >= 0.5 and not in_code_valley:
+            reasons.append(f"coherence={coherence:.2f} (consistent surface)")
+            reasons.append(f"valley={dominant_valley}")
+            reasons.append("but limited depth - needs bridging concepts")
+            return ReadinessLevel.MEDIUM, "; ".join(reasons)
+
+        # LOW: Trapped in code, circular, or incoherent
+        if in_code_valley:
+            reasons.append("valley=CODE (trapped in technical patterns)")
+        if circulars >= 2:
+            reasons.append(f"{circulars} circular echoes (surface-only knowledge)")
+        if collapses >= 1:
+            reasons.append(f"{collapses} collapses (unstable representations)")
+        if coherence < 0.4:
+            reasons.append(f"coherence={coherence:.2f} (scattered associations)")
+
+        reasoning = "; ".join(reasons) if reasons else "insufficient depth and coherence"
+        return ReadinessLevel.LOW, reasoning
+
+    def score_batch(self, terms: List[str]) -> List[ReadinessResult]:
+        """Score multiple terms, returning results in input order."""
+        return [self.score(term) for term in terms]
+
+    def summary(self, result: ReadinessResult) -> str:
+        """
+        Generate human-readable summary.
+
+        Args:
+            result: A scored term.
+
+        Returns:
+            Five-line text block: level, action, surface, echo, reasoning.
+            Missing probe results or a missing coherence score are shown
+            as "N/A".
+        """
+        symbols = {
+            ReadinessLevel.HIGH: "🟢",
+            ReadinessLevel.MEDIUM: "🟡",
+            ReadinessLevel.LOW: "🔴",
+        }
+
+        if result.surface:
+            coherence = result.surface.coherence_score
+            # coherence_score may be None; formatting None with ".2f"
+            # would raise TypeError.
+            if coherence is None:
+                surface_summary = "coherence=N/A"
+            else:
+                surface_summary = f"coherence={coherence:.2f}"
+        else:
+            surface_summary = "N/A"
+        echo_summary = f"depth={result.echo.depth}" if result.echo else "N/A"
+
+        lines = [
+            f"{symbols[result.level]} {result.term}: {result.level.value}",
+            f"  Action: {result.action}",
+            f"  Surface: {surface_summary}",
+            f"  Echo: {echo_summary}",
+            f"  Reasoning: {result.reasoning}",
+        ]
+        return "\n".join(lines)
+
+    @classmethod
+    def _reasoning_preview(cls, reasoning: str) -> str:
+        """Shorten reasoning for a report line, adding "..." only if cut."""
+        limit = cls._REASONING_PREVIEW_CHARS
+        if len(reasoning) <= limit:
+            return reasoning
+        return f"{reasoning[:limit]}..."
+
+    def curriculum_report(self, results: List[ReadinessResult]) -> str:
+        """
+        Generate curriculum planning report.
+
+        Args:
+            results: Scored terms to group by readiness level.
+
+        Returns:
+            Multi-line report listing HIGH, MEDIUM and LOW terms (the
+            latter two with a shortened reasoning) and a summary line.
+        """
+        high = [r for r in results if r.level == ReadinessLevel.HIGH]
+        medium = [r for r in results if r.level == ReadinessLevel.MEDIUM]
+        low = [r for r in results if r.level == ReadinessLevel.LOW]
+
+        lines = [
+            "=" * 60,
+            "CURRICULUM READINESS REPORT",
+            "=" * 60,
+            "",
+            f"🟢 HIGH ({len(high)} terms) - Ready for state machine:",
+        ]
+        lines.extend(f"    • {r.term}" for r in high)
+
+        lines.extend([
+            "",
+            f"🟡 MEDIUM ({len(medium)} terms) - Need scaffolding:",
+        ])
+        lines.extend(
+            f"    • {r.term}: {self._reasoning_preview(r.reasoning)}"
+            for r in medium
+        )
+
+        lines.extend([
+            "",
+            f"🔴 LOW ({len(low)} terms) - Require foundational work:",
+        ])
+        lines.extend(
+            f"    • {r.term}: {self._reasoning_preview(r.reasoning)}"
+            for r in low
+        )
+
+        lines.extend([
+            "",
+            "=" * 60,
+            f"Summary: {len(high)}/{len(results)} ready, {len(medium)} scaffolding, {len(low)} foundational",
+            "=" * 60,
+        ])
+
+        return "\n".join(lines)