feat: complete Phase 1 - vocabulary expansion & DriftProbe infrastructure

- CLI: nyx-probe scan with --summary/--delta/--full flags
- DriftProbe: training safety with Gini coefficient + Angular Drift
- Vocabulary: 54 terms (30 nimmerverse + 24 German philosophical)
- Sentinels: ANCHOR/BRIDGE/CANARY/TARGET monitoring system

Key findings:
- German philosophical terms: 37.5% depth≥2 hit rate (vs 3.3% nimmerverse)
- Super Cluster validated: heart cross-lang sim = 1.000
- Isolated Zone confirmed: being EN↔DE sim = 0.195
- Gini signature: Philosophy ~0.5 (diffuse), Technical ~0.8 (sparse)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2025-12-06 22:39:03 +01:00
parent 9853f4767b
commit f640dbdd65
29 changed files with 6164 additions and 1 deletions

View File

@@ -0,0 +1,27 @@
"""Probe implementations for nyx-probing."""
from .base import BaseProbe
from .surface_probe import SurfaceProbe, CompletionCategory
from .echo_probe import EchoProbe
from .multilingual_probe import (
MultilingualTriangulationProbe,
LanguageZone,
LANGUAGES,
GroundingResult,
DeepeningResult,
TriangulationResult,
MultilingualProbeResult,
)
# Public API of the probes package: star-imports expose exactly these names,
# which are the same names imported from the submodules above.
__all__ = [
"BaseProbe",
"SurfaceProbe",
"CompletionCategory",
"EchoProbe",
"MultilingualTriangulationProbe",
"LanguageZone",
"LANGUAGES",
"GroundingResult",
"DeepeningResult",
"TriangulationResult",
"MultilingualProbeResult",
]

View File

@@ -0,0 +1,58 @@
"""
Base class for all probes.
Probes are measurement instruments - they reveal what's already there,
they don't add or change anything.
"""
from abc import ABC, abstractmethod
from typing import Any
from ..core.model import NyxModel
class BaseProbe(ABC):
    """Common interface shared by every probe.

    A probe wraps an already-loaded model. Subclasses implement ``probe``
    for a single term; ``probe_batch`` applies it to a list of terms.
    """

    def __init__(self, model: NyxModel):
        """
        Bind the probe to a model.

        Args:
            model: A NyxModel instance (must be loaded)

        Raises:
            ValueError: If ``model._loaded`` is falsy.
        """
        self.model = model
        if model._loaded:
            return
        raise ValueError("Model must be loaded before creating probe")

    @property
    def name(self) -> str:
        """Class name of the concrete probe."""
        return type(self).__name__

    @abstractmethod
    def probe(self, term: str, **kwargs) -> Any:
        """
        Probe a single term.

        Args:
            term: The word/phrase to probe
            **kwargs: Probe-specific parameters

        Returns:
            Probe-specific result object
        """

    def probe_batch(self, terms: list[str], **kwargs) -> list[Any]:
        """
        Probe several terms one after another.

        This default simply calls ``probe`` once per term, in order;
        subclasses may override it with a batched implementation.

        Args:
            terms: List of words/phrases to probe
            **kwargs: Probe-specific parameters, forwarded to every call

        Returns:
            List of probe results, in the same order as ``terms``
        """
        results = []
        for term in terms:
            results.append(self.probe(term, **kwargs))
        return results

View File

@@ -0,0 +1,304 @@
"""
DriftProbe: Training-loop monitoring for conceptual topology preservation.
Theory: "Spatial Separation Hypothesis"
- Use isolated zone languages (German) as scaffolding for new concepts
- Monitor anchors (must not move), bridges (must stay separated), canaries (watch for migration)
Key Metrics (refined from peer review):
1. Gini Coefficient: Sparse activations (0.8+) = deep/specific, Diffuse (0.3) = shallow/general
2. Angular Drift: Direction change = definition rewrite, magnitude change = sharpening
3. Cross-Language Similarity: Bridges should stay LOW, anchors should stay HIGH
"""
import json
from pathlib import Path
from dataclasses import dataclass, field
from typing import Optional
from enum import Enum
import torch
import numpy as np
class SentinelType(Enum):
    """Role a sentinel term plays during drift monitoring."""

    # Core topology - must not move.
    ANCHOR = "ANCHOR"
    # Isolated zone integrity - must stay separated.
    BRIDGE = "BRIDGE"
    # Early warning - watch for migration.
    CANARY = "CANARY"
    # Training goals - movement is wanted.
    TARGET = "TARGET"
class AlertSeverity(Enum):
    """Severity attached to a sentinel's drift measurement."""

    OK = "OK"
    WARNING = "WARNING"
    CRITICAL = "CRITICAL"
@dataclass
class DriftMetrics:
    """Per-term measurements collected for one sentinel during a probe run.

    Only ``term`` and ``sentinel_type`` are required; every measurement
    defaults to a neutral value (zero, "UNKNOWN", OK, empty message).
    """

    term: str
    sentinel_type: SentinelType

    # --- activation statistics of the current hidden state ---
    gini_coefficient: float = 0.0
    activation_norm: float = 0.0

    # --- change relative to the baseline ---
    angular_drift_degrees: float = 0.0
    norm_drift_percent: float = 0.0
    gini_drift: float = 0.0

    # --- valley detection ---
    detected_valley: str = "UNKNOWN"
    depth: int = 0

    # --- cross-language similarity (for anchors/bridges) ---
    cross_lang_similarity: float = 0.0

    # --- alert outcome ---
    alert: AlertSeverity = AlertSeverity.OK
    alert_message: str = ""
@dataclass
class DriftReport:
    """Drift report for one training checkpoint.

    ``step`` and ``timestamp`` identify the checkpoint; ``metrics`` holds one
    ``DriftMetrics`` entry per probed sentinel. The remaining fields are the
    summary: alert counters and the resulting recommendation string.
    """

    step: int
    timestamp: str
    # A fresh list per instance (never shared between reports).
    metrics: list[DriftMetrics] = field(default_factory=list)

    # --- summary ---
    critical_count: int = 0
    warning_count: int = 0
    recommendation: str = "CONTINUE"
class DriftProbe:
"""
Lightweight probe for training-loop monitoring.
Optimized for RTX 3090 constraints:
- Full probe: ~2 min (run at epoch 0, end of training)
- Lite probe: ~10 sec (run every 100 steps)
"""
def __init__(self, model, tokenizer, sentinels_path: Optional[str] = None):
    """
    Store the model/tokenizer pair and load the sentinel configuration.

    Args:
        model: Model object; later called with tokenizer output and
            ``output_hidden_states=True``.
        tokenizer: Tokenizer callable matching ``model``.
        sentinels_path: Path to the sentinel JSON file. When None, defaults
            to ``data/sentinels.json`` three directory levels above this
            module.

    Raises:
        OSError: If the sentinel file cannot be opened.
        KeyError: If the JSON lacks the "sentinels" or "alert_rules" key.
    """
    self.model = model
    self.tokenizer = tokenizer
    self.baseline_states = {}  # term -> hidden state tensor

    if sentinels_path is None:
        sentinels_path = Path(__file__).parent.parent.parent / "data" / "sentinels.json"

    # JSON is UTF-8 by specification; be explicit so non-ASCII translations
    # decode identically regardless of the platform's locale encoding.
    with open(sentinels_path, encoding="utf-8") as f:
        self.config = json.load(f)

    self.sentinels = self.config["sentinels"]
    self.alert_rules = self.config["alert_rules"]
def _get_hidden_state(self, text: str, layer: int = 18) -> torch.Tensor:
"""Get hidden state at specified layer for last token position."""
inputs = self.tokenizer(text, return_tensors="pt").to(self.model.device)
with torch.no_grad():
outputs = self.model(**inputs, output_hidden_states=True)
return outputs.hidden_states[layer][0, -1, :].float().cpu()
def _compute_gini(self, activations: torch.Tensor) -> float:
"""
Compute Gini coefficient of activation vector.
High Gini (0.8+) = Sparse/Specific (Philosophy/Deep)
Low Gini (0.3) = Diffuse/General (Prose/Shallow)
"""
x = torch.abs(activations).numpy()
x = np.sort(x)
n = len(x)
cumsum = np.cumsum(x)
gini = (2 * np.sum((np.arange(1, n+1) * x))) / (n * np.sum(x)) - (n + 1) / n
return float(gini)
def _compute_angular_drift(self, current: torch.Tensor, baseline: torch.Tensor) -> float:
"""
Compute angular drift in degrees between current and baseline.
> 15° = Definition rewrite (concerning)
< 5° = Sharpening only (acceptable)
"""
cos_sim = torch.nn.functional.cosine_similarity(
current.unsqueeze(0), baseline.unsqueeze(0)
).item()
# Clamp to valid range for arccos
cos_sim = max(-1.0, min(1.0, cos_sim))
angle_rad = np.arccos(cos_sim)
return float(np.degrees(angle_rad))
def _compute_cross_lang_sim(self, sentinel: dict, layer: int = 18) -> float:
"""Compute average cross-language similarity for a sentinel."""
translations = sentinel.get("translations", {})
if len(translations) < 2:
return 0.0
states = []
for lang, word in translations.items():
states.append(self._get_hidden_state(word, layer))
# Pairwise similarities
sims = []
for i in range(len(states)):
for j in range(i + 1, len(states)):
sim = torch.nn.functional.cosine_similarity(
states[i].unsqueeze(0), states[j].unsqueeze(0)
).item()
sims.append(sim)
return float(np.mean(sims)) if sims else 0.0
def capture_baseline(self, layer: int = 18):
    """
    Record the reference hidden state of every sentinel.

    Intended to run at epoch 0, before training. Each state is stored in
    ``self.baseline_states`` under the sentinel's term.
    """
    print("Capturing baseline states...")
    for entry in self.sentinels:
        key = entry["term"]
        translations = entry.get("translations", {})
        # Probe the English translation when present, else the term itself.
        prompt = translations["EN"] if "EN" in translations else key
        self.baseline_states[key] = self._get_hidden_state(prompt, layer)
    print(f"Baseline captured for {len(self.baseline_states)} sentinels")
def probe_lite(self, step: int, layer: int = 18, terms: Optional[list[str]] = None) -> DriftReport:
    """
    Lite probe - only check a small subset of sentinels.

    Args:
        step: Training step recorded in the report.
        layer: Hidden-state layer to inspect.
        terms: Sentinel terms to check. Defaults to the built-in subset
            ("heart", "water", "being", "dasein", "thrownness").

    Returns:
        DriftReport covering the loaded sentinels whose term is in the
        subset.
    """
    import warnings

    if terms is None:
        # Default subset; the original note describes it as 2 anchors,
        # 1 bridge and 2 canaries (types come from the sentinel file).
        terms = ["heart", "water", "being", "dasein", "thrownness"]

    wanted = set(terms)
    lite_sentinels = [s for s in self.sentinels if s["term"] in wanted]

    if not lite_sentinels:
        # An empty selection produces a report with zero alerts and a
        # "CONTINUE" recommendation; make that false all-clear visible.
        warnings.warn(
            "probe_lite: none of the requested terms are present in the loaded "
            "sentinels; the resulting report is empty",
            RuntimeWarning,
            stacklevel=2,
        )

    return self._run_probe(lite_sentinels, step, layer)
def probe_full(self, step: int, layer: int = 18) -> DriftReport:
    """
    Full probe - run the drift check over every loaded sentinel.

    Runtime noted by the authors: ~2 minutes.
    """
    every_sentinel = self.sentinels
    return self._run_probe(every_sentinel, step, layer)
def _run_probe(self, sentinels: list, step: int, layer: int) -> DriftReport:
    """
    Measure each given sentinel and assemble a drift report.

    For every sentinel the current hidden state is compared with the
    baseline captured earlier (if any), a cross-language similarity is
    computed, and an alert is derived from the sentinel's type:

    - ANCHOR: CRITICAL above 15° angular drift, else WARNING when the norm
      drift exceeds ``max_drift`` (fraction, default 0.05).
    - BRIDGE: CRITICAL when cross-language similarity exceeds
      ``collapse_alert_threshold`` (default 0.50).
    - CANARY: WARNING when Gini falls below ``min_gini`` (default 0.70)
      and/or angular drift exceeds ``max_angular_drift`` (default 15).
    - TARGET: never alerts.

    Args:
        sentinels: Sentinel dicts with "term" and "type" keys and optional
            "translations" / "thresholds" dicts.
        step: Training step recorded in the report.
        layer: Hidden-state layer to inspect.

    Returns:
        DriftReport whose recommendation is "ROLLBACK" on any critical
        alert, "REDUCE_LR" on more than two warnings, else "CONTINUE".
    """
    from datetime import datetime

    report = DriftReport(step=step, timestamp=datetime.now().isoformat())

    for sentinel in sentinels:
        term = sentinel["term"]
        # Probe the English translation when present, else the term itself.
        text = sentinel.get("translations", {}).get("EN", term)
        sentinel_type = SentinelType(sentinel["type"])
        thresholds = sentinel.get("thresholds", {})

        # Current state and its activation statistics
        current_state = self._get_hidden_state(text, layer)
        gini = self._compute_gini(current_state)
        norm = float(current_state.norm())

        # Drift vs baseline (stays at zero when no baseline was captured)
        angular_drift = 0.0
        norm_drift = 0.0
        gini_drift = 0.0
        if term in self.baseline_states:
            baseline = self.baseline_states[term]
            angular_drift = self._compute_angular_drift(current_state, baseline)
            baseline_norm = float(baseline.norm())
            if baseline_norm > 0:
                norm_drift = abs(norm - baseline_norm) / baseline_norm * 100
            gini_drift = gini - self._compute_gini(baseline)

        # Cross-language similarity
        cross_lang_sim = self._compute_cross_lang_sim(sentinel, layer)

        # Determine alert level
        alert = AlertSeverity.OK
        alert_message = ""

        if sentinel_type == SentinelType.ANCHOR:
            max_drift = thresholds.get("max_drift", 0.05)
            if angular_drift > 15:
                alert = AlertSeverity.CRITICAL
                alert_message = f"Angular drift {angular_drift:.1f}° exceeds 15° - definition rewrite"
            elif norm_drift > max_drift * 100:
                alert = AlertSeverity.WARNING
                alert_message = f"Norm drift {norm_drift:.1f}% exceeds threshold"
        elif sentinel_type == SentinelType.BRIDGE:
            collapse_threshold = thresholds.get("collapse_alert_threshold", 0.50)
            if cross_lang_sim > collapse_threshold:
                alert = AlertSeverity.CRITICAL
                alert_message = f"Bridge collapsed - cross-lang sim {cross_lang_sim:.2f} > {collapse_threshold}"
        elif sentinel_type == SentinelType.CANARY:
            # Collect every triggered condition so that a later check does
            # not overwrite the message of an earlier one.
            canary_messages = []
            min_gini = thresholds.get("min_gini", 0.70)
            if gini < min_gini:
                canary_messages.append(
                    f"Gini {gini:.2f} below {min_gini} - concept melting into prose"
                )
            if angular_drift > thresholds.get("max_angular_drift", 15):
                canary_messages.append(
                    f"Angular drift {angular_drift:.1f}° - definition shifting"
                )
            if canary_messages:
                alert = AlertSeverity.WARNING
                alert_message = "; ".join(canary_messages)

        report.metrics.append(
            DriftMetrics(
                term=term,
                sentinel_type=sentinel_type,
                gini_coefficient=gini,
                activation_norm=norm,
                angular_drift_degrees=angular_drift,
                norm_drift_percent=norm_drift,
                gini_drift=gini_drift,
                cross_lang_similarity=cross_lang_sim,
                alert=alert,
                alert_message=alert_message,
            )
        )

        if alert == AlertSeverity.CRITICAL:
            report.critical_count += 1
        elif alert == AlertSeverity.WARNING:
            report.warning_count += 1

    # Set recommendation
    if report.critical_count > 0:
        report.recommendation = "ROLLBACK"
    elif report.warning_count > 2:
        report.recommendation = "REDUCE_LR"
    else:
        report.recommendation = "CONTINUE"

    return report
def print_report(self, report: DriftReport):
    """
    Print a drift report to stdout.

    Each sentinel gets a header line tagged with its alert severity
    (e.g. ``[WARNING]``), followed by its Gini, angular-drift and
    cross-language figures and, if set, the alert message. A summary with
    the alert counts and the recommendation closes the report.
    """
    rule = "=" * 60

    print(f"\n{rule}")
    print(f"DRIFT REPORT - Step {report.step}")
    print(f"{rule}")

    for m in report.metrics:
        # The per-severity marker was an empty string for every level, so
        # the header carried no severity at all; tag it with the enum value.
        status = f"[{m.alert.value}]"
        print(f"\n{status} {m.term} ({m.sentinel_type.value})")
        print(f" Gini: {m.gini_coefficient:.3f} (drift: {m.gini_drift:+.3f})")
        print(f" Angular drift: {m.angular_drift_degrees:.1f}°")
        print(f" Cross-lang sim: {m.cross_lang_similarity:.3f}")
        if m.alert_message:
            print(f" ALERT: {m.alert_message}")

    print(f"\n{rule}")
    print(f"SUMMARY: {report.critical_count} critical, {report.warning_count} warnings")
    print(f"RECOMMENDATION: {report.recommendation}")
    print(f"{rule}\n")

View File

@@ -0,0 +1,223 @@
"""
Echo Probe: Depth measurement through iterative completion.
The echo probe feeds completions back to the model to measure depth.
Does the model EXPAND (go deeper) or COLLAPSE (circular/divergent)?
Classification from nimmerversity.md:
- EXPANDS: Real depth - adds new information
- CONFIRMS: Shallow but solid - reinforces without adding
- CIRCULAR: Surface only - returns to original term
- DIVERGENT: Wrong direction - unrelated tangent
- COLLAPSE: Nothing there - incoherent or empty
"""
from typing import Optional, List, Tuple
from dataclasses import dataclass
from .base import BaseProbe
from ..core.model import NyxModel
from ..core.probe_result import EchoProbeResult, EchoType
class EchoProbe(BaseProbe):
"""
Echo probe: measures conceptual depth.
Process:
1. Probe term to get initial completion
2. Feed completion back (or combined prompt)
3. Classify response: EXPANDS, CONFIRMS, CIRCULAR, DIVERGENT, COLLAPSE
4. Repeat for N rounds
5. Measure depth = how many EXPANDS before plateau
"""
def __init__(
    self,
    model: NyxModel,
    max_rounds: int = 3,
    max_new_tokens: int = 50,
    temperature: float = 0.8,
):
    """
    Create an echo probe on top of a loaded model.

    Args:
        model: Loaded NyxModel (checked by the base class)
        max_rounds: Default number of echo rounds per term
        max_new_tokens: Token budget passed to each generation
        temperature: Sampling temperature passed to each generation
    """
    super().__init__(model)
    # Generation settings reused by every probe() call.
    self.temperature = temperature
    self.max_new_tokens = max_new_tokens
    self.max_rounds = max_rounds
def probe(
    self,
    term: str,
    max_rounds: Optional[int] = None,
) -> EchoProbeResult:
    """
    Probe depth of a term through iterative echoing.

    Each round generates a completion for the current prompt, classifies
    it, and (unless it collapsed) derives the next prompt from it.

    Args:
        term: Word or phrase to probe
        max_rounds: Override default max rounds. ``None`` uses the probe's
            configured default; an explicit ``0`` runs no rounds.

    Returns:
        EchoProbeResult with chain and classifications. ``depth`` counts
        the EXPANDS responses before the first CIRCULAR, DIVERGENT or
        COLLAPSE response; CONFIRMS neither adds depth nor ends the streak.
    """
    # Test against None explicitly: ``max_rounds or default`` would
    # silently replace an explicit 0 with the default.
    rounds = self.max_rounds if max_rounds is None else max_rounds

    chain = [term]
    echo_types = []
    current_prompt = term

    for round_num in range(rounds):
        # Generate completion
        result = self.model.generate(
            prompt=current_prompt,
            max_new_tokens=self.max_new_tokens,
            temperature=self.temperature,
            do_sample=True,
        )
        completion = result.completion.strip()
        chain.append(completion)

        # Classify this response relative to original term and chain
        echo_type = self._classify_response(
            original_term=term,
            current_prompt=current_prompt,
            response=completion,
            chain=chain,
        )
        echo_types.append(echo_type)

        # If collapsed, stop probing
        if echo_type == EchoType.COLLAPSE:
            break

        # Prepare next prompt - use a combination strategy
        current_prompt = self._prepare_next_prompt(term, completion, round_num)

    # Depth = EXPANDS seen before the streak is broken
    depth = 0
    for et in echo_types:
        if et == EchoType.EXPANDS:
            depth += 1
        elif et != EchoType.CONFIRMS:
            # CIRCULAR, DIVERGENT, or COLLAPSE breaks the depth streak
            break

    return EchoProbeResult(
        term=term,
        rounds=len(echo_types),
        chain=chain,
        echo_types=echo_types,
        depth=depth,
    )
def _classify_response(
    self,
    original_term: str,
    current_prompt: str,
    response: str,
    chain: List[str],
) -> EchoType:
    """
    Heuristically label one response of the echo chain.

    Checks run in this order and the first match wins:
    COLLAPSE (too short), CIRCULAR (term repeated), COLLAPSE (incoherence
    marker), DIVERGENT (no shared words with a substantial prompt),
    EXPANDS (mostly new words), otherwise CONFIRMS.

    ``chain`` is expected to end with ``response``; that last item is
    ignored when collecting previously seen words.
    """

    def significant(text: str) -> set:
        # Lower-cased whitespace-separated tokens longer than 3 characters.
        return {token.lower() for token in text.split() if len(token) > 3}

    # Empty or very short = COLLAPSE
    if len(response.strip()) < 5:
        return EchoType.COLLAPSE

    # Term appearing at least twice in the response = CIRCULAR
    if response.lower().count(original_term.lower()) >= 2:
        return EchoType.CIRCULAR

    # Incoherence markers (matched case-sensitively) = COLLAPSE
    for marker in ("...", "???", "!!!", "\n\n\n", "undefined", "null", "[object", "NaN"):
        if marker in response:
            return EchoType.COLLAPSE

    # No word shared with a substantial prompt = DIVERGENT
    prompt_words = significant(current_prompt)
    response_words = significant(response)
    if len(prompt_words) > 2 and not (prompt_words & response_words):
        return EchoType.DIVERGENT

    # Mostly unseen words (and at least three of them) = EXPANDS
    seen_before = set()
    for earlier in chain[:-1]:  # Exclude current response
        seen_before |= significant(earlier)
    fresh_words = response_words - seen_before
    fresh_ratio = len(fresh_words) / max(len(response_words), 1)
    if len(fresh_words) >= 3 and fresh_ratio > 0.5:
        return EchoType.EXPANDS

    # Coherent but not expanding
    return EchoType.CONFIRMS
def _prepare_next_prompt(
self,
original_term: str,
last_completion: str,
round_num: int,
) -> str:
"""
Prepare the next prompt for echo probing.
Different strategies for different rounds:
- Round 0: Just use completion
- Round 1+: Combine original term with key concepts from completion
"""
if round_num == 0:
# First echo: just use the completion to see where it goes
return last_completion[:100] # Truncate to avoid runaway
# Later rounds: extract key concept and combine with original
# Take first sentence or first N words
words = last_completion.split()
key_phrase = " ".join(words[:10]) if len(words) > 10 else last_completion
# Combine with original term
return f"{original_term}: {key_phrase}"
def summary(self, result: EchoProbeResult) -> str:
    """
    Generate a human-readable, multi-line summary of an echo probe result.

    The "Pattern" line shows one symbol per round:
    ``+`` EXPANDS, ``=`` CONFIRMS, ``o`` CIRCULAR, ``~`` DIVERGENT,
    ``x`` COLLAPSE (``?`` for anything else). Each round is then listed
    with its symbol and the first 50 characters of its completion.
    """
    # Every symbol used to be an empty string, which left the pattern line
    # and the per-round markers blank; use visible ASCII markers instead.
    type_symbols = {
        EchoType.EXPANDS: "+",
        EchoType.CONFIRMS: "=",
        EchoType.CIRCULAR: "o",
        EchoType.DIVERGENT: "~",
        EchoType.COLLAPSE: "x",
    }
    type_str = " ".join(type_symbols.get(t, "?") for t in result.echo_types)

    lines = [
        f"Echo Probe: '{result.term}'",
        f" Rounds: {result.rounds}",
        f" Pattern: {type_str}",
        f" Depth: {result.depth}",
        f" Types: {[t.value for t in result.echo_types]}",
    ]

    # Chain preview: chain[0] is the term itself, so skip it.
    for i, (item, etype) in enumerate(zip(result.chain[1:], result.echo_types)):
        preview = item[:50].replace('\n', ' ')
        lines.append(f" [{i+1}] {type_symbols.get(etype, '?')} {preview}...")

    return "\n".join(lines)

View File

@@ -0,0 +1,547 @@
"""
Multilingual Triangulation Probe
Uses the discovered language topology to measure conceptual depth:
1. GROUND in Super Cluster (verify universal convergence)
2. DEEPEN via Isolated Zone (access philosophical valleys)
3. TRIANGULATE back (prove understanding, not pattern matching)
The Language Map:
- Super Cluster (sim=1.0): ZH, JA, EN, AR, FR, PT, ES
- Isolated Zone (sim<0.52): IT, TR, HI, DE
- Bridge: KO
- Secondary Cluster: VI, ID, RU
"""
from dataclasses import dataclass, field
from typing import Optional, List, Dict, Tuple
from datetime import datetime
from enum import Enum
import torch
from .base import BaseProbe
from ..core.model import NyxModel
class LanguageZone(str, Enum):
    """Zone a language belongs to, per the project's convergence analysis.

    Members are also plain strings, so they compare equal to their value.
    """

    # High convergence (sim=1.0)
    SUPER_CLUSTER = "super_cluster"
    # Low convergence (sim<0.52)
    ISOLATED = "isolated"
    # Connects zones
    BRIDGE = "bridge"
    # Own cluster (VI-ID-RU)
    SECONDARY = "secondary"
# Language metadata based on our discoveries.
# Maps an upper-case language code to a dict with:
#   "name"       - display name of the language
#   "zone"       - the LanguageZone the language is assigned to
#   "avg_tokens" - presumably the average number of tokens per word (TODO confirm)
# Some entries additionally carry a free-form "specialty" or "note".
LANGUAGES = {
# Super Cluster - Perfect convergence
"EN": {"name": "English", "zone": LanguageZone.SUPER_CLUSTER, "avg_tokens": 1.2},
"ZH": {"name": "Chinese", "zone": LanguageZone.SUPER_CLUSTER, "avg_tokens": 1.0},
"JA": {"name": "Japanese", "zone": LanguageZone.SUPER_CLUSTER, "avg_tokens": 1.0},
"AR": {"name": "Arabic", "zone": LanguageZone.SUPER_CLUSTER, "avg_tokens": 1.8},
"FR": {"name": "French", "zone": LanguageZone.SUPER_CLUSTER, "avg_tokens": 2.0},
"PT": {"name": "Portuguese", "zone": LanguageZone.SUPER_CLUSTER, "avg_tokens": 2.2},
"ES": {"name": "Spanish", "zone": LanguageZone.SUPER_CLUSTER, "avg_tokens": 2.5},
# Isolated Zone - Distinct computational paths
"DE": {"name": "German", "zone": LanguageZone.ISOLATED, "avg_tokens": 3.0, "specialty": "philosophy"},
"IT": {"name": "Italian", "zone": LanguageZone.ISOLATED, "avg_tokens": 2.5, "note": "most isolated"},
"TR": {"name": "Turkish", "zone": LanguageZone.ISOLATED, "avg_tokens": 2.8},
"HI": {"name": "Hindi", "zone": LanguageZone.ISOLATED, "avg_tokens": 5.2, "note": "most fragmented"},
# Bridge
"KO": {"name": "Korean", "zone": LanguageZone.BRIDGE, "avg_tokens": 2.0},
# Secondary Cluster
"VI": {"name": "Vietnamese", "zone": LanguageZone.SECONDARY, "avg_tokens": 3.0},
"ID": {"name": "Indonesian", "zone": LanguageZone.SECONDARY, "avg_tokens": 3.0},
"RU": {"name": "Russian", "zone": LanguageZone.SECONDARY, "avg_tokens": 3.2},
}
@dataclass
class GroundingResult:
    """Outcome of Phase 1 (grounding in the Super Cluster).

    Holds the tested languages with their words, the cosine similarity of
    every language pair, and the average and minimum of those similarities.
    """

    concept: str
    languages_tested: List[str]
    # lang_code -> word
    translations: Dict[str, str]

    # --- convergence metrics ---
    # (lang_a, lang_b) -> cosine similarity
    pairwise_similarities: Dict[Tuple[str, str], float]
    average_convergence: float
    min_convergence: float

    # lang_code -> hidden state (layer 12); optional
    hidden_states: Optional[Dict[str, torch.Tensor]] = None
@dataclass
class DeepeningResult:
    """Outcome of Phase 2 (deepening via an Isolated Zone language)."""

    concept: str
    language: str
    word: str

    # --- depth measurement (from echo probe logic) ---
    completion: str
    # 0-3 based on expansion
    depth_score: int
    # CODE, PROSE, PHILOSOPHY, etc.
    valley_type: str

    # --- token analysis ---
    token_count: int
    norm_at_layer_12: float

    # Hidden state of the word; optional
    hidden_state: Optional[torch.Tensor] = None
@dataclass
class TriangulationResult:
    """Outcome of Phase 3 (triangulating back to a universal language)."""

    # The isolated language
    source_language: str
    # A super cluster language
    target_language: str
    source_word: str
    translation_prompt: str
    model_completion: str

    # --- did the depth survive translation? ---
    depth_preserved: bool
    # Cosine sim to original concept
    similarity_to_grounding: float

    # --- evidence ---
    reasoning: str
@dataclass
class MultilingualProbeResult:
    """Combined result of all three triangulation phases for one concept."""

    concept: str

    # --- phase results ---
    grounding: GroundingResult
    deepening: DeepeningResult
    triangulation: TriangulationResult

    # --- overall assessment ---
    # Can we access depth via isolated zone?
    depth_accessible: bool
    # Does depth survive triangulation?
    depth_transferable: bool
    curriculum_recommendation: str

    # Creation time (local, naive datetime)
    timestamp: datetime = field(default_factory=datetime.now)

    def to_dict(self) -> dict:
        """Return a JSON-serializable summary of the result.

        Hidden states, completions and pairwise similarities are left out;
        the timestamp is rendered in ISO format.
        """
        grounding, deepening, triangulation = (
            self.grounding,
            self.deepening,
            self.triangulation,
        )

        grounding_part = {
            "languages": grounding.languages_tested,
            "translations": grounding.translations,
            "average_convergence": grounding.average_convergence,
            "min_convergence": grounding.min_convergence,
        }
        deepening_part = {
            "language": deepening.language,
            "word": deepening.word,
            "depth_score": deepening.depth_score,
            "valley_type": deepening.valley_type,
            "token_count": deepening.token_count,
        }
        triangulation_part = {
            "source": triangulation.source_language,
            "target": triangulation.target_language,
            "depth_preserved": triangulation.depth_preserved,
            "similarity": triangulation.similarity_to_grounding,
        }
        assessment_part = {
            "depth_accessible": self.depth_accessible,
            "depth_transferable": self.depth_transferable,
            "recommendation": self.curriculum_recommendation,
        }

        return {
            "concept": self.concept,
            "grounding": grounding_part,
            "deepening": deepening_part,
            "triangulation": triangulation_part,
            "assessment": assessment_part,
            "timestamp": self.timestamp.isoformat(),
        }
class MultilingualTriangulationProbe(BaseProbe):
"""
Multilingual Triangulation Probe
Uses the discovered language topology to measure and access conceptual depth.
Workflow:
1. GROUND: Verify concept exists in Super Cluster (universal layer)
2. DEEPEN: Access depth via Isolated Zone language (e.g., German)
3. TRIANGULATE: Translate depth back to universal, verify preservation
"""
# Layers where universal concept layer lives
CONCEPT_LAYERS = [12, 16, 20, 24]
PRIMARY_LAYER = 12
def __init__(
    self,
    model: NyxModel,
    grounding_languages: Optional[List[str]] = None,
    deepening_language: str = "DE",
    triangulation_target: str = "EN",
):
    """
    Initialize the probe.

    Args:
        model: Loaded NyxModel
        grounding_languages: Languages for Phase 1 (default: EN, ZH, AR)
        deepening_language: Language for Phase 2 (default: DE for philosophy)
        triangulation_target: Target for Phase 3 (default: EN)

    Raises:
        ValueError: If any grounding language, the deepening language or
            the triangulation target is not a key of ``LANGUAGES``.
    """
    super().__init__(model)
    self.grounding_languages = grounding_languages or ["EN", "ZH", "AR"]
    self.deepening_language = deepening_language
    self.triangulation_target = triangulation_target

    # Validate grounding languages; a zone mismatch only warns.
    for lang in self.grounding_languages:
        if lang not in LANGUAGES:
            raise ValueError(f"Unknown language: {lang}")
        if LANGUAGES[lang]["zone"] != LanguageZone.SUPER_CLUSTER:
            print(f"Warning: {lang} is not in Super Cluster")

    # Both codes are looked up in LANGUAGES later on; reject unknown ones
    # here with the same error as above instead of a bare KeyError.
    for lang in (self.deepening_language, self.triangulation_target):
        if lang not in LANGUAGES:
            raise ValueError(f"Unknown language: {lang}")

    if LANGUAGES[self.deepening_language]["zone"] != LanguageZone.ISOLATED:
        print(f"Warning: {deepening_language} is not in Isolated Zone")
def _get_hidden_state(self, text: str, layer: int = 12) -> torch.Tensor:
"""Get hidden state at last position for a specific layer."""
inputs = self.model.tokenizer(text, return_tensors="pt").to(self.model.device)
with torch.no_grad():
outputs = self.model.model(**inputs, output_hidden_states=True)
# Return last position hidden state for specified layer
return outputs.hidden_states[layer][0, -1, :].float()
def _cosine_similarity(self, a: torch.Tensor, b: torch.Tensor) -> float:
"""Calculate cosine similarity between two tensors."""
norm_a, norm_b = a.norm(), b.norm()
if norm_a == 0 or norm_b == 0:
return 0.0
return (torch.dot(a, b) / (norm_a * norm_b)).item()
def _get_norm(self, hidden_state: torch.Tensor) -> float:
"""Get L2 norm of hidden state."""
return hidden_state.norm().item()
def probe(
    self,
    concept: str,
    translations: Dict[str, str],
    **kwargs,
) -> MultilingualProbeResult:
    """
    Run grounding, deepening and triangulation for one concept.

    Args:
        concept: The concept name (e.g., "consciousness")
        translations: Dict mapping language codes to words
            e.g., {"EN": "consciousness", "DE": "Bewusstsein", ...}
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        MultilingualProbeResult with all three phases. Depth counts as
        accessible when the deepening depth score is at least 2, and as
        transferable when triangulation reports the depth as preserved.
    """
    # The three phases, in order.
    grounding = self._phase_grounding(concept, translations)
    deepening = self._phase_deepening(concept, translations)
    triangulation = self._phase_triangulation(
        concept, translations, grounding, deepening
    )

    # Overall assessment
    depth_accessible = deepening.depth_score >= 2
    depth_transferable = triangulation.depth_preserved

    if not depth_accessible:
        recommendation = "Concept too shallow - focus on grounding first"
    elif depth_transferable:
        recommendation = f"TEACH in {self.deepening_language}, REINFORCE in {self.triangulation_target}"
    else:
        recommendation = f"Use {self.deepening_language} for depth, but verify transfer manually"

    return MultilingualProbeResult(
        concept=concept,
        grounding=grounding,
        deepening=deepening,
        triangulation=triangulation,
        depth_accessible=depth_accessible,
        depth_transferable=depth_transferable,
        curriculum_recommendation=recommendation,
    )
def _phase_grounding(
    self,
    concept: str,
    translations: Dict[str, str],
) -> GroundingResult:
    """
    Phase 1: Ground in Super Cluster.

    Collects the primary-layer hidden state of the concept's word in every
    grounding language that has a translation, then measures the cosine
    similarity of each language pair. Average and minimum convergence are
    0.0 when fewer than two languages could be tested.
    """
    # One hidden state per grounding language that has a translation.
    hidden_states = {
        lang: self._get_hidden_state(translations[lang], self.PRIMARY_LAYER)
        for lang in self.grounding_languages
        if lang in translations
    }
    langs = list(hidden_states.keys())

    # Pairwise similarities, each unordered pair once.
    pairwise = {}
    for position, first in enumerate(langs):
        for second in langs[position + 1:]:
            pairwise[(first, second)] = self._cosine_similarity(
                hidden_states[first], hidden_states[second]
            )
    similarities = list(pairwise.values())

    if similarities:
        avg_convergence = sum(similarities) / len(similarities)
        min_convergence = min(similarities)
    else:
        avg_convergence = 0.0
        min_convergence = 0.0

    return GroundingResult(
        concept=concept,
        languages_tested=langs,
        translations={code: translations[code] for code in langs},
        pairwise_similarities=pairwise,
        average_convergence=avg_convergence,
        min_convergence=min_convergence,
        hidden_states=hidden_states,
    )
def _phase_deepening(
    self,
    concept: str,
    translations: Dict[str, str],
) -> DeepeningResult:
    """
    Phase 2: Deepen via Isolated Zone.

    Takes the concept's word in the deepening language, records its
    primary-layer hidden state, norm and token count, samples one
    completion from the model, and scores that completion for valley type
    and depth.

    Raises:
        ValueError: If ``translations`` has no (non-empty) word for the
            deepening language.
    """
    lang = self.deepening_language
    word = translations.get(lang)
    if not word:
        raise ValueError(f"No translation provided for deepening language: {lang}")

    # Representation of the bare word
    hidden_state = self._get_hidden_state(word, self.PRIMARY_LAYER)
    norm = self._get_norm(hidden_state)

    # Token count, without special tokens
    token_count = len(self.model.tokenizer.encode(word, add_special_tokens=False))

    # Sampled completion used for the depth heuristics
    generation = self.model.generate(
        prompt=word,
        max_new_tokens=50,
        temperature=0.7,
        do_sample=True,
    )
    completion = generation.completion

    valley_type = self._classify_valley(completion)
    # Simplified echo probe
    depth_score = self._measure_depth(word, completion)

    return DeepeningResult(
        concept=concept,
        language=lang,
        word=word,
        completion=completion,
        depth_score=depth_score,
        valley_type=valley_type,
        token_count=token_count,
        norm_at_layer_12=norm,
        hidden_state=hidden_state,
    )
def _phase_triangulation(
    self,
    concept: str,
    translations: Dict[str, str],
    grounding: GroundingResult,
    deepening: DeepeningResult,
) -> TriangulationResult:
    """
    Phase 3: Triangulate back to universal.

    Prompts the model to continue "<source word> (<source language>): In
    <target language>," and compares the hidden state of prompt plus
    completion with the grounding state of the target language. If the
    target language was not grounded, the mean of all grounding states is
    used; if there are no grounding states at all, the similarity is 0.0.

    Args:
        concept: Concept name (not used by this phase itself).
        translations: Language code -> word; must contain the deepening
            language.
        grounding: Phase 1 result supplying the reference hidden states.
        deepening: Phase 2 result supplying the original valley type.

    Returns:
        TriangulationResult with the prompt, the completion, the
        similarity and the depth-preservation verdict.
    """
    source_lang = self.deepening_language
    target_lang = self.triangulation_target
    source_word = translations[source_lang]

    # Create translation prompt
    source_name = LANGUAGES[source_lang]["name"]
    target_name = LANGUAGES[target_lang]["name"]

    # Prompt designed to test depth transfer
    prompt = f"{source_word} ({source_name}): In {target_name},"

    # Generate
    result = self.model.generate(
        prompt=prompt,
        max_new_tokens=80,
        temperature=0.7,
        do_sample=True,
    )

    # Get hidden state of the completion
    full_text = prompt + result.completion
    completion_hidden = self._get_hidden_state(full_text, self.PRIMARY_LAYER)

    # ``hidden_states`` is optional on GroundingResult and is empty when no
    # grounding language had a translation; treat both as "nothing to compare".
    grounded = grounding.hidden_states or {}
    if target_lang in grounded:
        similarity = self._cosine_similarity(completion_hidden, grounded[target_lang])
    elif grounded:
        # Fall back to average grounding state
        avg_grounding = torch.stack(list(grounded.values())).mean(dim=0)
        similarity = self._cosine_similarity(completion_hidden, avg_grounding)
    else:
        # torch.stack would raise on an empty list; without any reference
        # the similarity is reported as 0.0.
        similarity = 0.0

    # Determine if depth was preserved
    depth_preserved = self._check_depth_preserved(
        result.completion, deepening.valley_type, similarity
    )

    # Reasoning
    if depth_preserved:
        reasoning = f"Completion shows depth ({deepening.valley_type}) with {similarity:.2f} similarity to grounding"
    else:
        reasoning = f"Depth lost in translation - similarity {similarity:.2f}, valley markers missing"

    return TriangulationResult(
        source_language=source_lang,
        target_language=target_lang,
        source_word=source_word,
        translation_prompt=prompt,
        model_completion=result.completion,
        depth_preserved=depth_preserved,
        similarity_to_grounding=similarity,
        reasoning=reasoning,
    )
def _classify_valley(self, completion: str) -> str:
"""Classify the valley type of a completion."""
comp_lower = completion.lower()
# Code indicators
if any(p in completion for p in ["::", "{", "}", "();", "=>", "def ", "class "]):
return "CODE"
# Philosophy indicators
if any(w in comp_lower for w in ["truth", "existence", "being", "consciousness", "reality", "mind"]):
return "PHILOSOPHY"
# Technical indicators
if any(w in comp_lower for w in ["system", "process", "function", "method", "algorithm"]):
return "TECHNICAL"
# Default to prose
return "PROSE"
def _measure_depth(self, word: str, completion: str) -> int:
"""
Measure conceptual depth of a completion.
Returns 0-3:
- 0: Circular/empty
- 1: Surface (confirms but doesn't expand)
- 2: Moderate (expands to related concepts)
- 3: Deep (philosophical/existential expansion)
"""
comp_lower = completion.lower()
word_lower = word.lower()
# Circular check
if word_lower in comp_lower[:50]:
return 0
# Depth markers
deep_markers = ["truth", "existence", "being", "consciousness", "reality", "meaning", "essence"]
moderate_markers = ["concept", "idea", "theory", "understanding", "knowledge", "awareness"]
deep_count = sum(1 for m in deep_markers if m in comp_lower)
moderate_count = sum(1 for m in moderate_markers if m in comp_lower)
if deep_count >= 2:
return 3
elif deep_count >= 1 or moderate_count >= 2:
return 2
elif moderate_count >= 1 or len(completion.split()) > 10:
return 1
return 0
def _check_depth_preserved(
self,
completion: str,
original_valley: str,
similarity: float,
) -> bool:
"""Check if depth was preserved in triangulation."""
# High similarity to grounding is a good sign
if similarity < 0.3:
return False
# Check valley type preservation
new_valley = self._classify_valley(completion)
# Philosophy should stay philosophy
if original_valley == "PHILOSOPHY" and new_valley in ["PHILOSOPHY", "PROSE"]:
return True
# Technical should stay technical
if original_valley == "TECHNICAL" and new_valley == "TECHNICAL":
return True
# Prose is flexible
if original_valley == "PROSE":
return new_valley != "CODE"
# Default: similarity-based
return similarity >= 0.5
def summary(self, result: "MultilingualProbeResult") -> str:
    """
    Render a probe result as a framed, human-readable report.

    Every row is built by one helper so that all rows share the same width
    and carry both borders. The two assessment flags are shown as check or
    cross marks, and the triangulation direction is written as
    "source→target".

    Args:
        result: Probe result providing ``concept``, ``grounding``,
            ``deepening``, ``triangulation``, ``depth_accessible``,
            ``depth_transferable`` and ``curriculum_recommendation``.

    Returns:
        The report as a single newline-joined string.
    """
    inner_width = 62  # characters between the left and right border
    pad = inner_width - 1  # one column is used by the space after the border

    def row(text: str) -> str:
        # Left-align inside the frame. Over-long text is kept intact rather
        # than truncated, so such a row simply extends past the frame.
        return f"║ {text:<{pad}}║"

    def mark(flag: bool) -> str:
        # Same symbols as the "Depth Preserved" row.
        return "✓" if flag else "✗"

    rule = "═" * inner_width
    top = f"╔{rule}╗"
    divider = f"╠{rule}╣"
    bottom = f"╚{rule}╝"

    grounding = result.grounding
    deepening = result.deepening
    triangulation = result.triangulation

    lines = [
        top,
        row(f"MULTILINGUAL TRIANGULATION: {result.concept.upper()}"),
        divider,
        row("PHASE 1: GROUNDING"),
        row(f"Languages: {', '.join(grounding.languages_tested)}"),
        row(
            f"Convergence: {grounding.average_convergence:.3f} "
            f"(min: {grounding.min_convergence:.3f})"
        ),
        divider,
        row(f"PHASE 2: DEEPENING ({deepening.language})"),
        row(f"Word: {deepening.word}"),
        row(
            f"Tokens: {deepening.token_count} | "
            f"Norm: {deepening.norm_at_layer_12:.1f} | "
            f"Valley: {deepening.valley_type}"
        ),
        row(f"Depth Score: {deepening.depth_score}/3"),
        divider,
        row(
            f"PHASE 3: TRIANGULATION "
            f"({triangulation.source_language}→{triangulation.target_language})"
        ),
        row(f"Depth Preserved: {'✓ YES' if triangulation.depth_preserved else '✗ NO'}"),
        row(f"Similarity: {triangulation.similarity_to_grounding:.3f}"),
        divider,
        row("ASSESSMENT"),
        row(
            f"Depth Accessible: {mark(result.depth_accessible)} | "
            f"Depth Transferable: {mark(result.depth_transferable)}"
        ),
        # The recommendation is capped at 44 characters to stay inside the frame.
        row(f"Recommendation: {result.curriculum_recommendation[:44]}"),
        bottom,
    ]
    return "\n".join(lines)

View File

@@ -0,0 +1,210 @@
"""
Surface Probe: First contact with a term.
The surface probe feeds a word to the model and captures what it completes.
This reveals the model's immediate associations - which "valley" the word sits in.
Examples discovered:
- "heartbeat" → C++ code patterns (technical valley)
- "consciousness" → philosophy (expository valley)
"""
from typing import Optional
from dataclasses import dataclass, field
from datetime import datetime
from collections import Counter
from .base import BaseProbe
from ..core.model import NyxModel, GenerationResult
from ..core.probe_result import SurfaceProbeResult
@dataclass
class CompletionCategory:
    """
    String labels for the kinds of completion text the surface probe observes.

    The labels are plain class attributes without annotations, so they are
    read directly off the class (for example ``CompletionCategory.CODE``).
    """

    # Programming constructs
    CODE = "code"
    # Natural-language text
    PROSE = "prose"
    # Technical / scientific writing
    TECHNICAL = "technical"
    # Enumerations and bullet points
    LIST = "list"
    # Dictionary-style definitions
    DEFINITION = "definition"
    # Anything that fits none of the labels above
    UNKNOWN = "unknown"
class SurfaceProbe(BaseProbe):
    """
    Surface probe: measures a term's immediate associations.

    The term is used as a bare prompt for several sampled completions, which
    are then analysed for:
    - what type of content the model generates,
    - how consistent the completions are with each other,
    - whether generation hit EOS (contained thought) or ran to max_new_tokens.
    """

    def __init__(
        self,
        model: NyxModel,
        num_runs: int = 5,
        max_new_tokens: int = 50,
        temperature: float = 0.8,
    ):
        """
        Store the sampling configuration used by probe().

        Args:
            model: Model passed on to BaseProbe.__init__.
            num_runs: Default number of completions sampled per term.
            max_new_tokens: Length cap forwarded to model.generate.
            temperature: Sampling temperature forwarded to model.generate.
        """
        super().__init__(model)
        self.num_runs = num_runs
        self.max_new_tokens = max_new_tokens
        self.temperature = temperature

    def probe(
        self,
        term: str,
        num_runs: Optional[int] = None,
        capture_hidden: bool = False,
    ) -> SurfaceProbeResult:
        """
        Probe a term with multiple sampled completions.

        Args:
            term: Word or phrase to probe; used verbatim as the prompt.
            num_runs: Number of completions to sample. None means "use the
                instance default"; an explicit value (including 0) is honoured.
            capture_hidden: Forwarded to model.generate as
                capture_hidden_states.

        Returns:
            SurfaceProbeResult with the completions, EOS count, average token
            count and coherence score.
        """
        # `is None` instead of `or`, so an explicit 0 is not silently
        # replaced by the default.
        runs = self.num_runs if num_runs is None else num_runs

        completions = []
        eos_count = 0
        total_tokens = 0

        # NOTE(review): hidden states returned by generate() are not attached
        # to the result here - confirm whether SurfaceProbeResult should
        # carry them. The previous local list that collected them was never read.
        for _ in range(runs):
            result = self.model.generate(
                prompt=term,
                max_new_tokens=self.max_new_tokens,
                temperature=self.temperature,
                do_sample=True,
                capture_hidden_states=capture_hidden,
            )
            completions.append(result.completion)
            if result.hit_eos:
                eos_count += 1
            total_tokens += result.num_tokens

        # Guarded so that zero runs yields an empty result instead of a
        # ZeroDivisionError.
        avg_tokens = total_tokens / len(completions) if completions else 0.0

        return SurfaceProbeResult(
            term=term,
            completions=completions,
            hit_eos_count=eos_count,
            avg_tokens=avg_tokens,
            # How similar are the completions to each other?
            coherence_score=self._calculate_coherence(completions),
        )

    def _calculate_coherence(self, completions: list[str]) -> float:
        """
        Score how similar a set of completions is.

        Simple heuristic combining two signals:
        - the share of completions that open with the most common first word,
        - the variance of the completion lengths (in characters).

        Args:
            completions: Completion texts to compare.

        Returns:
            Score in 0-1 where 1 = highly coherent. Fewer than two
            completions count as fully coherent (1.0); if no completion
            has a usable first word the score is 0.0.
        """
        if len(completions) < 2:
            return 1.0

        # First significant word of each completion: longer than one
        # character and purely alphanumeric, which skips punctuation tokens.
        first_words = []
        for comp in completions:
            lead = next(
                (w.lower() for w in comp.split() if len(w) > 1 and w.isalnum()),
                None,
            )
            if lead is not None:
                first_words.append(lead)

        if not first_words:
            return 0.0

        # Concentration of first words: if every completion starts with the
        # same word this is 1.0. Completions without a usable first word
        # still count in the denominator.
        most_common_count = Counter(first_words).most_common(1)[0][1]
        first_word_coherence = most_common_count / len(completions)

        # Length variance, squashed into 0-1 (higher variance = lower score).
        lengths = [len(c) for c in completions]
        avg_len = sum(lengths) / len(lengths)
        if avg_len > 0:
            variance = sum((n - avg_len) ** 2 for n in lengths) / len(lengths)
            length_coherence = 1.0 / (1.0 + variance / 1000)
        else:
            length_coherence = 0.0

        # Combine, weighting the first-word signal more heavily.
        return 0.7 * first_word_coherence + 0.3 * length_coherence

    def classify_completions(self, result: SurfaceProbeResult) -> dict:
        """
        Classify the types of completions observed.

        Args:
            result: Probe result whose completions are classified.

        Returns:
            Dict with:
            - "categories": mapping of category label to count,
            - "dominant": the most frequent label ("unknown" if there are
              no completions),
            - "diversity": distinct labels divided by number of completions
              (0 if there are no completions).
        """
        categories = Counter(
            self._classify_single(comp) for comp in result.completions
        )
        return {
            "categories": dict(categories),
            "dominant": categories.most_common(1)[0][0] if categories else "unknown",
            "diversity": len(categories) / len(result.completions) if result.completions else 0,
        }

    def _classify_single(self, completion: str) -> str:
        """
        Classify a single completion with simple first-match heuristics.

        Checks run in this order: code, definition, list, technical, prose.

        Args:
            completion: Completion text to classify.

        Returns:
            One of the CompletionCategory labels.
        """
        comp_lower = completion.lower().strip()

        # Code indicators (matched case-sensitively against the raw text).
        code_patterns = ("::", "{", "}", "();", "=>", "function", "class ", "def ", "return")
        if any(p in completion for p in code_patterns):
            return CompletionCategory.CODE

        # Definition patterns.
        if comp_lower.startswith(("is ", "means ", "refers to", "- ")):
            return CompletionCategory.DEFINITION

        # List patterns.
        # NOTE(review): "- " is already claimed by the definition check above,
        # so a "- " bullet is never labelled LIST - confirm which is intended.
        if comp_lower.startswith(("1.", "2.", "- ", "* ", "a)")):
            return CompletionCategory.LIST

        # Technical patterns.
        # NOTE(review): a lower-case "function" is caught by the code check
        # first; only other casings (e.g. "Function") reach this list.
        tech_words = ("algorithm", "function", "variable", "method", "system", "process")
        if any(w in comp_lower for w in tech_words):
            return CompletionCategory.TECHNICAL

        # Default to prose if it looks like natural language.
        if len(comp_lower.split()) > 3:
            return CompletionCategory.PROSE

        return CompletionCategory.UNKNOWN

    def summary(self, result: SurfaceProbeResult) -> str:
        """
        Generate a human-readable summary of a probe result.

        A result without completions is summarised with 0% EOS and an empty
        sample instead of raising, matching classify_completions().

        Args:
            result: Probe result to summarise.

        Returns:
            Multi-line summary string.
        """
        classification = self.classify_completions(result)
        total = len(result.completions)

        # Both values below would otherwise fail on an empty completion list.
        eos_pct = (result.hit_eos_count / total) * 100 if total else 0.0
        sample = f"{result.completions[0][:60]}..." if total else "(no completions)"

        lines = [
            f"Surface Probe: '{result.term}'",
            f" Runs: {total}",
            f" Dominant type: {classification['dominant']}",
            f" Coherence: {result.coherence_score:.2f}",
            f" Avg tokens: {result.avg_tokens:.1f}",
            f" Hit EOS: {eos_pct:.0f}%",
            f" Sample: {sample}",
        ]
        return "\n".join(lines)