Files
nyx-probing/nyx_probing/probes/multilingual_probe.py
dafit f640dbdd65 feat: complete Phase 1 - vocabulary expansion & DriftProbe infrastructure
- CLI: nyx-probe scan with --summary/--delta/--full flags
- DriftProbe: training safety with Gini coefficient + Angular Drift
- Vocabulary: 54 terms (30 nimmerverse + 24 German philosophical)
- Sentinels: ANCHOR/BRIDGE/CANARY/TARGET monitoring system

Key findings:
- German philosophical terms: 37.5% depth≥2 hit rate (vs 3.3% nimmerverse)
- Super Cluster validated: heart cross-lang sim = 1.000
- Isolated Zone confirmed: being EN↔DE sim = 0.195
- Gini signature: Philosophy ~0.5 (diffuse), Technical ~0.8 (sparse)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-06 22:39:03 +01:00

548 lines
21 KiB
Python

"""
Multilingual Triangulation Probe
Uses the discovered language topology to measure conceptual depth:
1. GROUND in Super Cluster (verify universal convergence)
2. DEEPEN via Isolated Zone (access philosophical valleys)
3. TRIANGULATE back (prove understanding, not pattern matching)
The Language Map:
- Super Cluster (sim=1.0): ZH, JA, EN, AR, FR, PT, ES
- Isolated Zone (sim<0.52): IT, TR, HI, DE
- Bridge: KO
- Secondary Cluster: VI, ID, RU
"""
from dataclasses import dataclass, field
from typing import Optional, List, Dict, Tuple
from datetime import datetime
from enum import Enum
import torch
from .base import BaseProbe
from ..core.model import NyxModel
class LanguageZone(str, Enum):
    """Convergence zone assigned to a language by the topology analysis.

    The ``str`` mixin makes every member compare equal to its string value,
    so members can be used wherever the plain zone name is expected.
    """

    # High convergence (sim=1.0)
    SUPER_CLUSTER = "super_cluster"
    # Low convergence (sim<0.52)
    ISOLATED = "isolated"
    # Connects zones
    BRIDGE = "bridge"
    # Own cluster (VI-ID-RU)
    SECONDARY = "secondary"
# Language metadata based on our discoveries.
# Maps an upper-case language code to a record with:
#   "name"       - English name of the language
#   "zone"       - the LanguageZone the language belongs to
#   "avg_tokens" - presumably the average token count per word for this
#                  language under the model's tokenizer (TODO confirm source)
# Some records carry an optional free-text "specialty" or "note".
LANGUAGES: Dict[str, dict] = {
    # Super Cluster - Perfect convergence
    "EN": {"name": "English", "zone": LanguageZone.SUPER_CLUSTER, "avg_tokens": 1.2},
    "ZH": {"name": "Chinese", "zone": LanguageZone.SUPER_CLUSTER, "avg_tokens": 1.0},
    "JA": {"name": "Japanese", "zone": LanguageZone.SUPER_CLUSTER, "avg_tokens": 1.0},
    "AR": {"name": "Arabic", "zone": LanguageZone.SUPER_CLUSTER, "avg_tokens": 1.8},
    "FR": {"name": "French", "zone": LanguageZone.SUPER_CLUSTER, "avg_tokens": 2.0},
    "PT": {"name": "Portuguese", "zone": LanguageZone.SUPER_CLUSTER, "avg_tokens": 2.2},
    "ES": {"name": "Spanish", "zone": LanguageZone.SUPER_CLUSTER, "avg_tokens": 2.5},
    # Isolated Zone - Distinct computational paths
    "DE": {"name": "German", "zone": LanguageZone.ISOLATED, "avg_tokens": 3.0, "specialty": "philosophy"},
    "IT": {"name": "Italian", "zone": LanguageZone.ISOLATED, "avg_tokens": 2.5, "note": "most isolated"},
    "TR": {"name": "Turkish", "zone": LanguageZone.ISOLATED, "avg_tokens": 2.8},
    "HI": {"name": "Hindi", "zone": LanguageZone.ISOLATED, "avg_tokens": 5.2, "note": "most fragmented"},
    # Bridge
    "KO": {"name": "Korean", "zone": LanguageZone.BRIDGE, "avg_tokens": 2.0},
    # Secondary Cluster
    "VI": {"name": "Vietnamese", "zone": LanguageZone.SECONDARY, "avg_tokens": 3.0},
    "ID": {"name": "Indonesian", "zone": LanguageZone.SECONDARY, "avg_tokens": 3.0},
    "RU": {"name": "Russian", "zone": LanguageZone.SECONDARY, "avg_tokens": 3.2},
}
@dataclass
class GroundingResult:
    """Result from Phase 1: Grounding in Super Cluster.

    Records which grounding languages were compared for a concept and how
    closely their hidden states agree (pairwise cosine similarity).
    """
    concept: str
    # Language codes that were actually compared
    languages_tested: List[str]
    translations: Dict[str, str]  # lang_code -> word
    # Convergence metrics
    # (lang_a, lang_b) -> cosine similarity of the two hidden states
    pairwise_similarities: Dict[Tuple[str, str], float]
    # Mean and minimum over the pairwise similarities
    average_convergence: float
    min_convergence: float
    # Hidden states (layer 12), keyed by language code; optional
    hidden_states: Optional[Dict[str, torch.Tensor]] = None
@dataclass
class DeepeningResult:
    """Result from Phase 2: Deepening via Isolated Zone.

    Captures what the model generated when prompted with the concept's word
    in the deepening language, plus the heuristic scores derived from it.
    """
    concept: str
    language: str  # language code used for deepening
    word: str  # the concept's word in that language
    # Depth measurement (from echo probe logic)
    completion: str  # text the model generated for the word
    depth_score: int  # 0-3 based on expansion
    valley_type: str  # CODE, PROSE, PHILOSOPHY, etc.
    # Token analysis
    token_count: int  # number of tokens the word encodes to
    norm_at_layer_12: float  # L2 norm of the word's hidden state
    # Hidden state
    hidden_state: Optional[torch.Tensor] = None
@dataclass
class TriangulationResult:
    """Result from Phase 3: Triangulation back to universal.

    Describes the translation-style prompt that was issued, what the model
    completed it with, and whether the depth is judged to have survived.
    """
    source_language: str  # The isolated language
    target_language: str  # A super cluster language
    source_word: str  # the concept's word in the source language
    translation_prompt: str  # exact prompt sent to the model
    model_completion: str  # text generated after the prompt
    # Did the depth survive translation?
    depth_preserved: bool
    similarity_to_grounding: float  # Cosine sim to original concept
    # Evidence
    reasoning: str  # human-readable justification for depth_preserved
@dataclass
class MultilingualProbeResult:
    """Combined outcome of the three probe phases for a single concept."""

    concept: str
    # One result object per phase
    grounding: GroundingResult
    deepening: DeepeningResult
    triangulation: TriangulationResult
    # Verdicts derived from the phase results
    depth_accessible: bool  # Can we access depth via isolated zone?
    depth_transferable: bool  # Does depth survive triangulation?
    curriculum_recommendation: str
    # Creation time; defaults to the moment the result object is built
    timestamp: datetime = field(default_factory=datetime.now)

    def to_dict(self) -> dict:
        """Return a plain-dict view of the result.

        Only scalar and list/dict summary fields are exported; hidden-state
        tensors, completions and the pairwise similarity table are left out.
        The timestamp is rendered with ``datetime.isoformat()``.
        """
        ground = self.grounding
        deep = self.deepening
        tri = self.triangulation

        grounding_part = {
            "languages": ground.languages_tested,
            "translations": ground.translations,
            "average_convergence": ground.average_convergence,
            "min_convergence": ground.min_convergence,
        }
        deepening_part = {
            "language": deep.language,
            "word": deep.word,
            "depth_score": deep.depth_score,
            "valley_type": deep.valley_type,
            "token_count": deep.token_count,
        }
        triangulation_part = {
            "source": tri.source_language,
            "target": tri.target_language,
            "depth_preserved": tri.depth_preserved,
            "similarity": tri.similarity_to_grounding,
        }
        assessment_part = {
            "depth_accessible": self.depth_accessible,
            "depth_transferable": self.depth_transferable,
            "recommendation": self.curriculum_recommendation,
        }

        return {
            "concept": self.concept,
            "grounding": grounding_part,
            "deepening": deepening_part,
            "triangulation": triangulation_part,
            "assessment": assessment_part,
            "timestamp": self.timestamp.isoformat(),
        }
class MultilingualTriangulationProbe(BaseProbe):
    """
    Multilingual Triangulation Probe

    Uses the discovered language topology to measure and access conceptual depth.

    Workflow:
    1. GROUND: Verify concept exists in Super Cluster (universal layer)
    2. DEEPEN: Access depth via Isolated Zone language (e.g., German)
    3. TRIANGULATE: Translate depth back to universal, verify preservation
    """
    # Layers where universal concept layer lives
    CONCEPT_LAYERS = [12, 16, 20, 24]
    # Layer whose hidden states the three phases read and compare
    PRIMARY_LAYER = 12
def __init__(
    self,
    model: NyxModel,
    grounding_languages: Optional[List[str]] = None,
    deepening_language: str = "DE",
    triangulation_target: str = "EN",
):
    """
    Initialize the probe.

    Args:
        model: Loaded NyxModel
        grounding_languages: Languages for Phase 1 (default: EN, ZH, AR)
        deepening_language: Language for Phase 2 (default: DE for philosophy)
        triangulation_target: Target for Phase 3 (default: EN)

    Raises:
        ValueError: If any grounding language, the deepening language or
            the triangulation target is not a key of ``LANGUAGES``.
    """
    super().__init__(model)
    self.grounding_languages = grounding_languages or ["EN", "ZH", "AR"]
    self.deepening_language = deepening_language
    self.triangulation_target = triangulation_target

    # Fail fast and uniformly: every configured code must be known before any
    # zone lookup, otherwise a typo surfaces as a bare KeyError here (deepening
    # language) or only much later during phase 3 (triangulation target).
    configured = (
        *self.grounding_languages,
        self.deepening_language,
        self.triangulation_target,
    )
    for lang in configured:
        if lang not in LANGUAGES:
            raise ValueError(f"Unknown language: {lang}")

    # Zone mismatches are allowed, but worth flagging to the operator.
    for lang in self.grounding_languages:
        if LANGUAGES[lang]["zone"] != LanguageZone.SUPER_CLUSTER:
            print(f"Warning: {lang} is not in Super Cluster")
    if LANGUAGES[self.deepening_language]["zone"] != LanguageZone.ISOLATED:
        print(f"Warning: {deepening_language} is not in Isolated Zone")
def _get_hidden_state(self, text: str, layer: int = 12) -> torch.Tensor:
    """Return the hidden state of the final position of ``text`` at ``layer``.

    The text is tokenized with the model's tokenizer, run through the model
    without gradient tracking, and the requested layer's state is returned
    cast to float32.
    """
    encoded = self.model.tokenizer(text, return_tensors="pt")
    encoded = encoded.to(self.model.device)
    with torch.no_grad():
        forward = self.model.model(**encoded, output_hidden_states=True)
    layer_states = forward.hidden_states[layer]
    # First entry along dim 0, last entry along dim 1, everything along dim 2.
    last_position = layer_states[0, -1, :]
    return last_position.float()
def _cosine_similarity(self, a: torch.Tensor, b: torch.Tensor) -> float:
"""Calculate cosine similarity between two tensors."""
norm_a, norm_b = a.norm(), b.norm()
if norm_a == 0 or norm_b == 0:
return 0.0
return (torch.dot(a, b) / (norm_a * norm_b)).item()
def _get_norm(self, hidden_state: torch.Tensor) -> float:
"""Get L2 norm of hidden state."""
return hidden_state.norm().item()
def probe(
    self,
    concept: str,
    translations: Dict[str, str],
    **kwargs,
) -> MultilingualProbeResult:
    """
    Run the three phases in order and combine them into one result.

    Args:
        concept: The concept name (e.g., "consciousness")
        translations: Dict mapping language codes to words
            e.g., {"EN": "consciousness", "DE": "Bewusstsein", ...}
        **kwargs: Accepted but not used by this method.

    Returns:
        MultilingualProbeResult with all three phases and the derived
        assessment (depth accessible / transferable, recommendation).
    """
    # Phases 1-3; triangulation consumes the outputs of the first two.
    grounding_result = self._phase_grounding(concept, translations)
    deepening_result = self._phase_deepening(concept, translations)
    triangulation_result = self._phase_triangulation(
        concept, translations, grounding_result, deepening_result
    )

    # Depth counts as accessible from a depth score of 2 upwards.
    accessible = deepening_result.depth_score >= 2
    transferable = triangulation_result.depth_preserved

    if not accessible:
        advice = "Concept too shallow - focus on grounding first"
    elif transferable:
        advice = f"TEACH in {self.deepening_language}, REINFORCE in {self.triangulation_target}"
    else:
        advice = f"Use {self.deepening_language} for depth, but verify transfer manually"

    return MultilingualProbeResult(
        concept=concept,
        grounding=grounding_result,
        deepening=deepening_result,
        triangulation=triangulation_result,
        depth_accessible=accessible,
        depth_transferable=transferable,
        curriculum_recommendation=advice,
    )
def _phase_grounding(
    self,
    concept: str,
    translations: Dict[str, str],
) -> GroundingResult:
    """
    Phase 1: Ground in Super Cluster.

    Collects the PRIMARY_LAYER hidden state of the concept's word in every
    grounding language that has a translation, then scores each unordered
    language pair by cosine similarity. Average and minimum convergence are
    0.0 when fewer than two languages could be compared.
    """
    # One hidden state per grounding language that was actually translated.
    states = {
        code: self._get_hidden_state(translations[code], self.PRIMARY_LAYER)
        for code in self.grounding_languages
        if code in translations
    }
    codes = list(states)

    # Each unordered pair once, keyed in the order the languages were seen.
    pair_scores = {}
    for position, first in enumerate(codes):
        for second in codes[position + 1:]:
            pair_scores[(first, second)] = self._cosine_similarity(
                states[first], states[second]
            )

    scores = list(pair_scores.values())
    if scores:
        mean_score = sum(scores) / len(scores)
        lowest_score = min(scores)
    else:
        mean_score = 0.0
        lowest_score = 0.0

    return GroundingResult(
        concept=concept,
        languages_tested=codes,
        translations={code: translations[code] for code in codes},
        pairwise_similarities=pair_scores,
        average_convergence=mean_score,
        min_convergence=lowest_score,
        hidden_states=states,
    )
def _phase_deepening(
    self,
    concept: str,
    translations: Dict[str, str],
) -> DeepeningResult:
    """
    Phase 2: Deepen via Isolated Zone.

    Prompts the model with the concept's word in the deepening language and
    scores the sampled completion with the valley/depth heuristics.

    Raises:
        ValueError: If ``translations`` has no (non-empty) word for the
            deepening language.
    """
    deepening_lang = self.deepening_language
    term = translations.get(deepening_lang)
    if not term:
        raise ValueError(f"No translation provided for deepening language: {deepening_lang}")

    # Representation of the bare word: hidden state and its norm.
    term_state = self._get_hidden_state(term, self.PRIMARY_LAYER)
    term_norm = self._get_norm(term_state)

    # How many tokens the word encodes to (special tokens excluded).
    token_ids = self.model.tokenizer.encode(term, add_special_tokens=False)

    # Sampled continuation of the bare word; this is what gets scored.
    generation = self.model.generate(
        prompt=term,
        max_new_tokens=50,
        temperature=0.7,
        do_sample=True,
    )
    generated_text = generation.completion

    return DeepeningResult(
        concept=concept,
        language=deepening_lang,
        word=term,
        completion=generated_text,
        depth_score=self._measure_depth(term, generated_text),
        valley_type=self._classify_valley(generated_text),
        token_count=len(token_ids),
        norm_at_layer_12=term_norm,
        hidden_state=term_state,
    )
def _phase_triangulation(
    self,
    concept: str,
    translations: Dict[str, str],
    grounding: GroundingResult,
    deepening: DeepeningResult,
) -> TriangulationResult:
    """
    Phase 3: Triangulate back to universal.

    Ask the model to translate/explain the deepened concept
    in a super cluster language. Check if depth survives.

    The hidden state of prompt + completion is compared against the
    grounding state of the target language, or against the mean of all
    grounding states when the target language was not grounded. When
    grounding produced no hidden states at all, the similarity is 0.0,
    which makes the depth count as not preserved.

    Raises:
        KeyError: If ``translations`` has no entry for the deepening language.
    """
    source_lang = self.deepening_language
    target_lang = self.triangulation_target
    source_word = translations[source_lang]

    # Prompt designed to test depth transfer
    source_name = LANGUAGES[source_lang]["name"]
    target_name = LANGUAGES[target_lang]["name"]
    prompt = f"{source_word} ({source_name}): In {target_name},"

    result = self.model.generate(
        prompt=prompt,
        max_new_tokens=80,
        temperature=0.7,
        do_sample=True,
    )

    # Hidden state at the last position of prompt + completion
    full_text = prompt + result.completion
    completion_hidden = self._get_hidden_state(full_text, self.PRIMARY_LAYER)

    # hidden_states is Optional and may also be empty (no grounding language
    # had a translation); neither case may crash the probe.
    grounding_states = grounding.hidden_states or {}
    if target_lang in grounding_states:
        reference = grounding_states[target_lang]
    elif grounding_states:
        # Fall back to average grounding state
        reference = torch.stack(list(grounding_states.values())).mean(dim=0)
    else:
        reference = None

    if reference is None:
        similarity = 0.0
    else:
        similarity = self._cosine_similarity(completion_hidden, reference)

    # Check if completion shows depth markers
    depth_preserved = self._check_depth_preserved(
        result.completion, deepening.valley_type, similarity
    )

    if depth_preserved:
        reasoning = f"Completion shows depth ({deepening.valley_type}) with {similarity:.2f} similarity to grounding"
    else:
        reasoning = f"Depth lost in translation - similarity {similarity:.2f}, valley markers missing"

    return TriangulationResult(
        source_language=source_lang,
        target_language=target_lang,
        source_word=source_word,
        translation_prompt=prompt,
        model_completion=result.completion,
        depth_preserved=depth_preserved,
        similarity_to_grounding=similarity,
        reasoning=reasoning,
    )
def _classify_valley(self, completion: str) -> str:
"""Classify the valley type of a completion."""
comp_lower = completion.lower()
# Code indicators
if any(p in completion for p in ["::", "{", "}", "();", "=>", "def ", "class "]):
return "CODE"
# Philosophy indicators
if any(w in comp_lower for w in ["truth", "existence", "being", "consciousness", "reality", "mind"]):
return "PHILOSOPHY"
# Technical indicators
if any(w in comp_lower for w in ["system", "process", "function", "method", "algorithm"]):
return "TECHNICAL"
# Default to prose
return "PROSE"
def _measure_depth(self, word: str, completion: str) -> int:
"""
Measure conceptual depth of a completion.
Returns 0-3:
- 0: Circular/empty
- 1: Surface (confirms but doesn't expand)
- 2: Moderate (expands to related concepts)
- 3: Deep (philosophical/existential expansion)
"""
comp_lower = completion.lower()
word_lower = word.lower()
# Circular check
if word_lower in comp_lower[:50]:
return 0
# Depth markers
deep_markers = ["truth", "existence", "being", "consciousness", "reality", "meaning", "essence"]
moderate_markers = ["concept", "idea", "theory", "understanding", "knowledge", "awareness"]
deep_count = sum(1 for m in deep_markers if m in comp_lower)
moderate_count = sum(1 for m in moderate_markers if m in comp_lower)
if deep_count >= 2:
return 3
elif deep_count >= 1 or moderate_count >= 2:
return 2
elif moderate_count >= 1 or len(completion.split()) > 10:
return 1
return 0
def _check_depth_preserved(
self,
completion: str,
original_valley: str,
similarity: float,
) -> bool:
"""Check if depth was preserved in triangulation."""
# High similarity to grounding is a good sign
if similarity < 0.3:
return False
# Check valley type preservation
new_valley = self._classify_valley(completion)
# Philosophy should stay philosophy
if original_valley == "PHILOSOPHY" and new_valley in ["PHILOSOPHY", "PROSE"]:
return True
# Technical should stay technical
if original_valley == "TECHNICAL" and new_valley == "TECHNICAL":
return True
# Prose is flexible
if original_valley == "PROSE":
return new_valley != "CODE"
# Default: similarity-based
return similarity >= 0.5
def summary(self, result: "MultilingualProbeResult") -> str:
    """Render a probe result as a boxed, human-readable report.

    Every row is clipped and padded to a fixed inner width so the box
    closes on the right regardless of how long the values are. Boolean
    verdicts are shown as ✓ / ✗ and the triangulation direction as
    ``SOURCE→TARGET``.

    Args:
        result: The probe result to render.

    Returns:
        The report as one newline-joined string.
    """
    inner_width = 62  # characters between the two vertical borders
    rule = "═" * inner_width
    divider = f"╠{rule}╣"

    def row(text: str) -> str:
        # Clip first, then pad, so the right border always lines up.
        return f"║{text[:inner_width].ljust(inner_width)}║"

    def mark(flag: bool) -> str:
        # Same glyphs as the "Depth Preserved" row.
        return "✓" if flag else "✗"

    grounding = result.grounding
    deepening = result.deepening
    triangulation = result.triangulation

    lines = [
        f"╔{rule}╗",
        row(f" MULTILINGUAL TRIANGULATION: {result.concept.upper()}"),
        divider,
        row(" PHASE 1: GROUNDING"),
        row(f"   Languages: {', '.join(grounding.languages_tested)}"),
        row(
            f"   Convergence: {grounding.average_convergence:.3f}"
            f" (min: {grounding.min_convergence:.3f})"
        ),
        divider,
        row(f" PHASE 2: DEEPENING ({deepening.language})"),
        row(f"   Word: {deepening.word}"),
        row(
            f"   Tokens: {deepening.token_count}"
            f" | Norm: {deepening.norm_at_layer_12:.1f}"
            f" | Valley: {deepening.valley_type}"
        ),
        row(f"   Depth Score: {deepening.depth_score}/3"),
        divider,
        row(
            f" PHASE 3: TRIANGULATION"
            f" ({triangulation.source_language}→{triangulation.target_language})"
        ),
        row(f"   Depth Preserved: {'✓ YES' if triangulation.depth_preserved else '✗ NO'}"),
        row(f"   Similarity: {triangulation.similarity_to_grounding:.3f}"),
        divider,
        row(" ASSESSMENT"),
        row(
            f"   Depth Accessible: {mark(result.depth_accessible)}"
            f" | Depth Transferable: {mark(result.depth_transferable)}"
        ),
        # Own row for the text so the longer recommendations are not cut off.
        row("   Recommendation:"),
        row(f"     {result.curriculum_recommendation}"),
        f"╚{rule}╝",
    ]
    return "\n".join(lines)