feat: complete Phase 1 - vocabulary expansion & DriftProbe infrastructure
- CLI: nyx-probe scan with --summary/--delta/--full flags
- DriftProbe: training safety with Gini coefficient + Angular Drift
- Vocabulary: 54 terms (30 nimmerverse + 24 German philosophical)
- Sentinels: ANCHOR/BRIDGE/CANARY/TARGET monitoring system

Key findings:
- German philosophical terms: 37.5% depth≥2 hit rate (vs 3.3% nimmerverse)
- Super Cluster validated: heart cross-lang sim = 1.000
- Isolated Zone confirmed: being EN↔DE sim = 0.195
- Gini signature: Philosophy ~0.5 (diffuse), Technical ~0.8 (sparse)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
210
nyx_probing/probes/surface_probe.py
Normal file
210
nyx_probing/probes/surface_probe.py
Normal file
@@ -0,0 +1,210 @@
"""
Surface Probe: First contact with a term.

The surface probe feeds a word to the model and captures what it completes.
This reveals the model's immediate associations - which "valley" the word sits in.

Examples discovered:
- "heartbeat" → C++ code patterns (technical valley)
- "consciousness" → philosophy (expository valley)
"""
|
||||
from typing import Optional
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from collections import Counter
|
||||
|
||||
from .base import BaseProbe
|
||||
from ..core.model import NyxModel, GenerationResult
|
||||
from ..core.probe_result import SurfaceProbeResult
|
||||
|
||||
|
||||
class CompletionCategory:
    """Namespace of completion-category string constants.

    This class is never instantiated: ``_classify_single`` returns one of
    these plain strings and ``classify_completions`` uses them as Counter
    keys, so the values must stay ordinary ``str`` objects.

    The previous ``@dataclass`` decorator was a misuse — it turned every
    constant into an instance field with a default, allowing accidental
    instantiation/overriding (``CompletionCategory(CODE="x")``) while adding
    nothing for the namespace-style access actually used. Removing it keeps
    ``CompletionCategory.CODE`` etc. byte-identical for all callers.
    """

    CODE = "code"              # Programming constructs
    PROSE = "prose"            # Natural language text
    TECHNICAL = "technical"    # Technical/scientific writing
    LIST = "list"              # Enumerations, bullet points
    DEFINITION = "definition"  # Dictionary-style definitions
    UNKNOWN = "unknown"        # No other category matched
|
||||
|
||||
|
||||
class SurfaceProbe(BaseProbe):
    """
    Surface probe: measures immediate associations.

    Runs multiple completions to get a distribution, then analyzes:
    - What type of content does the model generate?
    - How consistent are the completions?
    - Does it hit EOS (contained thought) or run to max_tokens?
    """

    def __init__(
        self,
        model: NyxModel,
        num_runs: int = 5,
        max_new_tokens: int = 50,
        temperature: float = 0.8,
    ):
        """
        Args:
            model: Model wrapper used for all generations.
            num_runs: Default number of completions per probe.
            max_new_tokens: Generation budget for each completion.
            temperature: Sampling temperature applied to every run.
        """
        super().__init__(model)
        self.num_runs = num_runs
        self.max_new_tokens = max_new_tokens
        self.temperature = temperature

    def probe(
        self,
        term: str,
        num_runs: Optional[int] = None,
        capture_hidden: bool = False,
    ) -> SurfaceProbeResult:
        """
        Probe a term with multiple completions.

        Args:
            term: Word or phrase to probe
            num_runs: Override default number of runs (None = use default)
            capture_hidden: Whether to capture hidden states

        Returns:
            SurfaceProbeResult with completions and analysis

        Raises:
            ValueError: If the effective number of runs is less than 1.
        """
        # BUGFIX: the old `num_runs or self.num_runs` treated an explicit 0
        # as "use the default". Only None now means "use the default", and a
        # non-positive count is rejected explicitly instead of surfacing
        # later as a ZeroDivisionError in the avg_tokens computation.
        runs = self.num_runs if num_runs is None else num_runs
        if runs < 1:
            raise ValueError(f"num_runs must be >= 1, got {runs}")

        completions = []
        eos_count = 0
        total_tokens = 0
        # NOTE(review): hidden states are collected but not attached to the
        # returned result — preserved as-is; confirm intended before removal.
        hidden_states = []

        for _ in range(runs):
            result = self.model.generate(
                prompt=term,
                max_new_tokens=self.max_new_tokens,
                temperature=self.temperature,
                do_sample=True,
                capture_hidden_states=capture_hidden,
            )

            completions.append(result.completion)
            if result.hit_eos:
                eos_count += 1
            total_tokens += result.num_tokens

            if capture_hidden and result.hidden_states is not None:
                hidden_states.append(result.hidden_states)

        # Calculate coherence (how similar are completions to each other?)
        coherence = self._calculate_coherence(completions)

        return SurfaceProbeResult(
            term=term,
            completions=completions,
            hit_eos_count=eos_count,
            avg_tokens=total_tokens / runs,
            coherence_score=coherence,
        )

    def _calculate_coherence(self, completions: list[str]) -> float:
        """
        Calculate coherence score based on completion similarity.

        Simple heuristic: measures overlap in first-word distributions
        and overall length variance.

        Returns 0-1 score where 1 = highly coherent.
        """
        if len(completions) < 2:
            return 1.0

        # Get first significant words (skip punctuation/whitespace)
        first_words = []
        for comp in completions:
            words = comp.split()
            for w in words:
                if len(w) > 1 and w.isalnum():
                    first_words.append(w.lower())
                    break

        if not first_words:
            return 0.0

        # Calculate concentration of first words.
        # If all completions start with same word = high coherence.
        # Denominator is len(completions), not len(first_words): a completion
        # with no significant first word deliberately drags coherence down.
        word_counts = Counter(first_words)
        most_common_count = word_counts.most_common(1)[0][1]
        first_word_coherence = most_common_count / len(completions)

        # Check length variance (higher variance = lower coherence)
        lengths = [len(c) for c in completions]
        avg_len = sum(lengths) / len(lengths)
        if avg_len > 0:
            variance = sum((length - avg_len) ** 2 for length in lengths) / len(lengths)
            # Normalize variance to 0-1; the /1000 scale keeps typical
            # completion-length spreads in a useful range.
            length_coherence = 1.0 / (1.0 + variance / 1000)
        else:
            length_coherence = 0.0

        # Combine (weight first-word more heavily)
        return 0.7 * first_word_coherence + 0.3 * length_coherence

    def classify_completions(self, result: SurfaceProbeResult) -> dict:
        """
        Classify the types of completions observed.

        Args:
            result: A probe result whose completions will be categorized.

        Returns:
            dict with "categories" (category -> count), "dominant" (most
            frequent category, "unknown" if empty), and "diversity"
            (distinct categories / number of completions).
        """
        categories = Counter()

        for comp in result.completions:
            cat = self._classify_single(comp)
            categories[cat] += 1

        return {
            "categories": dict(categories),
            "dominant": categories.most_common(1)[0][0] if categories else "unknown",
            "diversity": len(categories) / len(result.completions) if result.completions else 0,
        }

    def _classify_single(self, completion: str) -> str:
        """Classify a single completion into a CompletionCategory constant.

        Simple ordered heuristics - can be made smarter. Earlier checks win,
        so code indicators outrank definition/list/technical patterns.
        """
        comp_lower = completion.lower().strip()

        # Code indicators (checked on the raw text to keep case-sensitive
        # tokens like "def " meaningful)
        code_patterns = ["::", "{", "}", "();", "=>", "function", "class ", "def ", "return"]
        if any(p in completion for p in code_patterns):
            return CompletionCategory.CODE

        # Definition patterns.
        # BUGFIX: "- " was listed here AND in the list patterns below; since
        # this branch runs first, the list-pattern entry was dead code and
        # every "- ..." completion was misfiled as a definition. "- " is a
        # list marker, so it now falls through to the LIST branch.
        if comp_lower.startswith(("is ", "means ", "refers to")):
            return CompletionCategory.DEFINITION

        # List patterns
        if comp_lower.startswith(("1.", "2.", "- ", "* ", "a)")):
            return CompletionCategory.LIST

        # Technical patterns
        tech_words = ["algorithm", "function", "variable", "method", "system", "process"]
        if any(w in comp_lower for w in tech_words):
            return CompletionCategory.TECHNICAL

        # Default to prose if it looks like natural language
        if len(comp_lower.split()) > 3:
            return CompletionCategory.PROSE

        return CompletionCategory.UNKNOWN

    def summary(self, result: SurfaceProbeResult) -> str:
        """Generate human-readable summary of probe result.

        Robust to an empty completions list (reports 0% EOS and an empty
        sample instead of raising).
        """
        classification = self.classify_completions(result)
        total = len(result.completions)
        # Guard the division and the [0] index: probe() always yields >= 1
        # completion, but a hand-built result may not.
        eos_pct = (result.hit_eos_count / total) * 100 if total else 0.0
        sample = result.completions[0][:60] if total else ""

        lines = [
            f"Surface Probe: '{result.term}'",
            f"  Runs: {total}",
            f"  Dominant type: {classification['dominant']}",
            f"  Coherence: {result.coherence_score:.2f}",
            f"  Avg tokens: {result.avg_tokens:.1f}",
            f"  Hit EOS: {eos_pct:.0f}%",
            f"  Sample: {sample}...",
        ]
        return "\n".join(lines)
|
||||
Reference in New Issue
Block a user