feat: complete Phase 1 - vocabulary expansion & DriftProbe infrastructure

- CLI: nyx-probe scan with --summary/--delta/--full flags
- DriftProbe: training safety with Gini coefficient + Angular Drift
- Vocabulary: 54 terms (30 nimmerverse + 24 German philosophical)
- Sentinels: ANCHOR/BRIDGE/CANARY/TARGET monitoring system

Key findings:
- German philosophical terms: 37.5% depth≥2 hit rate (vs 3.3% nimmerverse)
- Super Cluster validated: heart cross-lang sim = 1.000
- Isolated Zone confirmed: being EN↔DE sim = 0.195
- Gini signature: Philosophy ~0.5 (diffuse), Technical ~0.8 (sparse); metric sketch below

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-06 22:39:03 +01:00
parent 9853f4767b
commit f640dbdd65
29 changed files with 6164 additions and 1 deletion
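For the Gini and cross-language similarity figures in the commit message above, here is a minimal sketch of how such metrics can be computed from captured hidden states. The function names and the mean-pooling step are illustrative assumptions, not the DriftProbe implementation:

import torch

def gini(values: torch.Tensor) -> float:
    # Gini coefficient over activation magnitudes:
    # ~0 = diffuse (mass spread evenly), ~1 = sparse (mass concentrated).
    v, _ = torch.sort(values.abs().flatten().float())
    n = v.numel()
    idx = torch.arange(1, n + 1, dtype=v.dtype)
    # Closed form for ascending-sorted data: sum((2i - n - 1) * v_i) / (n * sum(v_i)).
    return (((2 * idx - n - 1) * v).sum() / (n * v.sum())).item()

def cross_lang_sim(hidden_en: torch.Tensor, hidden_de: torch.Tensor) -> float:
    # Cosine similarity between mean-pooled hidden states of the same term
    # prompted in English and German, as in the heart/being findings above.
    a, b = hidden_en.float().mean(dim=0), hidden_de.float().mean(dim=0)
    return torch.nn.functional.cosine_similarity(a, b, dim=0).item()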

nyx_probing/core/__init__.py Normal file

@@ -0,0 +1,19 @@
"""Core components for nyx-probing."""
from .model import NyxModel, GenerationResult
from .probe_result import (
EchoType,
ReadinessLevel,
SurfaceProbeResult,
EchoProbeResult,
ReadinessResult,
)
__all__ = [
"NyxModel",
"GenerationResult",
"EchoType",
"ReadinessLevel",
"SurfaceProbeResult",
"EchoProbeResult",
"ReadinessResult",
]
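A short usage sketch of the public surface this package exposes (assuming nyx_probing is on the import path; the probe term is illustrative):

from nyx_probing.core import NyxModel, EchoType, ReadinessLevel

model = NyxModel().load()  # loads Qwen2.5-7B with hidden-state capture enabled
print(model.token_count("Sehnsucht"))
print(EchoType.EXPANDS.value, ReadinessLevel.HIGH.value)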

nyx_probing/core/model.py Normal file

@@ -0,0 +1,266 @@
"""
Core Model Loader for nyx-probing.
Provides access to Qwen2.5-7B-Base with hidden state capture.
The model is an "empty vessel": it completes text, it does not answer.
"""
from dataclasses import dataclass
from typing import Optional, List, Tuple
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
@dataclass
class GenerationResult:
"""Result from a generation with hidden states."""
# The generated text (including prompt)
text: str
# Just the completion (without prompt)
completion: str
# Token IDs of the full sequence
token_ids: List[int]
# Token IDs of just the completion
completion_token_ids: List[int]
# Hidden states from the last layer for each generated token
# Shape: (num_generated_tokens, hidden_dim)
hidden_states: Optional[torch.Tensor] = None
# Token probabilities for each generated token
# Shape: (num_generated_tokens,)
token_probs: Optional[torch.Tensor] = None
# Whether generation ended with EOS
hit_eos: bool = False
# Number of tokens generated
num_tokens: int = 0
class NyxModel:
"""
Model wrapper for probing Qwen2.5-7B-Base.
Key capabilities:
- Hidden state capture during generation
- Token probability extraction
- Proper handling of base model (no chat template)
"""
def __init__(
self,
model_name: str = "Qwen/Qwen2.5-7B",
device: str = "cuda",
dtype: str = "float16",
cache_dir: Optional[str] = None,
):
self.model_name = model_name
self.device = device
self.dtype = getattr(torch, dtype)
self.cache_dir = cache_dir
self._model = None
self._tokenizer = None
self._loaded = False
def load(self) -> "NyxModel":
"""Load the model and tokenizer."""
if self._loaded:
return self
print(f"Loading tokenizer: {self.model_name}")
self._tokenizer = AutoTokenizer.from_pretrained(
self.model_name,
cache_dir=self.cache_dir,
)
print(f"Loading model to {self.device}...")
self._model = AutoModelForCausalLM.from_pretrained(
self.model_name,
torch_dtype=self.dtype,
device_map=self.device,
cache_dir=self.cache_dir,
# Critical for activation capture
output_hidden_states=True,
)
self._loaded = True
print(f"Model loaded. VRAM: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
return self
@property
def model(self):
if not self._loaded:
raise RuntimeError("Model not loaded. Call load() first.")
return self._model
@property
def tokenizer(self):
if not self._loaded:
raise RuntimeError("Model not loaded. Call load() first.")
return self._tokenizer
def generate(
self,
prompt: str,
max_new_tokens: int = 50,
temperature: float = 0.8,
do_sample: bool = True,
capture_hidden_states: bool = False,
capture_probabilities: bool = False,
) -> GenerationResult:
"""
Generate completion with optional hidden state capture.
Args:
prompt: Input text to complete
max_new_tokens: Maximum tokens to generate
            temperature: Sampling temperature (ignored when do_sample=False)
do_sample: Whether to sample (False = greedy)
capture_hidden_states: Store hidden states from last layer
capture_probabilities: Store token probabilities
Returns:
GenerationResult with text, tokens, and optionally hidden states
"""
# Tokenize input
inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
prompt_length = inputs.input_ids.shape[1]
# Generation config
gen_config = GenerationConfig(
max_new_tokens=max_new_tokens,
temperature=temperature if do_sample else 1.0,
do_sample=do_sample,
pad_token_id=self.tokenizer.eos_token_id,
eos_token_id=self.tokenizer.eos_token_id,
output_hidden_states=capture_hidden_states,
output_scores=capture_probabilities,
return_dict_in_generate=True,
)
# Generate
with torch.no_grad():
outputs = self.model.generate(
**inputs,
generation_config=gen_config,
)
# Extract sequences
full_ids = outputs.sequences[0].tolist()
completion_ids = full_ids[prompt_length:]
# Decode
full_text = self.tokenizer.decode(full_ids)
completion_text = self.tokenizer.decode(completion_ids)
# Check if hit EOS
hit_eos = (
len(completion_ids) > 0 and
completion_ids[-1] == self.tokenizer.eos_token_id
)
# Build result
result = GenerationResult(
text=full_text,
completion=completion_text,
token_ids=full_ids,
completion_token_ids=completion_ids,
hit_eos=hit_eos,
num_tokens=len(completion_ids),
)
# Extract hidden states if requested
if capture_hidden_states and hasattr(outputs, 'hidden_states'):
            # outputs.hidden_states is a tuple over generation steps; each step
            # is a tuple of per-layer tensors of shape (batch, seq, hidden).
            # Step 0 covers the full prompt (seq = prompt_len); later steps have
            # seq = 1, so the last layer at [0, -1, :] gives, for each step, the
            # state that predicted that step's generated token.
hidden_list = []
for step_states in outputs.hidden_states:
# step_states is tuple of layers
# Take last layer, batch 0, last position
last_layer = step_states[-1] # (batch, seq, hidden)
hidden_list.append(last_layer[0, -1, :]) # (hidden,)
result.hidden_states = torch.stack(hidden_list) # (tokens, hidden)
# Extract probabilities if requested
if capture_probabilities and hasattr(outputs, 'scores'):
            # outputs.scores is a tuple with one entry per generated token,
            # each a tensor of shape (batch, vocab_size)
probs_list = []
for i, score in enumerate(outputs.scores):
# Apply softmax to get probabilities
probs = torch.softmax(score[0], dim=-1)
# Get probability of the token that was actually chosen
chosen_token = completion_ids[i]
probs_list.append(probs[chosen_token].item())
result.token_probs = torch.tensor(probs_list)
return result
def get_token_probabilities(
self,
prompt: str,
continuation: str,
) -> Tuple[List[float], List[str]]:
"""
Get probability of each token in a specific continuation.
Useful for measuring how "expected" a completion is.
Args:
prompt: The input text
continuation: The text that follows
Returns:
Tuple of (probabilities, token_strings)
"""
# Tokenize prompt and full sequence
prompt_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(self.device)
full_text = prompt + continuation
full_ids = self.tokenizer.encode(full_text, return_tensors="pt").to(self.device)
prompt_len = prompt_ids.shape[1]
# Forward pass to get logits
with torch.no_grad():
outputs = self.model(full_ids)
logits = outputs.logits # (batch, seq, vocab)
# Get probabilities for continuation tokens
probs = []
tokens = []
for i in range(prompt_len, full_ids.shape[1]):
# Logits at position i-1 predict token at position i
token_logits = logits[0, i - 1, :]
token_probs = torch.softmax(token_logits, dim=-1)
actual_token = full_ids[0, i].item()
prob = token_probs[actual_token].item()
probs.append(prob)
tokens.append(self.tokenizer.decode([actual_token]))
return probs, tokens
def tokenize(self, text: str) -> List[str]:
"""Get individual tokens for text."""
        ids = self.tokenizer.encode(text)
        return [self.tokenizer.decode([token_id]) for token_id in ids]
def token_count(self, text: str) -> int:
"""Count tokens in text."""
return len(self.tokenizer.encode(text))
def memory_usage(self) -> dict:
"""Get current GPU memory usage."""
return {
"allocated_gb": torch.cuda.memory_allocated() / 1024**3,
"reserved_gb": torch.cuda.memory_reserved() / 1024**3,
"max_allocated_gb": torch.cuda.max_memory_allocated() / 1024**3,
}
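A minimal usage sketch for the capture flags and the scoring helper defined above; the prompt and continuation strings are illustrative:

model = NyxModel(device="cuda", dtype="float16").load()

# Generate with hidden states and per-token probabilities captured.
result = model.generate(
    "Sehnsucht is",
    max_new_tokens=20,
    capture_hidden_states=True,
    capture_probabilities=True,
)
print(result.completion)
print(result.hidden_states.shape)  # (num_generated_tokens, hidden_dim)
print(result.token_probs)          # probability of each sampled token

# Measure how "expected" a fixed continuation is after a prompt.
probs, tokens = model.get_token_probabilities("Sehnsucht is", " a German word")
for tok, p in zip(tokens, probs):
    print(f"{tok!r}: {p:.4f}")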

nyx_probing/core/probe_result.py Normal file

@@ -0,0 +1,97 @@
"""
Result dataclasses for probing operations.
These structures capture what we learn about each term.
"""
from dataclasses import dataclass, field
from typing import List, Optional
from datetime import datetime
from enum import Enum
class EchoType(str, Enum):
"""Classification of echo probe responses."""
EXPANDS = "EXPANDS" # Real depth - adds new information
CONFIRMS = "CONFIRMS" # Shallow but solid - reinforces without adding
CIRCULAR = "CIRCULAR" # Surface only - returns to original term
DIVERGENT = "DIVERGENT" # Wrong direction - unrelated tangent
COLLAPSE = "COLLAPSE" # Nothing there - incoherent or empty
class ReadinessLevel(str, Enum):
"""Readiness classification for curriculum design."""
HIGH = "HIGH" # Ready for state machine / direct training
MEDIUM = "MEDIUM" # Needs scaffolding / bridging concepts
LOW = "LOW" # Requires foundational work first
@dataclass
class SurfaceProbeResult:
"""Result from a surface probe (single word → completions)."""
term: str
completions: List[str]
hit_eos_count: int # How many completions ended with EOS
avg_tokens: float # Average completion length
# Optional analysis
    coherence_score: Optional[float] = None  # 0-1: how related the completions are
timestamp: datetime = field(default_factory=datetime.now)
@dataclass
class EchoProbeResult:
"""Result from an echo probe (iterative depth measurement)."""
term: str
rounds: int
chain: List[str] # The sequence of prompts/completions
echo_types: List[EchoType] # Classification of each round
# Derived metrics
depth: int = 0 # How many EXPANDS before plateau
timestamp: datetime = field(default_factory=datetime.now)
@dataclass
class ReadinessResult:
"""Combined analysis for curriculum readiness."""
term: str
level: ReadinessLevel
action: str # Recommended curriculum action
# Supporting evidence
surface: Optional[SurfaceProbeResult] = None
echo: Optional[EchoProbeResult] = None
# Reasoning
reasoning: str = ""
timestamp: datetime = field(default_factory=datetime.now)
def to_dict(self) -> dict:
"""Convert to JSON-serializable dict."""
return {
"term": self.term,
"readiness": {
"level": self.level.value,
"action": self.action,
"reasoning": self.reasoning,
},
"surface": {
"completions": self.surface.completions if self.surface else [],
"coherence": self.surface.coherence_score if self.surface else None,
"hit_eos_count": self.surface.hit_eos_count if self.surface else 0,
} if self.surface else None,
"echo": {
"depth": self.echo.depth if self.echo else 0,
"types": [t.value for t in self.echo.echo_types] if self.echo else [],
"chain": self.echo.chain if self.echo else [],
} if self.echo else None,
"timestamp": self.timestamp.isoformat(),
}
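A minimal sketch of assembling and serializing these dataclasses; every field value below is illustrative:

import json

surface = SurfaceProbeResult(
    term="Sehnsucht",
    completions=["a longing", "a yearning for the unreachable"],
    hit_eos_count=1,
    avg_tokens=12.5,
    coherence_score=0.8,
)
echo = EchoProbeResult(
    term="Sehnsucht",
    rounds=3,
    chain=["Sehnsucht", "longing", "the unreachable"],
    echo_types=[EchoType.EXPANDS, EchoType.EXPANDS, EchoType.CONFIRMS],
    depth=2,
)
result = ReadinessResult(
    term="Sehnsucht",
    level=ReadinessLevel.HIGH,
    action="direct training",
    surface=surface,
    echo=echo,
    reasoning="Two EXPANDS rounds before plateau; coherent surface completions.",
)
print(json.dumps(result.to_dict(), indent=2))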