feat: complete Phase 1 - vocabulary expansion & DriftProbe infrastructure

- CLI: nyx-probe scan with --summary/--delta/--full flags
- DriftProbe: training safety with Gini coefficient + Angular Drift
- Vocabulary: 54 terms (30 nimmerverse + 24 German philosophical)
- Sentinels: ANCHOR/BRIDGE/CANARY/TARGET monitoring system

Key findings:
- German philosophical terms: 37.5% depth≥2 hit rate (vs 3.3% nimmerverse)
- Super Cluster validated: heart cross-lang sim = 1.000
- Isolated Zone confirmed: being EN↔DE sim = 0.195
- Gini signature: Philosophy ~0.5 (diffuse), Technical ~0.8 (sparse)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-06 22:39:03 +01:00
parent 9853f4767b
commit f640dbdd65
29 changed files with 6164 additions and 1 deletion
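The Gini and drift figures in the findings above follow standard definitions; a minimal sketch of how such metrics can be computed over hidden states (assumed forms, not necessarily the DriftProbe implementation):

import torch

def gini(values: torch.Tensor) -> float:
    """Gini coefficient of activation magnitudes: ~0 = diffuse, ~1 = sparse."""
    v = values.abs().flatten().float().sort().values
    n = v.numel()
    index = torch.arange(1, n + 1, dtype=v.dtype)
    return (((2 * index - n - 1) * v).sum() / (n * v.sum() + 1e-12)).item()

def angular_drift(a: torch.Tensor, b: torch.Tensor) -> float:
    """Angle in radians between two hidden-state vectors (0 = same direction)."""
    cos = torch.nn.functional.cosine_similarity(a.float(), b.float(), dim=0)
    return torch.acos(cos.clamp(-1.0, 1.0)).item()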

nyx_probing/core/model.py (new file, 266 lines)

@@ -0,0 +1,266 @@
"""
Core Model Loader for nyx-probing.
Provides access to Qwen2.5-7B-Base with hidden state capture.
The model is an "empty vessel" - it completes, not answers.
"""
from dataclasses import dataclass, field
from typing import Optional, List, Tuple
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
@dataclass
class GenerationResult:
"""Result from a generation with hidden states."""
# The generated text (including prompt)
text: str
# Just the completion (without prompt)
completion: str
# Token IDs of the full sequence
token_ids: List[int]
# Token IDs of just the completion
completion_token_ids: List[int]
# Hidden states from the last layer for each generated token
# Shape: (num_generated_tokens, hidden_dim)
hidden_states: Optional[torch.Tensor] = None
# Token probabilities for each generated token
# Shape: (num_generated_tokens,)
token_probs: Optional[torch.Tensor] = None
# Whether generation ended with EOS
hit_eos: bool = False
# Number of tokens generated
num_tokens: int = 0
class NyxModel:
"""
Model wrapper for probing Qwen2.5-7B-Base.
Key capabilities:
- Hidden state capture during generation
- Token probability extraction
- Proper handling of base model (no chat template)
"""
def __init__(
self,
model_name: str = "Qwen/Qwen2.5-7B",
device: str = "cuda",
dtype: str = "float16",
cache_dir: Optional[str] = None,
):
self.model_name = model_name
self.device = device
self.dtype = getattr(torch, dtype)
self.cache_dir = cache_dir
self._model = None
self._tokenizer = None
self._loaded = False
def load(self) -> "NyxModel":
"""Load the model and tokenizer."""
if self._loaded:
return self
print(f"Loading tokenizer: {self.model_name}")
self._tokenizer = AutoTokenizer.from_pretrained(
self.model_name,
cache_dir=self.cache_dir,
)
print(f"Loading model to {self.device}...")
self._model = AutoModelForCausalLM.from_pretrained(
self.model_name,
torch_dtype=self.dtype,
device_map=self.device,
cache_dir=self.cache_dir,
# Critical for activation capture
output_hidden_states=True,
)
self._loaded = True
print(f"Model loaded. VRAM: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
return self
@property
def model(self):
if not self._loaded:
raise RuntimeError("Model not loaded. Call load() first.")
return self._model
@property
def tokenizer(self):
if not self._loaded:
raise RuntimeError("Model not loaded. Call load() first.")
return self._tokenizer
def generate(
self,
prompt: str,
max_new_tokens: int = 50,
temperature: float = 0.8,
do_sample: bool = True,
capture_hidden_states: bool = False,
capture_probabilities: bool = False,
) -> GenerationResult:
"""
Generate completion with optional hidden state capture.
Args:
prompt: Input text to complete
max_new_tokens: Maximum tokens to generate
            temperature: Sampling temperature (used only when do_sample=True)
do_sample: Whether to sample (False = greedy)
capture_hidden_states: Store hidden states from last layer
capture_probabilities: Store token probabilities
Returns:
GenerationResult with text, tokens, and optionally hidden states
"""
# Tokenize input
inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
prompt_length = inputs.input_ids.shape[1]
# Generation config
gen_config = GenerationConfig(
max_new_tokens=max_new_tokens,
temperature=temperature if do_sample else 1.0,
do_sample=do_sample,
pad_token_id=self.tokenizer.eos_token_id,
eos_token_id=self.tokenizer.eos_token_id,
output_hidden_states=capture_hidden_states,
output_scores=capture_probabilities,
return_dict_in_generate=True,
)
# Generate
with torch.no_grad():
outputs = self.model.generate(
**inputs,
generation_config=gen_config,
)
# Extract sequences
full_ids = outputs.sequences[0].tolist()
completion_ids = full_ids[prompt_length:]
# Decode
full_text = self.tokenizer.decode(full_ids)
completion_text = self.tokenizer.decode(completion_ids)
# Check if hit EOS
hit_eos = (
len(completion_ids) > 0 and
completion_ids[-1] == self.tokenizer.eos_token_id
)
# Build result
result = GenerationResult(
text=full_text,
completion=completion_text,
token_ids=full_ids,
completion_token_ids=completion_ids,
hit_eos=hit_eos,
num_tokens=len(completion_ids),
)
# Extract hidden states if requested
if capture_hidden_states and hasattr(outputs, 'hidden_states'):
            # outputs.hidden_states: one tuple per generation step, each a tuple of per-layer tensors (batch, seq, hidden)
# We want last layer hidden state for each generated token
hidden_list = []
for step_states in outputs.hidden_states:
# step_states is tuple of layers
# Take last layer, batch 0, last position
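                # (at step 0 these states span the whole prompt, so the last position is
                # the state that produced the first generated token; later steps have seq length 1)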
last_layer = step_states[-1] # (batch, seq, hidden)
hidden_list.append(last_layer[0, -1, :]) # (hidden,)
result.hidden_states = torch.stack(hidden_list) # (tokens, hidden)
# Extract probabilities if requested
if capture_probabilities and hasattr(outputs, 'scores'):
            # outputs.scores: one (batch, vocab_size) logits tensor per generated token
probs_list = []
for i, score in enumerate(outputs.scores):
# Apply softmax to get probabilities
probs = torch.softmax(score[0], dim=-1)
# Get probability of the token that was actually chosen
chosen_token = completion_ids[i]
probs_list.append(probs[chosen_token].item())
result.token_probs = torch.tensor(probs_list)
return result
def get_token_probabilities(
self,
prompt: str,
continuation: str,
) -> Tuple[List[float], List[str]]:
"""
Get probability of each token in a specific continuation.
Useful for measuring how "expected" a completion is.
Args:
prompt: The input text
continuation: The text that follows
Returns:
Tuple of (probabilities, token_strings)
"""
# Tokenize prompt and full sequence
prompt_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(self.device)
full_text = prompt + continuation
full_ids = self.tokenizer.encode(full_text, return_tensors="pt").to(self.device)
prompt_len = prompt_ids.shape[1]
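        # Assumes the prompt's token ids form a prefix of the full sequence's ids;
        # a BPE merge across the prompt/continuation boundary could break this alignment.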
# Forward pass to get logits
with torch.no_grad():
outputs = self.model(full_ids)
logits = outputs.logits # (batch, seq, vocab)
# Get probabilities for continuation tokens
probs = []
tokens = []
for i in range(prompt_len, full_ids.shape[1]):
# Logits at position i-1 predict token at position i
token_logits = logits[0, i - 1, :]
token_probs = torch.softmax(token_logits, dim=-1)
actual_token = full_ids[0, i].item()
prob = token_probs[actual_token].item()
probs.append(prob)
tokens.append(self.tokenizer.decode([actual_token]))
return probs, tokens
def tokenize(self, text: str) -> List[str]:
"""Get individual tokens for text."""
        ids = self.tokenizer.encode(text)
        return [self.tokenizer.decode([token_id]) for token_id in ids]
def token_count(self, text: str) -> int:
"""Count tokens in text."""
return len(self.tokenizer.encode(text))
def memory_usage(self) -> dict:
"""Get current GPU memory usage."""
return {
"allocated_gb": torch.cuda.memory_allocated() / 1024**3,
"reserved_gb": torch.cuda.memory_reserved() / 1024**3,
"max_allocated_gb": torch.cuda.max_memory_allocated() / 1024**3,
}
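
For reference, a minimal usage sketch of the NyxModel API defined above (illustrative only, not part of the committed file):

if __name__ == "__main__":
    nyx = NyxModel().load()
    result = nyx.generate(
        "The heart is",
        max_new_tokens=30,
        capture_hidden_states=True,
        capture_probabilities=True,
    )
    print(result.completion)
    if result.hidden_states is not None:
        print(result.hidden_states.shape)  # (num_generated_tokens, hidden_dim)
    probs, tokens = nyx.get_token_probabilities("The heart is", " a muscle.")
    print(list(zip(tokens, probs)))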