feat: complete Phase 1 - vocabulary expansion & DriftProbe infrastructure
- CLI: nyx-probe scan with --summary/--delta/--full flags
- DriftProbe: training safety with Gini coefficient + Angular Drift
- Vocabulary: 54 terms (30 nimmerverse + 24 German philosophical)
- Sentinels: ANCHOR/BRIDGE/CANARY/TARGET monitoring system

Key findings:
- German philosophical terms: 37.5% depth≥2 hit rate (vs 3.3% nimmerverse)
- Super Cluster validated: heart cross-lang sim = 1.000
- Isolated Zone confirmed: being EN↔DE sim = 0.195
- Gini signature: Philosophy ~0.5 (diffuse), Technical ~0.8 (sparse)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
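The DriftProbe metrics named above are not defined in the files below; as a rough sketch of what a Gini sparsity signature and an angular-drift check over hidden-state vectors could look like (the choice of input tensor is an assumption, not something this commit specifies):

# Hedged sketch only: the quantities DriftProbe actually measures are not in this diff.
import torch

def gini(values: torch.Tensor) -> float:
    """Gini coefficient of a non-negative 1-D tensor (~0 = diffuse, ->1 = sparse)."""
    v, _ = torch.sort(values.abs().flatten().double())
    n = v.numel()
    idx = torch.arange(1, n + 1, dtype=torch.float64)
    return (((2 * idx - n - 1) * v).sum() / (n * v.sum() + 1e-12)).item()

def angular_drift(baseline: torch.Tensor, current: torch.Tensor) -> float:
    """Angle in degrees between two hidden-state vectors for the same probe prompt."""
    cos = torch.nn.functional.cosine_similarity(
        baseline.flatten().double(), current.flatten().double(), dim=0
    ).clamp(-1.0, 1.0)
    return torch.rad2deg(torch.acos(cos)).item()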
19 nyx_probing/core/__init__.py Normal file
@@ -0,0 +1,19 @@
"""Core components for nyx-probing."""
from .model import NyxModel, GenerationResult
from .probe_result import (
    EchoType,
    ReadinessLevel,
    SurfaceProbeResult,
    EchoProbeResult,
    ReadinessResult,
)

__all__ = [
    "NyxModel",
    "GenerationResult",
    "EchoType",
    "ReadinessLevel",
    "SurfaceProbeResult",
    "EchoProbeResult",
    "ReadinessResult",
]
266 nyx_probing/core/model.py Normal file
@@ -0,0 +1,266 @@
"""
Core Model Loader for nyx-probing.

Provides access to Qwen2.5-7B-Base with hidden state capture.
The model is an "empty vessel" - it completes, not answers.
"""
from dataclasses import dataclass, field
from typing import Optional, List, Tuple
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig


@dataclass
class GenerationResult:
    """Result from a generation with hidden states."""

    # The generated text (including prompt)
    text: str

    # Just the completion (without prompt)
    completion: str

    # Token IDs of the full sequence
    token_ids: List[int]

    # Token IDs of just the completion
    completion_token_ids: List[int]

    # Hidden states from the last layer for each generated token
    # Shape: (num_generated_tokens, hidden_dim)
    hidden_states: Optional[torch.Tensor] = None

    # Token probabilities for each generated token
    # Shape: (num_generated_tokens,)
    token_probs: Optional[torch.Tensor] = None

    # Whether generation ended with EOS
    hit_eos: bool = False

    # Number of tokens generated
    num_tokens: int = 0


class NyxModel:
    """
    Model wrapper for probing Qwen2.5-7B-Base.

    Key capabilities:
    - Hidden state capture during generation
    - Token probability extraction
    - Proper handling of base model (no chat template)
    """

    def __init__(
        self,
        model_name: str = "Qwen/Qwen2.5-7B",
        device: str = "cuda",
        dtype: str = "float16",
        cache_dir: Optional[str] = None,
    ):
        self.model_name = model_name
        self.device = device
        self.dtype = getattr(torch, dtype)
        self.cache_dir = cache_dir

        self._model = None
        self._tokenizer = None
        self._loaded = False

    def load(self) -> "NyxModel":
        """Load the model and tokenizer."""
        if self._loaded:
            return self

        print(f"Loading tokenizer: {self.model_name}")
        self._tokenizer = AutoTokenizer.from_pretrained(
            self.model_name,
            cache_dir=self.cache_dir,
        )

        print(f"Loading model to {self.device}...")
        self._model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            torch_dtype=self.dtype,
            device_map=self.device,
            cache_dir=self.cache_dir,
            # Critical for activation capture
            output_hidden_states=True,
        )

        self._loaded = True
        print(f"Model loaded. VRAM: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
        return self

    @property
    def model(self):
        if not self._loaded:
            raise RuntimeError("Model not loaded. Call load() first.")
        return self._model

    @property
    def tokenizer(self):
        if not self._loaded:
            raise RuntimeError("Model not loaded. Call load() first.")
        return self._tokenizer

    def generate(
        self,
        prompt: str,
        max_new_tokens: int = 50,
        temperature: float = 0.8,
        do_sample: bool = True,
        capture_hidden_states: bool = False,
        capture_probabilities: bool = False,
    ) -> GenerationResult:
        """
        Generate completion with optional hidden state capture.

        Args:
            prompt: Input text to complete
            max_new_tokens: Maximum tokens to generate
            temperature: Sampling temperature (0 = greedy)
            do_sample: Whether to sample (False = greedy)
            capture_hidden_states: Store hidden states from last layer
            capture_probabilities: Store token probabilities

        Returns:
            GenerationResult with text, tokens, and optionally hidden states
        """
        # Tokenize input
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
        prompt_length = inputs.input_ids.shape[1]

        # Generation config
        gen_config = GenerationConfig(
            max_new_tokens=max_new_tokens,
            temperature=temperature if do_sample else 1.0,
            do_sample=do_sample,
            pad_token_id=self.tokenizer.eos_token_id,
            eos_token_id=self.tokenizer.eos_token_id,
            output_hidden_states=capture_hidden_states,
            output_scores=capture_probabilities,
            return_dict_in_generate=True,
        )

        # Generate
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                generation_config=gen_config,
            )

        # Extract sequences
        full_ids = outputs.sequences[0].tolist()
        completion_ids = full_ids[prompt_length:]

        # Decode
        full_text = self.tokenizer.decode(full_ids)
        completion_text = self.tokenizer.decode(completion_ids)

        # Check if hit EOS
        hit_eos = (
            len(completion_ids) > 0 and
            completion_ids[-1] == self.tokenizer.eos_token_id
        )

        # Build result
        result = GenerationResult(
            text=full_text,
            completion=completion_text,
            token_ids=full_ids,
            completion_token_ids=completion_ids,
            hit_eos=hit_eos,
            num_tokens=len(completion_ids),
        )

        # Extract hidden states if requested
        if capture_hidden_states and hasattr(outputs, 'hidden_states'):
            # hidden_states is tuple of (step, layer, batch, seq, hidden)
            # We want last layer hidden state for each generated token
            hidden_list = []
            for step_states in outputs.hidden_states:
                # step_states is tuple of layers
                # Take last layer, batch 0, last position
                last_layer = step_states[-1]  # (batch, seq, hidden)
                hidden_list.append(last_layer[0, -1, :])  # (hidden,)

            result.hidden_states = torch.stack(hidden_list)  # (tokens, hidden)

        # Extract probabilities if requested
        if capture_probabilities and hasattr(outputs, 'scores'):
            # scores is tuple of (num_tokens,) each (batch, vocab)
            probs_list = []
            for i, score in enumerate(outputs.scores):
                # Apply softmax to get probabilities
                probs = torch.softmax(score[0], dim=-1)
                # Get probability of the token that was actually chosen
                chosen_token = completion_ids[i]
                probs_list.append(probs[chosen_token].item())

            result.token_probs = torch.tensor(probs_list)

        return result

    def get_token_probabilities(
        self,
        prompt: str,
        continuation: str,
    ) -> Tuple[List[float], List[str]]:
        """
        Get probability of each token in a specific continuation.

        Useful for measuring how "expected" a completion is.

        Args:
            prompt: The input text
            continuation: The text that follows

        Returns:
            Tuple of (probabilities, token_strings)
        """
        # Tokenize prompt and full sequence
        prompt_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(self.device)
        full_text = prompt + continuation
        full_ids = self.tokenizer.encode(full_text, return_tensors="pt").to(self.device)

        prompt_len = prompt_ids.shape[1]

        # Forward pass to get logits
        with torch.no_grad():
            outputs = self.model(full_ids)
            logits = outputs.logits  # (batch, seq, vocab)

        # Get probabilities for continuation tokens
        probs = []
        tokens = []

        for i in range(prompt_len, full_ids.shape[1]):
            # Logits at position i-1 predict token at position i
            token_logits = logits[0, i - 1, :]
            token_probs = torch.softmax(token_logits, dim=-1)

            actual_token = full_ids[0, i].item()
            prob = token_probs[actual_token].item()

            probs.append(prob)
            tokens.append(self.tokenizer.decode([actual_token]))

        return probs, tokens

    def tokenize(self, text: str) -> List[str]:
        """Get individual tokens for text."""
        ids = self.tokenizer.encode(text)
        return [self.tokenizer.decode([tok_id]) for tok_id in ids]

    def token_count(self, text: str) -> int:
        """Count tokens in text."""
        return len(self.tokenizer.encode(text))

    def memory_usage(self) -> dict:
        """Get current GPU memory usage."""
        return {
            "allocated_gb": torch.cuda.memory_allocated() / 1024**3,
            "reserved_gb": torch.cuda.memory_reserved() / 1024**3,
            "max_allocated_gb": torch.cuda.max_memory_allocated() / 1024**3,
        }
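For orientation, a minimal usage sketch of the NyxModel API defined above; the prompt strings and generation settings are illustrative placeholders, not values taken from this commit.

# Illustrative usage of nyx_probing.core.model (prompts are placeholders)
from nyx_probing.core import NyxModel

model = NyxModel(device="cuda", dtype="float16").load()

# Surface-style completion with hidden states and per-token probabilities
result = model.generate(
    "The meaning of being is",
    max_new_tokens=30,
    capture_hidden_states=True,
    capture_probabilities=True,
)
print(result.completion, result.num_tokens, result.hit_eos)
if result.hidden_states is not None:
    print(result.hidden_states.shape)  # (num_generated_tokens, hidden_dim)

# How "expected" is a specific continuation?
probs, tokens = model.get_token_probabilities("The heart of the system is ", "memory")
print(list(zip(tokens, probs)))

print(model.memory_usage())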
97 nyx_probing/core/probe_result.py Normal file
@@ -0,0 +1,97 @@
"""
Result dataclasses for probing operations.

These structures capture what we learn about each term.
"""
from dataclasses import dataclass, field
from typing import List, Optional, Literal
from datetime import datetime
from enum import Enum


class EchoType(str, Enum):
    """Classification of echo probe responses."""

    EXPANDS = "EXPANDS"      # Real depth - adds new information
    CONFIRMS = "CONFIRMS"    # Shallow but solid - reinforces without adding
    CIRCULAR = "CIRCULAR"    # Surface only - returns to original term
    DIVERGENT = "DIVERGENT"  # Wrong direction - unrelated tangent
    COLLAPSE = "COLLAPSE"    # Nothing there - incoherent or empty


class ReadinessLevel(str, Enum):
    """Readiness classification for curriculum design."""

    HIGH = "HIGH"      # Ready for state machine / direct training
    MEDIUM = "MEDIUM"  # Needs scaffolding / bridging concepts
    LOW = "LOW"        # Requires foundational work first


@dataclass
class SurfaceProbeResult:
    """Result from a surface probe (single word → completions)."""

    term: str
    completions: List[str]
    hit_eos_count: int  # How many completions ended with EOS
    avg_tokens: float   # Average completion length

    # Optional analysis
    coherence_score: Optional[float] = None  # 0-1, how related are completions

    timestamp: datetime = field(default_factory=datetime.now)


@dataclass
class EchoProbeResult:
    """Result from an echo probe (iterative depth measurement)."""

    term: str
    rounds: int
    chain: List[str]  # The sequence of prompts/completions
    echo_types: List[EchoType]  # Classification of each round

    # Derived metrics
    depth: int = 0  # How many EXPANDS before plateau

    timestamp: datetime = field(default_factory=datetime.now)


@dataclass
class ReadinessResult:
    """Combined analysis for curriculum readiness."""

    term: str
    level: ReadinessLevel
    action: str  # Recommended curriculum action

    # Supporting evidence
    surface: Optional[SurfaceProbeResult] = None
    echo: Optional[EchoProbeResult] = None

    # Reasoning
    reasoning: str = ""

    timestamp: datetime = field(default_factory=datetime.now)

    def to_dict(self) -> dict:
        """Convert to JSON-serializable dict."""
        return {
            "term": self.term,
            "readiness": {
                "level": self.level.value,
                "action": self.action,
                "reasoning": self.reasoning,
            },
            "surface": {
                "completions": self.surface.completions if self.surface else [],
                "coherence": self.surface.coherence_score if self.surface else None,
                "hit_eos_count": self.surface.hit_eos_count if self.surface else 0,
            } if self.surface else None,
            "echo": {
                "depth": self.echo.depth if self.echo else 0,
                "types": [t.value for t in self.echo.echo_types] if self.echo else [],
                "chain": self.echo.chain if self.echo else [],
            } if self.echo else None,
            "timestamp": self.timestamp.isoformat(),
        }
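And a quick sketch of how these result objects compose into a JSON-ready record; the field values below are invented for illustration only.

# Illustrative composition of the probe_result dataclasses (values are made up)
import json
from nyx_probing.core import (
    EchoProbeResult, EchoType, ReadinessLevel, ReadinessResult, SurfaceProbeResult,
)

surface = SurfaceProbeResult(
    term="heart",
    completions=["of the matter", "beats", "of the city"],
    hit_eos_count=1,
    avg_tokens=6.3,
    coherence_score=0.72,
)
echo = EchoProbeResult(
    term="heart",
    rounds=3,
    chain=["heart", "core of feeling", "seat of longing"],
    echo_types=[EchoType.EXPANDS, EchoType.EXPANDS, EchoType.CONFIRMS],
    depth=2,
)
readiness = ReadinessResult(
    term="heart",
    level=ReadinessLevel.HIGH,
    action="include in direct training set",
    surface=surface,
    echo=echo,
    reasoning="Two EXPANDS rounds before plateau; completions cohere.",
)
print(json.dumps(readiness.to_dict(), indent=2))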