"""NPC Profile Extractor - Parses SkyrimNet prompts to extract NPC context.""" from __future__ import annotations import re import structlog from .models import NPCProfile logger = structlog.get_logger() class NPCExtractor: """Extracts NPC profile information from SkyrimNet prompts.""" # Regex patterns for extraction PATTERNS = { # Character bio header "bio_header": re.compile( r"## (?P[\w\s'-]+) Bio\s*\n" r"- Gender: (?P\w+)\s*\n" r"- Race: (?P[\w\s]+)", re.MULTILINE, ), # Alternative role description "role_intro": re.compile( r"You are (?P[^,\n]+),?\s*(?:a |an )?(?P[^.\n]+)", re.IGNORECASE, ), # Faction membership "faction": re.compile( r"(?:member of|belongs to|joined|part of) (?:the )?(?P[\w\s]+?)(?:\.|,|\n|$)", re.IGNORECASE, ), # Location mentions "location": re.compile( r"(?:in|at|near|from) (?PWhiterun|Windhelm|Solitude|Riften|" r"Markarth|Morthal|Dawnstar|Winterhold|Falkreath|Riverwood|Rorikstead|" r"Ivarstead|Solstheim|Raven Rock)", re.IGNORECASE, ), # Profession/occupation "occupation": re.compile( r"(?:works as|profession:|occupation:|is a|as a) (?P[\w\s]+?)(?:\.|,|\n|$)", re.IGNORECASE, ), } # Known professions for fuzzy matching KNOWN_PROFESSIONS = { "priest", "priestess", "mage", "wizard", "scholar", "blacksmith", "guard", "soldier", "warrior", "thief", "merchant", "innkeeper", "hunter", "farmer", "peasant", "noble", "jarl", "bard", "alchemist", "healer", "assassin", "spy", "courier", "carriage driver", "fisherman", "miller", "brewer", "smith", "armorer", "fletcher", "jeweler", } def extract(self, messages: list[dict]) -> NPCProfile: """Extract NPC profile from chat messages.""" # Combine all message content for analysis full_text = "\n".join( msg.get("content", "") for msg in messages if msg.get("content") ) profile = NPCProfile() # Try bio header first (most reliable) if match := self.PATTERNS["bio_header"].search(full_text): profile.name = match.group("name").strip() profile.gender = match.group("gender").strip() profile.race = match.group("race").strip() logger.debug("Extracted from bio header", name=profile.name, race=profile.race) # Fallback to role intro elif match := self.PATTERNS["role_intro"].search(full_text): profile.name = match.group("name").strip() descriptor = match.group("descriptor") # Try to parse race from descriptor profile.race = self._extract_race_from_descriptor(descriptor) logger.debug("Extracted from role intro", name=profile.name) # Extract location if match := self.PATTERNS["location"].search(full_text): profile.location = match.group("location").strip() # Extract factions for match in self.PATTERNS["faction"].finditer(full_text): faction = match.group("faction").strip() if faction and faction not in profile.factions: profile.factions.append(faction) # Extract profession if match := self.PATTERNS["occupation"].search(full_text): profession = match.group("profession").strip().lower() # Validate against known professions for known in self.KNOWN_PROFESSIONS: if known in profession: profile.profession = known break # Compute knowledge classes profile.compute_knowledge_classes() logger.info( "Extracted NPC profile", name=profile.name, race=profile.race, profession=profile.profession, factions=profile.factions, location=profile.location, knowledge_classes=profile.knowledge_classes, education_level=profile.education_level.value, ) return profile def _extract_race_from_descriptor(self, descriptor: str) -> str: """Try to extract race from a descriptor string.""" races = [ "Nord", "Dunmer", "Dark Elf", "Altmer", "High Elf", "Bosmer", "Wood Elf", "Argonian", "Khajiit", "Breton", "Redguard", "Orsimer", "Orc", "Imperial", ] descriptor_lower = descriptor.lower() for race in races: if race.lower() in descriptor_lower: # Normalize to single-word form return race.replace(" ", "") return "Unknown" def extract_conversation_context(self, messages: list[dict]) -> str: """Extract the current conversation topic for RAG query.""" # Get the last few user/assistant exchanges recent_content = [] for msg in reversed(messages[-6:]): content = msg.get("content", "") if content and msg.get("role") in ("user", "assistant"): # Skip very long content (likely system prompts) if len(content) < 500: recent_content.append(content) if not recent_content: return "" # Combine recent conversation as the query context return " ".join(reversed(recent_content[-3:]))