# nimmersky/oghma-proxy/src/oghma_proxy/extractor.py

"""NPC Profile Extractor - Parses SkyrimNet prompts to extract NPC context."""

from __future__ import annotations

import re

import structlog

from .models import NPCProfile

logger = structlog.get_logger()


class NPCExtractor:
    """Extracts NPC profile information from SkyrimNet prompts.

    Extraction is heuristic: a set of regular expressions is run over the
    concatenated text of all chat messages, and whatever matches is copied
    onto a fresh ``NPCProfile``.
    """

    # Regex patterns for extraction
    PATTERNS = {
        # Character bio header. The name and race groups only accept
        # spaces/tabs (not newlines) so they cannot spill onto other lines.
        "bio_header": re.compile(
            r"## (?P<name>[\w \t'-]+) Bio\s*\n"
            r"- Gender: (?P<gender>\w+)\s*\n"
            r"- Race: (?P<race>[\w \t]+)",
            re.MULTILINE,
        ),
        # Alternative role description
        "role_intro": re.compile(
            r"You are (?P<name>[^,\n]+),?\s*(?:a |an )?(?P<descriptor>[^.\n]+)",
            re.IGNORECASE,
        ),
        # Faction membership
        "faction": re.compile(
            r"(?:member of|belongs to|joined|part of) (?:the )?(?P<faction>[\w\s]+?)(?:\.|,|\n|$)",
            re.IGNORECASE,
        ),
        # Location mentions
        "location": re.compile(
            r"(?:in|at|near|from) (?P<location>Whiterun|Windhelm|Solitude|Riften|"
            r"Markarth|Morthal|Dawnstar|Winterhold|Falkreath|Riverwood|Rorikstead|"
            r"Ivarstead|Solstheim|Raven Rock)",
            re.IGNORECASE,
        ),
        # Profession/occupation
        "occupation": re.compile(
            r"(?:works as|profession:|occupation:|is a|as a) (?P<profession>[\w\s]+?)(?:\.|,|\n|$)",
            re.IGNORECASE,
        ),
    }

    # Known professions used to validate occupation matches
    KNOWN_PROFESSIONS = {
        "priest", "priestess", "mage", "wizard", "scholar", "blacksmith",
        "guard", "soldier", "warrior", "thief", "merchant", "innkeeper",
        "hunter", "farmer", "peasant", "noble", "jarl", "bard", "alchemist",
        "healer", "assassin", "spy", "courier", "carriage driver", "fisherman",
        "miller", "brewer", "smith", "armorer", "fletcher", "jeweler",
    }

    # One alternation over all known professions. A match must start at a
    # word boundary (so "mage" is not found inside "damaged", nor "smith"
    # inside "blacksmith"), and longer names are listed first so that
    # "priestess" wins over "priest" at the same position. Sorting also makes
    # the result independent of set iteration order.
    _PROFESSION_RE = re.compile(
        r"\b(?:"
        + "|".join(map(re.escape, sorted(KNOWN_PROFESSIONS, key=lambda p: (-len(p), p))))
        + r")"
    )

    # Race names in priority order: the first one found in a descriptor wins.
    _RACES = (
        "Nord", "Dunmer", "Dark Elf", "Altmer", "High Elf",
        "Bosmer", "Wood Elf", "Argonian", "Khajiit", "Breton",
        "Redguard", "Orsimer", "Orc", "Imperial",
    )

    # (normalized name, pattern) pairs. Patterns are anchored at the start of
    # a word only, so "Nords"/"Nordic" still count as Nord while a mid-word
    # hit such as the "orc" in "sorcerer" does not.
    _RACE_PATTERNS = tuple(
        (race.replace(" ", ""), re.compile(r"\b" + re.escape(race), re.IGNORECASE))
        for race in _RACES
    )

    # Tuning knobs for extract_conversation_context
    CONTEXT_WINDOW = 6  # how many trailing messages are inspected
    CONTEXT_MAX_CHARS = 500  # messages this long or longer are skipped
    CONTEXT_MAX_MESSAGES = 3  # how many messages make up the query

    @staticmethod
    def _content_text(message: dict) -> str:
        """Return the textual content of a chat message, or "" if there is none.

        Plain string content is returned as is. List content is treated as a
        sequence of parts, and the ``"text"`` values of dict parts are joined
        with newlines. Anything else yields an empty string.
        """
        if not isinstance(message, dict):
            return ""
        content = message.get("content")
        if isinstance(content, str):
            return content
        if isinstance(content, list):
            texts = (part.get("text") for part in content if isinstance(part, dict))
            return "\n".join(text for text in texts if isinstance(text, str) and text)
        return ""

    def _match_profession(self, text: str) -> str | None:
        """Return the first known profession mentioned in *text*, else None.

        Matching is case-insensitive and must begin at a word boundary.
        """
        found = self._PROFESSION_RE.search(text.lower())
        return found.group(0) if found else None

    def extract(self, messages: list[dict]) -> NPCProfile:
        """Extract NPC profile from chat messages.

        Args:
            messages: Chat messages; only each message's ``content`` is read.

        Returns:
            A new ``NPCProfile`` populated with whatever could be matched,
            after ``compute_knowledge_classes()`` has been called on it.
        """
        # Combine all message content for analysis
        texts = (self._content_text(msg) for msg in messages)
        full_text = "\n".join(text for text in texts if text)

        profile = NPCProfile()

        # Try bio header first (most reliable)
        if match := self.PATTERNS["bio_header"].search(full_text):
            profile.name = match.group("name").strip()
            profile.gender = match.group("gender").strip()
            profile.race = match.group("race").strip()
            logger.debug("Extracted from bio header", name=profile.name, race=profile.race)

        # Fallback to role intro
        elif match := self.PATTERNS["role_intro"].search(full_text):
            profile.name = match.group("name").strip()
            # Try to parse race from descriptor
            profile.race = self._extract_race_from_descriptor(match.group("descriptor"))
            logger.debug("Extracted from role intro", name=profile.name)

        # Extract location
        if match := self.PATTERNS["location"].search(full_text):
            profile.location = match.group("location").strip()

        # Extract factions (deduplicated, in order of first appearance)
        for match in self.PATTERNS["faction"].finditer(full_text):
            faction = match.group("faction").strip()
            if faction and faction not in profile.factions:
                profile.factions.append(faction)

        # Extract profession. Phrases like "is a" are common, so keep scanning
        # until one of the candidates actually names a known profession.
        for match in self.PATTERNS["occupation"].finditer(full_text):
            if profession := self._match_profession(match.group("profession")):
                profile.profession = profession
                break

        # Compute knowledge classes
        profile.compute_knowledge_classes()

        logger.info(
            "Extracted NPC profile",
            name=profile.name,
            race=profile.race,
            profession=profile.profession,
            factions=profile.factions,
            location=profile.location,
            knowledge_classes=profile.knowledge_classes,
            education_level=profile.education_level.value,
        )

        return profile

    def _extract_race_from_descriptor(self, descriptor: str) -> str:
        """Try to extract race from a descriptor string.

        Returns:
            The matched race with spaces removed (e.g. "Dark Elf" becomes
            "DarkElf"), or "Unknown" when no race name is found.
        """
        for normalized, pattern in self._RACE_PATTERNS:
            if pattern.search(descriptor):
                return normalized
        return "Unknown"

    def extract_conversation_context(self, messages: list[dict]) -> str:
        """Extract the current conversation topic for RAG query.

        Looks at the last ``CONTEXT_WINDOW`` messages and keeps the most
        recent ``CONTEXT_MAX_MESSAGES`` non-empty user/assistant messages
        shorter than ``CONTEXT_MAX_CHARS`` characters.

        Returns:
            The kept messages joined by single spaces in chronological order,
            or "" when nothing qualifies.
        """
        recent = []
        # Walk newest to oldest so the cap keeps the most recent messages.
        for msg in reversed(messages[-self.CONTEXT_WINDOW:]):
            text = self._content_text(msg)
            if not text or msg.get("role") not in ("user", "assistant"):
                continue
            # Skip very long content (likely system prompts)
            if len(text) >= self.CONTEXT_MAX_CHARS:
                continue
            recent.append(text)
            if len(recent) == self.CONTEXT_MAX_MESSAGES:
                break

        # Restore chronological order for the query text
        return " ".join(reversed(recent))