From 60e6e18284805621ec64f66560d0708a82fbfc0b Mon Sep 17 00:00:00 2001
From: mruwnik
Date: Sun, 21 Dec 2025 13:32:10 +0000
Subject: [PATCH] Add modality detection and family term expansion for search
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add useModalityDetection config flag to detect content type hints from
  natural language queries (e.g., "on lesswrong" → forum, "comic about" → comic)
- Strip meta-language noise from queries ("there was something about")
- Add family term expansion (father ↔ son, parent ↔ child, etc.)
- Modality detection is off by default, configurable per-request
- Keep the original query text when cleaning would leave it empty

TODO: Replace regex-based detection with LLM-based query analysis (Haiku)
that can run in parallel with HyDE for better accuracy.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5
---
 src/memory/api/search/search.py | 104 +++++++++++++++++++++++++++++++-
 src/memory/api/search/types.py  |   1 +
 2 files changed, 104 insertions(+), 1 deletion(-)

diff --git a/src/memory/api/search/search.py b/src/memory/api/search/search.py
index 33b9a13..1476c3b 100644
--- a/src/memory/api/search/search.py
+++ b/src/memory/api/search/search.py
@@ -101,8 +101,91 @@ QUERY_EXPANSIONS: dict[str, list[str]] = {
     "neural network": ["nn"],
     "effective altruism": ["ea"],
     "existential risk": ["x-risk", "xrisk"],
+    # Family relationships (bidirectional)
+    "father": ["son", "daughter", "child", "parent", "dad"],
+    "mother": ["son", "daughter", "child", "parent", "mom"],
+    "parent": ["child", "son", "daughter", "father", "mother"],
+    "son": ["father", "parent", "child"],
+    "daughter": ["mother", "parent", "child"],
+    "child": ["parent", "father", "mother"],
+    "dad": ["father", "son", "daughter", "child"],
+    "mom": ["mother", "son", "daughter", "child"],
 }
 
+# Modality detection patterns: map query phrases to collection names
+# Each entry is (pattern, modalities, strip_pattern)
+# - pattern: regex to match in query
+# - modalities: set of collection names to filter to
+# - strip_pattern: whether to remove the matched text from query
+MODALITY_PATTERNS: list[tuple[str, set[str], bool]] = [
+    # Comics
+    (r"\b(comic|comics|webcomic|webcomics)\b", {"comic"}, True),
+    # Forum posts (LessWrong, EA Forum, etc.)
+    (r"\b(on\s+)?(lesswrong|lw|less\s+wrong)\b", {"forum"}, True),
+    (r"\b(on\s+)?(ea\s+forum|effective\s+altruism\s+forum)\b", {"forum"}, True),
+    (r"\b(on\s+)?(alignment\s+forum|af)\b", {"forum"}, True),
+    (r"\b(forum\s+post|lw\s+post|post\s+on)\b", {"forum"}, True),
+    # Books
+    (r"\b(in\s+a\s+book|in\s+the\s+book|book|chapter)\b", {"book"}, True),
+    # Blog posts / articles
+    (r"\b(blog\s+post|blog|article)\b", {"blog"}, True),
+    # Email
+    (r"\b(email|e-mail|mail)\b", {"mail"}, True),
+    # Photos / images
+    (r"\b(photo|photograph|picture|image)\b", {"photo"}, True),
+    # Documents
+    (r"\b(document|pdf|doc)\b", {"doc"}, True),
+    # Chat / messages
+    (r"\b(chat|message|discord|slack)\b", {"chat"}, True),
+    # Git
+    (r"\b(commit|git|pull\s+request|pr)\b", {"git"}, True),
+]
+
+# Meta-language patterns to strip (these don't indicate modality, just noise)
+META_LANGUAGE_PATTERNS: list[str] = [
+    r"\bthere\s+was\s+(something|some|some\s+\w+|an?\s+\w+)\s+(about|on)\b",
+    r"\bi\s+remember\s+(reading|seeing|there\s+being)\s*(an?\s+)?",
+    r"\bi\s+(read|saw|found)\s+(something|an?\s+\w+)\s+about\b",
+    r"\bsomething\s+about\b",
+    r"\bsome\s+about\b",
+    r"\bthis\s+whole\s+\w+\s+thing\b",
+    r"\bthat\s+\w+\s+thing\b",
+    r"\bthat\s+about\b",  # Clean up leftover "that about"
+    r"\ba\s+about\b",  # Clean up leftover "a about"
+    r"\bthe\s+about\b",  # Clean up leftover "the about"
+    r"\bthere\s+was\s+some\s+about\b",  # Clean up leftover
+]
+
+
+def detect_modality_hints(query: str) -> tuple[str, set[str]]:
+    """
+    Detect content type hints in query and extract modalities.
+
+    Returns:
+        (cleaned_query, detected_modalities)
+        - cleaned_query: query with modality indicators and meta-language removed
+        - detected_modalities: set of collection names detected from query
+    """
+    detected: set[str] = set()
+    cleaned = query
+
+    # First, detect and strip modality patterns. Detection always runs against
+    # the original query so that an earlier strip cannot hide a later hint.
+    for pattern, modalities, strip in MODALITY_PATTERNS:
+        if re.search(pattern, query, re.IGNORECASE):
+            detected.update(modalities)
+            if strip:
+                cleaned = re.sub(pattern, " ", cleaned, flags=re.IGNORECASE)
+
+    # Strip meta-language patterns (regardless of modality detection)
+    for pattern in META_LANGUAGE_PATTERNS:
+        cleaned = re.sub(pattern, " ", cleaned, flags=re.IGNORECASE)
+
+    # Clean up whitespace
+    cleaned = " ".join(cleaned.split())
+
+    return cleaned, detected
+
 
 def expand_query(query: str) -> str:
     """
@@ -332,22 +415,41 @@ async def search_chunks(
     - useHyde: Enable HyDE query expansion
     - useReranking: Enable cross-encoder reranking
     - useQueryExpansion: Enable synonym/abbreviation expansion
+    - useModalityDetection: Detect content type hints from query
     """
     # Resolve enhancement flags: config overrides global settings
     use_bm25 = config.useBm25 if config.useBm25 is not None else settings.ENABLE_BM25_SEARCH
     use_hyde = config.useHyde if config.useHyde is not None else settings.ENABLE_HYDE_EXPANSION
     use_reranking = config.useReranking if config.useReranking is not None else settings.ENABLE_RERANKING
     use_query_expansion = config.useQueryExpansion if config.useQueryExpansion is not None else True
+    use_modality_detection = config.useModalityDetection if config.useModalityDetection is not None else False
 
     # Search for more candidates than requested, fuse scores, then return top N
     # This helps find results that rank well in one method but not the other
     internal_limit = limit * CANDIDATE_MULTIPLIER
 
-    # Extract query text and apply synonym/abbreviation expansion
+    # Extract query text
     query_text = " ".join(
         c for chunk in data for c in chunk.data if isinstance(c, str)
     )
 
+    # Detect modality hints and clean query if enabled
+    if use_modality_detection:
+        cleaned_query, detected_modalities = detect_modality_hints(query_text)
+        if detected_modalities:
+            # Override passed modalities with detected ones
+            modalities = detected_modalities
+            logger.debug("Modality detection: '%s...' -> modalities=%s", query_text[:50], detected_modalities)
+        # Keep the original text when stripping would leave an empty query
+        # (e.g. the whole query was a modality hint such as "comics").
+        if cleaned_query and cleaned_query != query_text:
+            logger.debug("Query cleaning: '%s...' -> '%s...'", query_text[:50], cleaned_query[:50])
+            query_text = cleaned_query
+            # Update data with cleaned query for downstream processing
+            # NOTE(review): this replaces `data` with a single text chunk, so any
+            # non-string items in the original chunks are dropped - confirm intended.
+            data = [extract.DataChunk(data=[cleaned_query])]
+
     if use_query_expansion:
         expanded_query = expand_query(query_text)
         # If query was expanded, use expanded version for search
diff --git a/src/memory/api/search/types.py b/src/memory/api/search/types.py
index f3bc63a..462db56 100644
--- a/src/memory/api/search/types.py
+++ b/src/memory/api/search/types.py
@@ -88,6 +88,7 @@ class SearchConfig(BaseModel):
     useHyde: Optional[bool] = None
     useReranking: Optional[bool] = None
     useQueryExpansion: Optional[bool] = None
+    useModalityDetection: Optional[bool] = None
 
     def model_post_init(self, __context) -> None:
         # Enforce reasonable limits