Add HyDE (Hypothetical Document Embeddings) for query expansion

HyDE generates a hypothetical document passage that would answer the user's
query, then embeds that alongside the original query. This bridges the gap
between how users describe what they're looking for and the actual document
terminology.

Changes:
- Add hyde.py with expand_query_hyde() function
- Integrate HyDE into search_chunks() pipeline
- Add ENABLE_HYDE_EXPANSION and HYDE_TIMEOUT settings
- Only expand queries with 4+ words (short queries are usually specific enough)
- Simple in-memory cache to avoid re-generating for repeated queries

Example:
- Query: "saying what you mean not using specific words"
- HyDE generates: "Clear communication requires expressing your thoughts
  directly and honestly, even when you lack the precise terminology..."
- This finds articles about word meaning and clear communication

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
mruwnik, 2025-12-20 16:01:03 +00:00
parent e414c3311c, commit c6cb793cdf
3 changed files with 178 additions and 1 deletion

@@ -0,0 +1,151 @@
"""
HyDE (Hypothetical Document Embeddings) for query expansion.

When users search with vague or conceptual queries like "that article about not using
specific words", the query embedding may not match the actual document well. HyDE
generates a hypothetical document that would answer the query, then embeds that
instead. This bridges the gap between query terminology and document terminology.

Example:
- Query: "saying what you mean not using specific words"
- HyDE generates: "An article discussing the technique of 'tabooing' words - replacing
  specific terms with their definitions to clarify thinking and avoid confused debates..."
- The hypothetical document embeds closer to the actual "Taboo Your Words" article.

Reference: https://arxiv.org/abs/2212.10496
"""
import asyncio
import logging
from typing import Optional

from memory.common import settings
from memory.common.llms import create_provider, LLMSettings, Message

logger = logging.getLogger(__name__)

# System prompt for generating hypothetical documents
HYDE_SYSTEM_PROMPT = """You are a search assistant helping to find documents in a knowledge base.

Given a user's search query, write a short passage (2-3 sentences) that would appear in a
document that answers their query. Write as if you are excerpting from an actual article.

Do NOT:
- Ask clarifying questions
- Say "I don't know" or "I'm not sure"
- Include meta-commentary like "This article discusses..."
- Use phrases like "The document might say..."

DO:
- Write in the style of the target document (article, blog post, book excerpt)
- Use specific terminology that would appear in such a document
- Be concise and direct
- Include key concepts and vocabulary related to the query"""

# Cache for recent HyDE expansions (simple in-memory cache)
_hyde_cache: dict[str, str] = {}
_CACHE_MAX_SIZE = 100


async def expand_query_hyde(
    query: str,
    model: Optional[str] = None,
    timeout: float = 5.0,
) -> Optional[str]:
    """
    Expand a query using HyDE (Hypothetical Document Embeddings).

    Generates a hypothetical document passage that would answer the query,
    which can then be embedded for better semantic matching.

    Args:
        query: The user's search query
        model: LLM model to use (defaults to SUMMARIZER_MODEL)
        timeout: Maximum time to wait for LLM response

    Returns:
        A hypothetical document passage, or None if generation fails/times out
    """
    # Check cache first
    cache_key = query.lower().strip()
    if cache_key in _hyde_cache:
        logger.debug(f"HyDE cache hit for: {query[:50]}...")
        return _hyde_cache[cache_key]

    try:
        provider = create_provider(model=model or settings.SUMMARIZER_MODEL)
        messages = [
            Message.user(text=f"Search query: {query}")
        ]
        llm_settings = LLMSettings(
            temperature=0.3,  # Lower temperature for more focused output
            max_tokens=200,  # Short passages only
        )

        # Run with timeout
        hypothetical_doc = await asyncio.wait_for(
            provider.agenerate(
                messages=messages,
                system_prompt=HYDE_SYSTEM_PROMPT,
                settings=llm_settings,
            ),
            timeout=timeout,
        )

        if hypothetical_doc:
            hypothetical_doc = hypothetical_doc.strip()

            # Cache the result
            if len(_hyde_cache) >= _CACHE_MAX_SIZE:
                # Simple eviction: clear half the cache
                keys_to_remove = list(_hyde_cache.keys())[:_CACHE_MAX_SIZE // 2]
                for key in keys_to_remove:
                    del _hyde_cache[key]
            _hyde_cache[cache_key] = hypothetical_doc

            logger.debug(f"HyDE expansion: '{query[:30]}...' -> '{hypothetical_doc[:50]}...'")
            return hypothetical_doc
    except asyncio.TimeoutError:
        logger.warning(f"HyDE expansion timed out for: {query[:50]}...")
    except Exception as e:
        logger.error(f"HyDE expansion failed: {e}")

    return None


async def get_hyde_chunks(
    query: str,
    model: Optional[str] = None,
    timeout: float = 5.0,
) -> list[str]:
    """
    Get both original query and HyDE-expanded version for embedding.

    Returns a list containing:
    1. The original query (always)
    2. The HyDE-expanded hypothetical document (if generation succeeds)

    This allows the search to match on both the literal query terms
    and the expanded semantic meaning.

    Args:
        query: The user's search query
        model: LLM model to use for HyDE expansion
        timeout: Maximum time to wait for HyDE generation

    Returns:
        List of strings to embed (original query + optional HyDE expansion)
    """
    chunks = [query]

    # Only expand queries that are vague/conceptual (more than a few words)
    # Short specific queries like "Taboo Your Words" don't need expansion
    word_count = len(query.split())
    if word_count >= 4:
        hyde_doc = await expand_query_hyde(query, model, timeout)
        if hyde_doc:
            chunks.append(hyde_doc)

    return chunks
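
For reference, a minimal usage sketch of the two entry points above. The import path is inferred from the search pipeline hunk below ("from memory.api.search.hyde import expand_query_hyde"); the demo() wrapper and the example query are illustrative only, and a configured LLM provider is assumed:

import asyncio

from memory.api.search.hyde import expand_query_hyde, get_hyde_chunks


async def demo() -> None:
    # Vague, conceptual query: 4+ words, so HyDE expansion applies
    query = "saying what you mean not using specific words"

    # The hypothetical passage, or None if generation fails or times out
    hyde_doc = await expand_query_hyde(query, timeout=3.0)
    print(hyde_doc)

    # [query] alone, or [query, hypothetical_doc]; both strings get embedded
    print(await get_hyde_chunks(query))


asyncio.run(demo())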

@@ -17,6 +17,9 @@ from memory.api.search import scorer
 if settings.ENABLE_BM25_SEARCH:
     from memory.api.search.bm25 import search_bm25_chunks

+if settings.ENABLE_HYDE_EXPANSION:
+    from memory.api.search.hyde import expand_query_hyde
+
 from memory.api.search.types import SearchConfig, SearchFilters, SearchResult

 logger = logging.getLogger(__name__)

@@ -87,14 +90,35 @@ async def search_chunks(
     Combines results using weighted score fusion, giving bonus to documents
     that match both semantically and lexically.

+    If HyDE is enabled, also generates a hypothetical document from the query
+    and includes it in the embedding search for better semantic matching.
     """
     # Search for more candidates than requested, fuse scores, then return top N
     # This helps find results that rank well in one method but not the other
     internal_limit = limit * CANDIDATE_MULTIPLIER

+    # Extract query text for HyDE expansion
+    search_data = list(data)  # Copy to avoid modifying original
+    if settings.ENABLE_HYDE_EXPANSION:
+        query_text = " ".join(
+            c for chunk in data for c in chunk.data if isinstance(c, str)
+        )
+        # Only expand queries with 4+ words (short queries are usually specific enough)
+        if len(query_text.split()) >= 4:
+            try:
+                hyde_doc = await expand_query_hyde(
+                    query_text, timeout=settings.HYDE_TIMEOUT
+                )
+                if hyde_doc:
+                    logger.debug(f"HyDE expansion: '{query_text[:30]}...' -> '{hyde_doc[:50]}...'")
+                    search_data.append(extract.DataChunk(data=[hyde_doc]))
+            except Exception as e:
+                logger.warning(f"HyDE expansion failed, using original query: {e}")
+
     # Run embedding search
     embedding_scores = await search_chunks_embeddings(
-        data, modalities, internal_limit, filters, timeout
+        search_data, modalities, internal_limit, filters, timeout
     )

     # Run BM25 search if enabled

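To make the wiring concrete, a small sketch of what the expansion step does to the embedding input. The extract import path is an assumption (the hunk only shows "extract.DataChunk"), and the chunk contents are invented for illustration:

from memory.common import extract  # assumed location of DataChunk

# Incoming search request: one chunk carrying the query text
data = [extract.DataChunk(data=["saying what you mean not using specific words"])]

# The pipeline joins every string item across all chunks into one query
query_text = " ".join(c for chunk in data for c in chunk.data if isinstance(c, str))

# On success, the hypothetical passage rides along as an extra chunk, so
# search_chunks_embeddings matches on both texts; the original `data` is untouched
search_data = list(data)
search_data.append(extract.DataChunk(data=["Clear communication requires expressing..."]))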
@@ -176,6 +176,8 @@ LLM_USAGE_REDIS_PREFIX = os.getenv("LLM_USAGE_REDIS_PREFIX", "llm_usage")
 ENABLE_EMBEDDING_SEARCH = boolean_env("ENABLE_EMBEDDING_SEARCH", True)
 ENABLE_BM25_SEARCH = boolean_env("ENABLE_BM25_SEARCH", True)
 ENABLE_SEARCH_SCORING = boolean_env("ENABLE_SEARCH_SCORING", True)
+ENABLE_HYDE_EXPANSION = boolean_env("ENABLE_HYDE_EXPANSION", True)
+HYDE_TIMEOUT = float(os.getenv("HYDE_TIMEOUT", "3.0"))

 MAX_PREVIEW_LENGTH = int(os.getenv("MAX_PREVIEW_LENGTH", DEFAULT_CHUNK_TOKENS * 16))
 MAX_NON_PREVIEW_LENGTH = int(os.getenv("MAX_NON_PREVIEW_LENGTH", 2000))
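
Both settings are plain environment variables, so HyDE can be switched off or given a tighter time budget without touching code. A sketch of an override, assuming boolean_env parses "false" as False (its name suggests this, but the hunk does not show the parser):

import os

# Must be set before memory.common.settings is first imported,
# since the module reads the environment at import time
os.environ["ENABLE_HYDE_EXPANSION"] = "false"  # assumption: parsed as False
os.environ["HYDE_TIMEOUT"] = "1.5"             # shorter LLM budget than the 3.0s default

from memory.common import settings  # noqa: E402

print(settings.ENABLE_HYDE_EXPANSION, settings.HYDE_TIMEOUT)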