diff --git a/src/memory/api/search/hyde.py b/src/memory/api/search/hyde.py
new file mode 100644
index 0000000..ed523a2
--- /dev/null
+++ b/src/memory/api/search/hyde.py
@@ -0,0 +1,151 @@
+"""
+HyDE (Hypothetical Document Embeddings) for query expansion.
+
+When users search with vague or conceptual queries like "that article about not using
+specific words", the query embedding may not match the actual document well. HyDE
+generates a hypothetical document that would answer the query, then embeds that
+instead. This bridges the gap between query terminology and document terminology.
+
+Example:
+- Query: "saying what you mean not using specific words"
+- HyDE generates: "An article discussing the technique of 'tabooing' words - replacing
+  specific terms with their definitions to clarify thinking and avoid confused debates..."
+- The hypothetical document embeds closer to the actual "Taboo Your Words" article.
+
+Reference: https://arxiv.org/abs/2212.10496
+"""
+
+import asyncio
+import logging
+from typing import Optional
+
+from memory.common import settings
+from memory.common.llms import create_provider, LLMSettings, Message
+
+logger = logging.getLogger(__name__)
+
+# System prompt for generating hypothetical documents
+HYDE_SYSTEM_PROMPT = """You are a search assistant helping to find documents in a knowledge base.
+Given a user's search query, write a short passage (2-3 sentences) that would appear in a
+document that answers their query. Write as if you are excerpting from an actual article.
+
+Do NOT:
+- Ask clarifying questions
+- Say "I don't know" or "I'm not sure"
+- Include meta-commentary like "This article discusses..."
+- Use phrases like "The document might say..."
+
+DO:
+- Write in the style of the target document (article, blog post, book excerpt)
+- Use specific terminology that would appear in such a document
+- Be concise and direct
+- Include key concepts and vocabulary related to the query"""
+
+# Cache for recent HyDE expansions (simple in-memory cache)
+_hyde_cache: dict[str, str] = {}
+_CACHE_MAX_SIZE = 100
+
+
+async def expand_query_hyde(
+    query: str,
+    model: Optional[str] = None,
+    timeout: float = 5.0,
+) -> Optional[str]:
+    """
+    Expand a query using HyDE (Hypothetical Document Embeddings).
+
+    Generates a hypothetical document passage that would answer the query,
+    which can then be embedded for better semantic matching.
+
+    Args:
+        query: The user's search query
+        model: LLM model to use (defaults to SUMMARIZER_MODEL)
+        timeout: Maximum time to wait for the LLM response
+
+    Returns:
+        A hypothetical document passage, or None if generation fails/times out
+    """
+    # Check cache first
+    cache_key = query.lower().strip()
+    if cache_key in _hyde_cache:
+        logger.debug(f"HyDE cache hit for: {query[:50]}...")
+        return _hyde_cache[cache_key]
+
+    try:
+        provider = create_provider(model=model or settings.SUMMARIZER_MODEL)
+
+        messages = [
+            Message.user(text=f"Search query: {query}")
+        ]
+
+        llm_settings = LLMSettings(
+            temperature=0.3,  # Lower temperature for more focused output
+            max_tokens=200,  # Short passages only
+        )
+
+        # Run with timeout
+        hypothetical_doc = await asyncio.wait_for(
+            provider.agenerate(
+                messages=messages,
+                system_prompt=HYDE_SYSTEM_PROMPT,
+                settings=llm_settings,
+            ),
+            timeout=timeout,
+        )
+
+        if hypothetical_doc:
+            hypothetical_doc = hypothetical_doc.strip()
+
+            # Cache the result
+            if len(_hyde_cache) >= _CACHE_MAX_SIZE:
+                # Simple eviction: clear half the cache
+                keys_to_remove = list(_hyde_cache.keys())[:_CACHE_MAX_SIZE // 2]
+                for key in keys_to_remove:
+                    del _hyde_cache[key]
+            _hyde_cache[cache_key] = hypothetical_doc
+
+            logger.debug(f"HyDE expansion: '{query[:30]}...' -> '{hypothetical_doc[:50]}...'")
+            return hypothetical_doc
+
+    except asyncio.TimeoutError:
+        logger.warning(f"HyDE expansion timed out for: {query[:50]}...")
+    except Exception as e:
+        logger.error(f"HyDE expansion failed: {e}")
+
+    return None
+
+
+async def get_hyde_chunks(
+    query: str,
+    model: Optional[str] = None,
+    timeout: float = 5.0,
+) -> list[str]:
+    """
+    Get both the original query and the HyDE-expanded version for embedding.
+
+    Returns a list containing:
+    1. The original query (always)
+    2. The HyDE-expanded hypothetical document (if generation succeeds)
+
+    This allows the search to match on both the literal query terms
+    and the expanded semantic meaning.
+
+    Args:
+        query: The user's search query
+        model: LLM model to use for HyDE expansion
+        timeout: Maximum time to wait for HyDE generation
+
+    Returns:
+        List of strings to embed (original query + optional HyDE expansion)
+    """
+    chunks = [query]
+
+    # Only expand queries that are vague/conceptual (more than a few words)
+    # Short specific queries like "Taboo Your Words" don't need expansion
+    word_count = len(query.split())
+    if word_count >= 4:
+        hyde_doc = await expand_query_hyde(query, model, timeout)
+        if hyde_doc:
+            chunks.append(hyde_doc)
+
+    return chunks
diff --git a/src/memory/api/search/search.py b/src/memory/api/search/search.py
index 00dcb8f..ba7f796 100644
--- a/src/memory/api/search/search.py
+++ b/src/memory/api/search/search.py
@@ -17,6 +17,9 @@ from memory.api.search import scorer
 if settings.ENABLE_BM25_SEARCH:
     from memory.api.search.bm25 import search_bm25_chunks
 
+if settings.ENABLE_HYDE_EXPANSION:
+    from memory.api.search.hyde import expand_query_hyde
+
 from memory.api.search.types import SearchConfig, SearchFilters, SearchResult
 
 logger = logging.getLogger(__name__)
@@ -87,14 +90,35 @@ async def search_chunks(
     Combines results using weighted score fusion, giving bonus to documents
     that match both semantically and lexically.
+
+    If HyDE is enabled, also generates a hypothetical document from the query
+    and includes it in the embedding search for better semantic matching.
""" # Search for more candidates than requested, fuse scores, then return top N # This helps find results that rank well in one method but not the other internal_limit = limit * CANDIDATE_MULTIPLIER + # Extract query text for HyDE expansion + search_data = list(data) # Copy to avoid modifying original + if settings.ENABLE_HYDE_EXPANSION: + query_text = " ".join( + c for chunk in data for c in chunk.data if isinstance(c, str) + ) + # Only expand queries with 4+ words (short queries are usually specific enough) + if len(query_text.split()) >= 4: + try: + hyde_doc = await expand_query_hyde( + query_text, timeout=settings.HYDE_TIMEOUT + ) + if hyde_doc: + logger.debug(f"HyDE expansion: '{query_text[:30]}...' -> '{hyde_doc[:50]}...'") + search_data.append(extract.DataChunk(data=[hyde_doc])) + except Exception as e: + logger.warning(f"HyDE expansion failed, using original query: {e}") + # Run embedding search embedding_scores = await search_chunks_embeddings( - data, modalities, internal_limit, filters, timeout + search_data, modalities, internal_limit, filters, timeout ) # Run BM25 search if enabled diff --git a/src/memory/common/settings.py b/src/memory/common/settings.py index dce52f0..0b4f457 100644 --- a/src/memory/common/settings.py +++ b/src/memory/common/settings.py @@ -176,6 +176,8 @@ LLM_USAGE_REDIS_PREFIX = os.getenv("LLM_USAGE_REDIS_PREFIX", "llm_usage") ENABLE_EMBEDDING_SEARCH = boolean_env("ENABLE_EMBEDDING_SEARCH", True) ENABLE_BM25_SEARCH = boolean_env("ENABLE_BM25_SEARCH", True) ENABLE_SEARCH_SCORING = boolean_env("ENABLE_SEARCH_SCORING", True) +ENABLE_HYDE_EXPANSION = boolean_env("ENABLE_HYDE_EXPANSION", True) +HYDE_TIMEOUT = float(os.getenv("HYDE_TIMEOUT", "3.0")) MAX_PREVIEW_LENGTH = int(os.getenv("MAX_PREVIEW_LENGTH", DEFAULT_CHUNK_TOKENS * 16)) MAX_NON_PREVIEW_LENGTH = int(os.getenv("MAX_NON_PREVIEW_LENGTH", 2000))