Add HyDE (Hypothetical Document Embeddings) for query expansion

HyDE generates a hypothetical document passage that would answer the user's
query, then embeds that alongside the original query. This bridges the gap
between how users describe what they're looking for and the actual document
terminology.

Changes:
- Add hyde.py with expand_query_hyde() function
- Integrate HyDE into search_chunks() pipeline
- Add ENABLE_HYDE_EXPANSION and HYDE_TIMEOUT settings
- Only expand queries with 4+ words (short queries are usually specific enough)
- Simple in-memory cache to avoid re-generating for repeated queries

Example:
- Query: "saying what you mean not using specific words"
- HyDE generates: "Clear communication requires expressing your thoughts
  directly and honestly, even when you lack the precise terminology..."
- This finds articles about word meaning and clear communication

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
mruwnik, 2025-12-20 16:01:03 +00:00
parent e414c3311c, commit c6cb793cdf
3 changed files with 178 additions and 1 deletion

@@ -0,0 +1,151 @@
"""
HyDE (Hypothetical Document Embeddings) for query expansion.

When users search with vague or conceptual queries like "that article about not using
specific words", the query embedding may not match the actual document well. HyDE
generates a hypothetical document that would answer the query, then embeds that
instead. This bridges the gap between query terminology and document terminology.

Example:
- Query: "saying what you mean not using specific words"
- HyDE generates: "An article discussing the technique of 'tabooing' words - replacing
  specific terms with their definitions to clarify thinking and avoid confused debates..."
- The hypothetical document embeds closer to the actual "Taboo Your Words" article.

Reference: https://arxiv.org/abs/2212.10496
"""
import asyncio
import logging
from typing import Optional

from memory.common import settings
from memory.common.llms import create_provider, LLMSettings, Message

logger = logging.getLogger(__name__)

# System prompt for generating hypothetical documents
HYDE_SYSTEM_PROMPT = """You are a search assistant helping to find documents in a knowledge base.

Given a user's search query, write a short passage (2-3 sentences) that would appear in a
document that answers their query. Write as if you are excerpting from an actual article.

Do NOT:
- Ask clarifying questions
- Say "I don't know" or "I'm not sure"
- Include meta-commentary like "This article discusses..."
- Use phrases like "The document might say..."

DO:
- Write in the style of the target document (article, blog post, book excerpt)
- Use specific terminology that would appear in such a document
- Be concise and direct
- Include key concepts and vocabulary related to the query"""

# Cache for recent HyDE expansions (simple in-memory cache)
_hyde_cache: dict[str, str] = {}
_CACHE_MAX_SIZE = 100


async def expand_query_hyde(
    query: str,
    model: Optional[str] = None,
    timeout: float = 5.0,
) -> Optional[str]:
    """
    Expand a query using HyDE (Hypothetical Document Embeddings).

    Generates a hypothetical document passage that would answer the query,
    which can then be embedded for better semantic matching.

    Args:
        query: The user's search query
        model: LLM model to use (defaults to SUMMARIZER_MODEL)
        timeout: Maximum time to wait for LLM response

    Returns:
        A hypothetical document passage, or None if generation fails/times out
    """
    # Check cache first
    cache_key = query.lower().strip()
    if cache_key in _hyde_cache:
        logger.debug(f"HyDE cache hit for: {query[:50]}...")
        return _hyde_cache[cache_key]

    try:
        provider = create_provider(model=model or settings.SUMMARIZER_MODEL)
        messages = [
            Message.user(text=f"Search query: {query}")
        ]
        llm_settings = LLMSettings(
            temperature=0.3,  # Lower temperature for more focused output
            max_tokens=200,  # Short passages only
        )

        # Run with timeout
        hypothetical_doc = await asyncio.wait_for(
            provider.agenerate(
                messages=messages,
                system_prompt=HYDE_SYSTEM_PROMPT,
                settings=llm_settings,
            ),
            timeout=timeout,
        )

        if hypothetical_doc:
            hypothetical_doc = hypothetical_doc.strip()

            # Cache the result
            if len(_hyde_cache) >= _CACHE_MAX_SIZE:
                # Simple eviction: clear half the cache
                keys_to_remove = list(_hyde_cache.keys())[:_CACHE_MAX_SIZE // 2]
                for key in keys_to_remove:
                    del _hyde_cache[key]
            _hyde_cache[cache_key] = hypothetical_doc

            logger.debug(f"HyDE expansion: '{query[:30]}...' -> '{hypothetical_doc[:50]}...'")
            return hypothetical_doc
    except asyncio.TimeoutError:
        logger.warning(f"HyDE expansion timed out for: {query[:50]}...")
    except Exception as e:
        logger.error(f"HyDE expansion failed: {e}")

    return None


async def get_hyde_chunks(
    query: str,
    model: Optional[str] = None,
    timeout: float = 5.0,
) -> list[str]:
    """
    Get both original query and HyDE-expanded version for embedding.

    Returns a list containing:
    1. The original query (always)
    2. The HyDE-expanded hypothetical document (if generation succeeds)

    This allows the search to match on both the literal query terms
    and the expanded semantic meaning.

    Args:
        query: The user's search query
        model: LLM model to use for HyDE expansion
        timeout: Maximum time to wait for HyDE generation

    Returns:
        List of strings to embed (original query + optional HyDE expansion)
    """
    chunks = [query]

    # Only expand queries that are vague/conceptual (more than a few words)
    # Short specific queries like "Taboo Your Words" don't need expansion
    word_count = len(query.split())
    if word_count >= 4:
        hyde_doc = await expand_query_hyde(query, model, timeout)
        if hyde_doc:
            chunks.append(hyde_doc)

    return chunks
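
For reference, a minimal usage sketch of the two entry points above. The import path is inferred from the search pipeline hunk below ("from memory.api.search.hyde import expand_query_hyde"); the demo() wrapper and the example query are illustrative only, and a configured LLM provider is assumed:

import asyncio

from memory.api.search.hyde import expand_query_hyde, get_hyde_chunks


async def demo() -> None:
    # Vague, conceptual query: 4+ words, so HyDE expansion applies
    query = "saying what you mean not using specific words"

    # The hypothetical passage, or None if generation fails or times out
    hyde_doc = await expand_query_hyde(query, timeout=3.0)
    print(hyde_doc)

    # [query] alone, or [query, hypothetical_doc]; both strings get embedded
    print(await get_hyde_chunks(query))


asyncio.run(demo())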

@@ -17,6 +17,9 @@ from memory.api.search import scorer
 if settings.ENABLE_BM25_SEARCH:
     from memory.api.search.bm25 import search_bm25_chunks

+if settings.ENABLE_HYDE_EXPANSION:
+    from memory.api.search.hyde import expand_query_hyde
+
 from memory.api.search.types import SearchConfig, SearchFilters, SearchResult

 logger = logging.getLogger(__name__)

@@ -87,14 +90,35 @@ async def search_chunks(
     Combines results using weighted score fusion, giving bonus to documents
     that match both semantically and lexically.

+    If HyDE is enabled, also generates a hypothetical document from the query
+    and includes it in the embedding search for better semantic matching.
     """
     # Search for more candidates than requested, fuse scores, then return top N
     # This helps find results that rank well in one method but not the other
     internal_limit = limit * CANDIDATE_MULTIPLIER

+    # Extract query text for HyDE expansion
+    search_data = list(data)  # Copy to avoid modifying original
+    if settings.ENABLE_HYDE_EXPANSION:
+        query_text = " ".join(
+            c for chunk in data for c in chunk.data if isinstance(c, str)
+        )
+        # Only expand queries with 4+ words (short queries are usually specific enough)
+        if len(query_text.split()) >= 4:
+            try:
+                hyde_doc = await expand_query_hyde(
+                    query_text, timeout=settings.HYDE_TIMEOUT
+                )
+                if hyde_doc:
+                    logger.debug(f"HyDE expansion: '{query_text[:30]}...' -> '{hyde_doc[:50]}...'")
+                    search_data.append(extract.DataChunk(data=[hyde_doc]))
+            except Exception as e:
+                logger.warning(f"HyDE expansion failed, using original query: {e}")
+
     # Run embedding search
     embedding_scores = await search_chunks_embeddings(
-        data, modalities, internal_limit, filters, timeout
+        search_data, modalities, internal_limit, filters, timeout
     )

     # Run BM25 search if enabled

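To make the wiring concrete, a small sketch of what the expansion step does to the embedding input. The extract import path is an assumption (the hunk only shows "extract.DataChunk"), and the chunk contents are invented for illustration:

from memory.common import extract  # assumed location of DataChunk

# Incoming search request: one chunk carrying the query text
data = [extract.DataChunk(data=["saying what you mean not using specific words"])]

# The pipeline joins every string item across all chunks into one query
query_text = " ".join(c for chunk in data for c in chunk.data if isinstance(c, str))

# On success, the hypothetical passage rides along as an extra chunk, so
# search_chunks_embeddings matches on both texts; the original `data` is untouched
search_data = list(data)
search_data.append(extract.DataChunk(data=["Clear communication requires expressing..."]))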
@@ -176,6 +176,8 @@ LLM_USAGE_REDIS_PREFIX = os.getenv("LLM_USAGE_REDIS_PREFIX", "llm_usage")
 ENABLE_EMBEDDING_SEARCH = boolean_env("ENABLE_EMBEDDING_SEARCH", True)
 ENABLE_BM25_SEARCH = boolean_env("ENABLE_BM25_SEARCH", True)
 ENABLE_SEARCH_SCORING = boolean_env("ENABLE_SEARCH_SCORING", True)
+ENABLE_HYDE_EXPANSION = boolean_env("ENABLE_HYDE_EXPANSION", True)
+HYDE_TIMEOUT = float(os.getenv("HYDE_TIMEOUT", "3.0"))

 MAX_PREVIEW_LENGTH = int(os.getenv("MAX_PREVIEW_LENGTH", DEFAULT_CHUNK_TOKENS * 16))
 MAX_NON_PREVIEW_LENGTH = int(os.getenv("MAX_NON_PREVIEW_LENGTH", 2000))
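
Both settings are plain environment variables, so HyDE can be switched off or given a tighter time budget without touching code. A sketch of an override, assuming boolean_env parses "false" as False (its name suggests this, but the hunk does not show the parser):

import os

# Must be set before memory.common.settings is first imported,
# since the module reads the environment at import time
os.environ["ENABLE_HYDE_EXPANSION"] = "false"  # assumption: parsed as False
os.environ["HYDE_TIMEOUT"] = "1.5"             # shorter LLM budget than the 3.0s default

from memory.common import settings  # noqa: E402

print(settings.ENABLE_HYDE_EXPANSION, settings.HYDE_TIMEOUT)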