Improve RAG search quality with PostgreSQL FTS and hybrid scoring
Major changes:

- Replace OOM-causing in-memory BM25 with PostgreSQL full-text search
- Add tsvector column and GIN index for fast keyword search
- Implement hybrid score fusion (70% embedding + 30% FTS + 15% bonus)
- Add CANDIDATE_MULTIPLIER (5x) to search more candidates before fusion
- Add stopword filtering to FTS queries for less strict matching
- Make search limit configurable (default 20, max 100)
- Propagate relevance scores through the search pipeline

Search improvements:

- "clowns iconoclasts" → finds target at rank 1 (score 0.815)
- "replacing words with definitions" → finds target at rank 1
- Vague queries now find results with limit=30 that were previously missed

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Parent: f2161e09f3
Commit: e414c3311c
db/migrations/versions/20251220_130000_add_chunk_fulltext_search.py

@@ -0,0 +1,79 @@
"""Add full-text search to chunks

Revision ID: a1b2c3d4e5f6
Revises: 89861d5f1102
Create Date: 2025-12-20 13:00:00.000000

"""

from typing import Sequence, Union

from alembic import op


# revision identifiers, used by Alembic.
revision: str = "a1b2c3d4e5f6"
down_revision: Union[str, None] = "89861d5f1102"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    # Add tsvector column for full-text search
    op.execute(
        """
        ALTER TABLE chunk
        ADD COLUMN IF NOT EXISTS search_vector tsvector
        """
    )

    # Create GIN index for fast full-text search
    op.execute(
        """
        CREATE INDEX IF NOT EXISTS chunk_search_idx
        ON chunk USING GIN (search_vector)
        """
    )

    # Create function to generate search vector from content
    op.execute(
        """
        CREATE OR REPLACE FUNCTION chunk_search_vector_update()
        RETURNS trigger AS $$
        BEGIN
            IF NEW.content IS NOT NULL THEN
                NEW.search_vector := to_tsvector('english', NEW.content);
            END IF;
            RETURN NEW;
        END;
        $$ LANGUAGE plpgsql
        """
    )

    # Create trigger to auto-update search_vector on insert/update
    op.execute(
        """
        DROP TRIGGER IF EXISTS chunk_search_vector_trigger ON chunk;
        CREATE TRIGGER chunk_search_vector_trigger
        BEFORE INSERT OR UPDATE OF content ON chunk
        FOR EACH ROW
        EXECUTE FUNCTION chunk_search_vector_update()
        """
    )

    # Populate search_vector for existing rows.
    # Note: this is a single UPDATE over the whole table; on very large tables,
    # consider batching to avoid long locks and statement timeouts.
    op.execute(
        """
        UPDATE chunk
        SET search_vector = to_tsvector('english', content)
        WHERE content IS NOT NULL AND search_vector IS NULL
        """
    )


def downgrade() -> None:
    op.execute("DROP TRIGGER IF EXISTS chunk_search_vector_trigger ON chunk")
    op.execute("DROP FUNCTION IF EXISTS chunk_search_vector_update()")
    op.execute("DROP INDEX IF EXISTS chunk_search_idx")
    op.execute("ALTER TABLE chunk DROP COLUMN IF EXISTS search_vector")
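The backfill above is one UPDATE over all 250K rows. If that ever proves too heavy in practice (long locks or statement timeouts), a batched variant is possible; this is a sketch only, not part of the migration, and `BATCH_SIZE` plus the helper name are illustrative:

```python
# Hypothetical batched backfill - not part of the migration above.
# Each UPDATE touches at most BATCH_SIZE rows, keeping locks short.
from sqlalchemy import text

BATCH_SIZE = 10_000

def backfill_search_vectors(connection) -> None:
    while True:
        result = connection.execute(
            text(
                """
                UPDATE chunk
                SET search_vector = to_tsvector('english', content)
                WHERE id IN (
                    SELECT id FROM chunk
                    WHERE content IS NOT NULL AND search_vector IS NULL
                    LIMIT :batch
                )
                """
            ),
            {"batch": BATCH_SIZE},
        )
        if result.rowcount == 0:  # nothing left to backfill
            break
```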
docs/SEARCH_INVESTIGATION.md (new file, 437 lines)

@@ -0,0 +1,437 @@
# RAG Search Quality Investigation

## Summary

Investigation into why RAG search results "often aren't that good" when trying to find things from partial or vague memories.

**Date:** 2025-12-20
**Status:** Significant Progress Made

### Key Findings

1. **BM25 keyword search was broken** - Caused OOM with 250K chunks. ✅ FIXED: Replaced with PostgreSQL full-text search.

2. **Embeddings can't find "mentioned in passing" content** - The query "engineer fail-safe" ranks an article about humility (which mentions engineers only as an example) at position 140 out of 145K. Articles specifically about engineering rank higher.

3. **Score propagation was broken** - ✅ FIXED: Scores now flow through the pipeline properly.

4. **Chunk sizes are inconsistent** - Some chunks are 3MB (books), some are 3 bytes. Large chunks have diluted embeddings.

5. **"Half-remembered" queries don't match article keywords** - The user describes a concept, but the article uses different terminology, e.g. "not using specific words" vs "taboo your words".

### What Works Now

- **Keyword-matching queries**: "clowns iconoclasts" → finds "Lonely Dissent" at rank 1 (score 0.815)
- **Direct concept queries**: "replacing words with definitions" → finds "Taboo Your Words" at rank 1
- **Hybrid search**: Results appearing in both embedding + FTS get a 15% bonus

### Remaining Challenges

- **Conceptual queries**: "saying what you mean not using specific words" → the target ranks 23rd (needs to be in the top 10)
  - The query describes the *effect*, the article describes the *technique*
  - Needs query expansion (HyDE) to bridge the semantic gap

### Recommended Fix Priority

1. **Implement PostgreSQL full-text search** - ✅ DONE
2. **Add candidate pool multiplier** - ✅ DONE (5x internal limit)
3. **Add stopword filtering** - ✅ DONE
4. **Re-chunk oversized content** - Max 512 tokens, with context
5. **Implement HyDE query expansion** - For vague/conceptual queries

---
## PostgreSQL Full-Text Search Implementation (2025-12-20)

### Changes Made

1. **Created migration** `db/migrations/versions/20251220_130000_add_chunk_fulltext_search.py`
   - Added a `search_vector` tsvector column to the chunk table
   - Created a GIN index for fast search
   - Added a trigger to auto-update search_vector on insert/update
   - Populated the existing 250K chunks with search vectors

2. **Rewrote bm25.py** to use PostgreSQL full-text search
   - Removed the in-memory BM25 that caused OOM
   - Uses `ts_rank()` for relevance scoring
   - Uses AND matching with prefix wildcards: `engineer:* & fail:* & safe:*`
   - Normalizes scores to the 0-1 range

3. **Added the search_vector column** to the Chunk model in SQLAlchemy

### Test Results

For the query "engineer fail safe":
- PostgreSQL FTS returns 100 results without OOM
- Source 157 (humility article) chunks rank **25th and 26th** (vs not appearing at all before)
- Search completes in ~100ms (vs an OOM crash before)

### Hybrid Search Flow

With keyword search now working via PostgreSQL FTS, the hybrid search combines:
- Embedding search (70% weight) - finds semantically similar content
- Full-text search (30% weight) - finds exact keyword matches
- A +15% bonus for results appearing in both

This should significantly improve "half-remembered" searches where users recall specific words that appear in the article.

---
## Issues Fixed (This Session)

### 1. Scores Were Being Discarded (CRITICAL)

**Problem:** Both embedding and BM25 searches computed relevance scores but threw them away, returning only chunk IDs.

**Files Changed:**
- `src/memory/api/search/embeddings.py` - Now returns `dict[str, float]` (chunk_id -> score)
- `src/memory/api/search/bm25.py` - Now returns normalized scores (0-1 range)
- `src/memory/api/search/search.py` - Added `fuse_scores()` for hybrid ranking
- `src/memory/api/search/types.py` - Changed from mean to max chunk score

**Before:** All `search_score` values were 0.000
**After:** Meaningful scores like 0.443, 0.503, etc.

### 2. Score Fusion Implemented

Added a weighted combination of embedding (70%) + BM25 (30%) scores, with a 15% bonus for results appearing in both searches:

```python
EMBEDDING_WEIGHT = 0.7
BM25_WEIGHT = 0.3
HYBRID_BONUS = 0.15
```
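As a worked example, a chunk scoring 0.6 on embeddings and 0.8 on FTS fuses to 0.7 * 0.6 + 0.3 * 0.8 + 0.15 = 0.81 (capped at 1.0). A minimal sketch of the rule, with `None` standing in for "not returned by that method":

```python
def fuse(emb: float | None, fts: float | None) -> float:
    """Minimal sketch of the fusion rule described above."""
    combined = 0.7 * (emb or 0.0) + 0.3 * (fts or 0.0)
    if emb is not None and fts is not None:  # found by both methods
        combined = min(1.0, combined + 0.15)
    return combined

assert abs(fuse(0.6, 0.8) - 0.81) < 1e-9   # hybrid hit gets the bonus
assert abs(fuse(0.6, None) - 0.42) < 1e-9  # embedding-only result
```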
### 3. Changed from Mean to Max Chunk Score

**Before:** Documents with many chunks were penalized (averaging diluted scores)
**After:** Uses the max chunk score - finds documents with at least one highly relevant section

---
## Current Issues Identified

### Issue 1: BM25 is Disabled AND Causes OOM

**Finding:** `ENABLE_BM25_SEARCH=False` in docker-compose.yaml

**Impact:** Keyword matching doesn't work. Queries like "engineer fail-safe" won't find articles containing those exact words unless the embedding similarity is high enough.

**When Enabled:** BM25 causes an OOM crash:
- The database has 250,048 chunks total
- The forum collection alone has 147,546 chunks
- The BM25 implementation loads ALL chunks into memory and builds the index on each query
- The container is killed (exit code 137) when attempting a BM25 search

**Root Cause:** The current BM25 implementation in `bm25.py` is not scalable:

```python
items = items_query.all()  # Loads ALL chunks into memory
corpus = [item.content.lower().strip() for item in items]  # Copies all content
retriever.index(corpus_tokens)  # Builds index from scratch each query
```

**Recommendation:**
1. Build a persistent BM25 index (store on disk, load once)
2. Or use PostgreSQL full-text search instead (see the sketch after this list)
3. Or limit BM25 to smaller collections only
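A minimal sketch of option 2, which is the approach this commit ultimately takes. The `search_vector` column and operators come from the migration above; the helper name and session wiring are illustrative:

```python
# Sketch only: push keyword matching into PostgreSQL instead of Python.
# Assumes the search_vector tsvector column + GIN index from the migration.
from sqlalchemy import text

FTS_SQL = text(
    """
    SELECT id, ts_rank(search_vector, query) AS rank
    FROM chunk, to_tsquery('english', :tsquery) AS query
    WHERE search_vector @@ query
    ORDER BY rank DESC
    LIMIT :limit
    """
)

def keyword_search(session, tsquery: str, limit: int = 100):
    # e.g. tsquery = "engineer:* & fail:* & safe:*"
    return session.execute(FTS_SQL, {"tsquery": tsquery, "limit": limit}).all()
```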
### Issue 2: Embeddings Capture Theme, Not Details

**Test Case:** Article 157, about "humility in science", contains an example about engineers designing fail-safe mechanisms.

| Query | Result |
|-------|--------|
| "humility in science creationist evolution" | Rank 1, score 0.475 |
| "types of humility epistemic" | Rank 1, score 0.443 |
| "being humble about scientific knowledge" | Rank 1, score 0.483 |
| "engineer fail-safe mechanisms humble design" | Not in top 10 |
| "student double-checks math test answers" | Not in top 10 |
| "creationism debate" | Not in top 10 |

**Analysis:**
- The query "engineer fail-safe" has 0.52 cosine similarity to the target chunks
- Other documents in the corpus have 0.61+ similarity to that query
- The embedding captures the article's main theme (humility) but not incidental details (the engineer example)

**Root Cause:** Embeddings are designed to capture the semantic meaning of the whole chunk. Brief examples or mentions don't dominate the embedding.

### Issue 3: Chunk Context May Be Insufficient

**Finding:** The article's "engineer fail-safe" example appears in chunks, but:
- Some chunks are cut mid-word (e.g., "fail-s" instead of "fail-safe")
- The engineer example may lack surrounding context

**Chunk Analysis for Article 157:**
- 7 chunks total
- Chunks containing "engineer": 2 (chunks 2 and 6)
- Chunk 2 ends with "fail-s" (the word is cut off)
- The engineer example is brief (~2 sentences) within larger chunks about humility

---
## Embedding Similarity Analysis

For the query "engineer fail-safe mechanisms humble design":

| Chunk | Similarity | Content Preview |
|-------|------------|-----------------|
| 3097f4d6 | 0.522 | "It is widely recognized that good science requires..." |
| db87f54d | 0.486 | "It is widely recognized that good science requires..." |
| f3e97d77 | 0.462 | "You'd still double-check your calculations..." |
| 9153d1f5 | 0.435 | "They ought to be more humble..." |
| 3375ae64 | 0.424 | "Dennett suggests that much 'religious belief'..." |
| 047e7a9a | 0.353 | Summary chunk |
| 80ff7a03 | 0.267 | References chunk |

**Problem:** Top results in the forum collection score 0.61+, so these 0.52 scores don't make the cut.

---
## Recommendations

### High Priority

1. **Enable BM25 Search**
   - Set `ENABLE_BM25_SEARCH=True`
   - This will find keyword matches that embeddings miss
   - Score fusion to combine the results is already implemented

2. **Lower Embedding Threshold for Text Collections**
   - Current: 0.25 minimum score
   - Consider: 0.20 to catch more marginal matches
   - Trade-off: may increase noise

3. **Increase Search Limit Before Fusion**
   - Current: uses the same `limit` for both embedding and BM25
   - Consider: search for 2-3x more candidates, then fuse and return the top N

### Medium Priority

4. **Implement Query Expansion / HyDE** (see the sketch after this list)
   - For vague queries, generate a hypothetical answer and embed that
   - Example: "engineer fail-safe" -> generate "An article discussing how engineers design fail-safe mechanisms as an example of good humility..."

5. **Improve Chunking Overlap**
   - Ensure examples carry context from surrounding paragraphs
   - Consider semantic chunking (split on topic changes, not just size)

6. **Add Document-Level Context to Chunks**
   - Prepend the document title/summary to each chunk before embedding
   - Helps chunks maintain a connection to the main theme
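A minimal sketch of the HyDE idea from item 4. Both `generate_text` and `embed` are placeholders for whatever LLM and embedding helpers the pipeline ends up using; nothing here exists in the codebase yet:

```python
# Hypothetical HyDE query expansion. generate_text() and embed() are
# stand-ins for the project's actual LLM and embedding calls.
async def hyde_expand(query: str) -> list[float]:
    # Ask an LLM to write the passage the user half-remembers...
    hypothetical_doc = await generate_text(
        f"Write a short passage from an article that would answer: {query}"
    )
    # ...then search with the embedding of that passage instead of the raw query.
    return await embed(hypothetical_doc)
```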
### Lower Priority

7. **Tune Fusion Weights**
   - Current: 70% embedding, 30% BM25
   - May need adjustment based on use case

8. **Add Temporal Decay**
   - Prefer recent content for certain query types
---

## Architectural Issues

### Issue A: BM25 Implementation is Not Scalable

The current BM25 implementation cannot handle 250K chunks:

```python
# Current approach (in bm25.py):
items = items_query.all()  # Loads ALL matching chunks into memory
corpus = [item.content.lower().strip() for item in items]  # Makes copies
retriever.index(corpus_tokens)  # Rebuilds index from scratch per query
```

**Why this fails:**
- 147K forum chunks × ~3KB avg = ~440MB just for the text
- Plus tokenization and BM25 index structures → OOM

**Solutions (in order of recommendation):**

1. **PostgreSQL Full-Text Search** (Recommended)
   - PostgreSQL is already in the stack
   - Add a `tsvector` column to the Chunk table
   - Create a GIN index for fast search
   - Use `ts_rank` for relevance scoring
   - No additional infrastructure needed

2. **Persistent BM25 Index** (see the sketch after this list)
   - Build the index once at ingestion time
   - Store it on disk, load once at startup
   - Update incrementally on new chunks
   - More complex to maintain

3. **External Search Engine**
   - Elasticsearch or Meilisearch
   - Adds operational complexity
   - May be overkill for the current scale
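For completeness, a sketch of what option 2 could look like with the same `bm25s` library the current implementation uses; the paths and wiring are illustrative, not existing code:

```python
# Sketch of option 2: build the bm25s index once at ingestion and persist it,
# instead of rebuilding it on every query.
import bm25s
import Stemmer

stemmer = Stemmer.Stemmer("english")

def build_index(corpus: list[str], path: str = "bm25_index") -> None:
    retriever = bm25s.BM25()
    retriever.index(bm25s.tokenize(corpus, stopwords="en", stemmer=stemmer))
    retriever.save(path)  # one-time cost at ingestion

def load_index(path: str = "bm25_index") -> bm25s.BM25:
    # mmap keeps the index on disk rather than loading it all into memory
    return bm25s.BM25.load(path, mmap=True)
```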
### Issue B: Chunk Size Variance

Chunks range from 3 bytes to 3.3MB. This causes:
- Large chunks to have diluted embeddings
- Small chunks to lack context
- Inconsistent search quality across collections

**Solution:** Re-chunk existing content with (see the sketch below):
- Max ~512 tokens per chunk (optimal for embeddings)
- 50-100 token overlap between chunks
- The document title/context prepended to each chunk
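A rough sketch of that re-chunking policy. Whitespace "tokens" stand in for real tokenizer output, and the function is illustrative rather than the project's actual chunker:

```python
# Illustrative re-chunker: ~512-token windows with 75-token overlap,
# each prefixed with the document title for context.
def rechunk(title: str, body: str, max_tokens: int = 512, overlap: int = 75) -> list[str]:
    tokens = body.split()  # stand-in for a real tokenizer
    chunks = []
    step = max_tokens - overlap
    for start in range(0, len(tokens), step):
        window = tokens[start : start + max_tokens]
        chunks.append(f"{title}\n\n{' '.join(window)}")
        if start + max_tokens >= len(tokens):
            break
    return chunks
```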
### Issue C: Search Timeout (2 seconds)

The default 2-second timeout is too aggressive for:
- Large collections (147K forum chunks)
- A cold Qdrant cache
- Network latency

**Solution:** Increase it to 5-10 seconds for the initial search, with a progressive-loading UX.

---
## Test Queries for Validation

After making changes, test with these queries against article 157:

```python
# Should find article 157 (humility in science)
test_cases = [
    # Main topic - currently working
    ("humility in science", "main topic"),
    ("types of humility epistemic", "topic area"),

    # Specific examples - currently failing
    ("engineer fail-safe mechanisms", "specific example"),
    ("student double-checks math test", "specific example"),

    # Tangential mentions - currently failing
    ("creationism debate", "mentioned topic"),

    # Vague/half-remembered - currently failing
    ("checking your work", "vague concept"),
    ("when engineers make mistakes", "tangential"),
]
```
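A tiny harness for these cases might look like the sketch below. `search` is a placeholder for whatever entry point is convenient (e.g. the `search_knowledge_base` MCP tool), and results are assumed to expose a `source_id`:

```python
# Hypothetical validation harness; search() and result.source_id are assumptions.
async def run_validation(search, target_source_id: int = 157, limit: int = 10):
    for query, kind in test_cases:
        results = await search(query, limit=limit)
        ranks = [i for i, r in enumerate(results, 1) if r.source_id == target_source_id]
        status = f"rank {ranks[0]}" if ranks else f"not in top {limit}"
        print(f"{kind:20} {query!r}: {status}")
```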
---

## Session Log

### 2025-12-20

1. **Initial Investigation**
   - Found scores were all 0.000
   - Traced to embeddings.py and bm25.py discarding scores

2. **Fixed Score Propagation**
   - Modified 4 files to preserve and fuse scores
   - Rebuilt Docker images
   - Verified scores now appear (0.4-0.5 range)

3. **Quality Testing**
   - Selected a random article (ID 157, humility in science)
   - Tested 10 query types from specific to vague
   - Found 3/10 queries succeed (main topic only)

4. **Root Cause Analysis**
   - BM25 disabled - no keyword matching
   - Embeddings capture theme, not details
   - Target chunks have 0.52 similarity vs 0.61 for top results

5. **Next Steps**
   - Enable BM25 and retest
   - Consider HyDE for query expansion
   - Investigate chunking improvements

6. **Deep Dive: Database Statistics**
   - Total chunks: 250,048
   - Forum: 147,546 (58.9%)
   - Blog: 46,159 (18.5%)
   - Book: 34,586 (13.8%)
   - Text: 10,823 (4.3%)

7. **Chunk Size Analysis (MAJOR ISSUE)**
   Found excessively large chunks that dilute embedding quality:

   | Collection | Avg Length | Max Length | Over 8KB | Over 128KB |
   |------------|------------|------------|----------|------------|
   | book | 15,487 | 3.3MB | 12,452 | 474 |
   | blog | 3,661 | 710KB | 2,874 | 19 |
   | forum | 3,514 | 341KB | 8,943 | 47 |

   Books have 36% of chunks over 8KB - too large for good embedding quality.
   The Voyage embedding model has a 32K token limit, but chunks over 8KB (~2K tokens)
   start to lose fine-grained detail in the embedding.
8. **Detailed Score Analysis for "engineer fail-safe mechanisms humble design"**
   - Query returns 145,632 results from the forum collection
   - Top results score 0.61, median 0.34
   - Source 157 (target article) chunks score:
     - 3097f4d6: 0.5222 (rank 140/145,632) - main content
     - db87f54d: 0.4863 (rank 710/145,632) - full text chunk
     - f3e97d77: 0.4622 (rank 1,952/145,632)
     - 047e7a9a: 0.3528 (rank 58,949/145,632) - summary

   **Key Finding:** Target chunks rank 140th-710th, but with limit=10
   they never appear. BM25 would find the exact keyword match "engineer fail-safe".

9. **Top Results Analysis**
   The chunks scoring 0.61 (beating our target) are about:
   - A CloudFlare incident (software failure)
   - AI safety testing (risk/mitigation mechanisms)
   - Generic "mechanisms to prevent failure" content

   These are semantically similar to "engineer fail-safe mechanisms"
   but NOT about humility. Embeddings capture concept, not context.

10. **Root Cause Confirmed**
    The fundamental problem is:
    1. Embeddings capture the semantic meaning of query concepts
    2. The query "engineer fail-safe" embeds as "engineering safety mechanisms"
    3. Articles specifically about engineering/failure rank higher
    4. The article about humility (which merely mentions engineers as an example) ranks lower
    5. Only keyword search (BM25) can find "mentioned in passing" content

11. **Implemented Candidate Pool Multiplier**
    Added `CANDIDATE_MULTIPLIER = 5` to search.py:
    - Internal searches now fetch 5x the requested limit
    - Results from both methods are fused, then the top N returned
    - This helps surface results that rank well in one method but not both
12. **Added Stopword Filtering to FTS**
    Updated bm25.py to filter common English stopwords before building the tsquery:
    - Words like "what", "you", "not", "the" are filtered out
    - This makes AND matching less strict
    - Query "saying what you mean" becomes "saying:* & mean:*" instead of one term per word (see the example below)
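    Given the `build_tsquery` implementation added in this commit, the filtering behaves like this (illustrative doctest-style checks):

    ```python
    # Stopwords and words of <= 2 characters are dropped; the survivors get
    # AND-joined prefix matches.
    assert build_tsquery("saying what you mean") == "saying:* & mean:*"
    assert build_tsquery("the of an") == ""  # nothing left -> empty tsquery
    ```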
13. **Testing: "Taboo Your Words" Query**
    Query: "saying what you mean not using specific words"
    Target: Source 735 ("Taboo Your Words" article)

    Results:
    - Embedding search ranks the target at position 21 (score 0.606)
    - The top 10 results score 0.62-0.64 (about language/communication generally)
    - FTS doesn't match because the article lacks "saying" and "specific"
    - After fusion: the target ranks 23rd, the cutoff is 20th

    **Key Insight:** The query describes the *concept* ("not using specific words")
    but the article is about a *technique* ("taboo your words = replace with definitions").
    These are semantically adjacent but not equivalent.

    With the direct query "replacing words with their definitions" → ranks 1st!

14. **Testing: "Clowns Iconoclasts" Query**
    Query: "clowns being the real iconoclasts"
    Target: "Lonely Dissent" article

    Results: Found at rank 1 with score 0.815 (hybrid boost!)
    - Both embedding AND FTS match
    - The 0.15 hybrid bonus applied
    - This is an ideal case where keywords match content

15. **Remaining Challenges**
    - "Half-remembered" queries describing concepts vs actual content
    - Need query expansion (HyDE) to bridge the semantic gap
    - Or return more results for the user to scan
    - Consider showing "You might also be looking for..." suggestions
@@ -105,9 +105,11 @@ def filter_source_ids(modalities: set[str], filters: SearchFilters) -> list[int]
 @mcp.tool()
 async def search_knowledge_base(
     query: str,
-    filters: SearchFilters,
-    config: SearchConfig = SearchConfig(),
+    filters: SearchFilters = {},
     modalities: set[str] = set(),
+    limit: int = 20,
+    previews: bool = False,
+    use_scores: bool = False,
 ) -> list[dict]:
     """
     Search user's stored content including emails, documents, articles, books.
@@ -120,22 +122,22 @@ async def search_knowledge_base(
     Args:
         query: Natural language search query - be descriptive about what you're looking for
         modalities: Filter by type: email, blog, book, forum, photo, comic, webpage (empty = all)
-        filters: a dictionary with the following keys:
+        limit: Maximum number of results to return (default 20, max 100). Use higher limits for vague queries.
+        previews: Whether to include the actual content in the results (up to MAX_PREVIEW_LENGTH characters)
+        use_scores: Whether to score the results with an LLM before returning - better results but slower
+        filters: Optional dictionary with:
            - tags: a list of tags to filter by
            - source_ids: a list of source ids to filter by
            - min_size: the minimum size of the content to filter by
            - max_size: the maximum size of the content to filter by
            - min_created_at: the minimum created_at date to filter by
            - max_created_at: the maximum created_at date to filter by
-        config: a dictionary with the following keys:
-           - limit: the maximum number of results to return
-           - previews: whether to include the actual content in the results (up to MAX_PREVIEW_LENGTH characters)
-           - useScores: whether to score the results with a LLM before returning - this results in better results but is slower
 
     Returns: List of search results with id, score, chunks, content, filename
         Higher scores (>0.7) indicate strong matches.
     """
     logger.info(f"MCP search for: {query}")
+    config = SearchConfig(limit=min(limit, 100), previews=previews, useScores=use_scores)
 
     if not modalities:
         modalities = set(ALL_COLLECTIONS.keys())
@@ -247,7 +249,7 @@ async def search_observations(
     tags: list[str] | None = None,
     observation_types: list[str] | None = None,
     min_confidences: dict[str, float] = {},
-    config: SearchConfig = SearchConfig(),
+    limit: int = 20,
 ) -> list[dict]:
     """
     Search recorded observations about the user.
@@ -260,12 +262,13 @@ async def search_observations(
         tags: Filter by tags (must have at least one matching tag)
         observation_types: Filter by: belief, preference, behavior, contradiction, general
         min_confidences: Minimum confidence thresholds, e.g. {"observation_accuracy": 0.8}
-        config: SearchConfig
+        limit: Maximum number of results to return (default 20, max 100)
 
     Returns: List with content, tags, created_at, metadata
         Results sorted by relevance to your query.
     """
     logger.info("MCP: Searching observations for %s", query)
+    config = SearchConfig(limit=min(limit, 100))
     semantic_text = observation.generate_semantic_text(
         subject=subject or "",
         observation_type="".join(observation_types or []),
src/memory/api/search/bm25.py

@@ -1,32 +1,102 @@
 """
-Search endpoints for the knowledge base API.
+Full-text search using PostgreSQL's built-in text search capabilities.
+
+This replaces the previous in-memory BM25 implementation which caused OOM
+with large collections (250K+ chunks).
 """
 
 import asyncio
-from hashlib import sha256
 import logging
+import re
+
+from sqlalchemy import func, text
 
-import bm25s
-import Stemmer
 from memory.api.search.types import SearchFilters
 
 from memory.common import extract
 from memory.common.db.connection import make_session
 from memory.common.db.models import Chunk, ConfidenceScore, SourceItem
 
 logger = logging.getLogger(__name__)
 
+# Pattern to remove special characters that confuse tsquery
+_TSQUERY_SPECIAL_CHARS = re.compile(r"[&|!():*<>'\"-]")
+
+# Common English stopwords to filter from queries
+# These are words that appear in most documents and don't help with search relevance
+_STOPWORDS = frozenset([
+    "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for",
+    "of", "with", "by", "from", "as", "is", "was", "are", "were", "been",
+    "be", "have", "has", "had", "do", "does", "did", "will", "would", "could",
+    "should", "may", "might", "must", "shall", "can", "need", "dare", "ought",
+    "used", "it", "its", "this", "that", "these", "those", "i", "you", "he",
+    "she", "we", "they", "what", "which", "who", "whom", "whose", "where",
+    "when", "why", "how", "all", "each", "every", "both", "few", "more",
+    "most", "other", "some", "such", "no", "nor", "not", "only", "own",
+    "same", "so", "than", "too", "very", "just", "about", "into", "through",
+    "during", "before", "after", "above", "below", "between", "under", "again",
+    "further", "then", "once", "here", "there", "any", "being", "doing",
+])
+
+
+def build_tsquery(query: str) -> str:
+    """
+    Convert a natural language query to a PostgreSQL tsquery.
+
+    Uses AND matching for multi-word queries to ensure all terms appear.
+    Also adds prefix matching with :* for partial word matches.
+    Filters out common stopwords that don't help with search relevance.
+    """
+    # Remove special characters that confuse tsquery
+    clean_query = _TSQUERY_SPECIAL_CHARS.sub(" ", query)
+
+    # Split query into words, filter stopwords and short words
+    words = [
+        w.strip().lower()
+        for w in clean_query.split()
+        if w.strip() and len(w.strip()) > 2 and w.strip().lower() not in _STOPWORDS
+    ]
+    if not words:
+        return ""
+
+    # Join words with & for AND matching (all terms must appear)
+    # Add :* for prefix matching to catch word variants
+    tsquery_parts = [f"{word}:*" for word in words]
+    return " & ".join(tsquery_parts)
+
+
 async def search_bm25(
     query: str,
     modalities: set[str],
     limit: int = 10,
     filters: SearchFilters = SearchFilters(),
-) -> list[str]:
+) -> dict[str, float]:
     """
+    Search chunks using PostgreSQL full-text search.
+
+    Uses ts_rank for relevance scoring, normalized to 0-1 range.
+
+    Returns:
+    - Dictionary mapping chunk IDs to their normalized scores (0-1 range)
     """
+    tsquery = build_tsquery(query)
+    if not tsquery:
+        return {}
+
     with make_session() as db:
-        items_query = db.query(Chunk.id, Chunk.content).filter(
+        # Build the base query with full-text search
+        # ts_rank returns a relevance score based on term frequency
+        rank_expr = func.ts_rank(
+            Chunk.search_vector,
+            func.to_tsquery("english", tsquery),
+        )
+
+        items_query = db.query(
+            Chunk.id,
+            rank_expr.label("rank"),
+        ).filter(
             Chunk.collection_name.in_(modalities),
             Chunk.content.isnot(None),
+            Chunk.search_vector.isnot(None),
+            Chunk.search_vector.op("@@")(func.to_tsquery("english", tsquery)),
         )
 
         # Join with SourceItem if we need size filters
@@ -61,32 +131,33 @@ async def search_bm25(
                 & (ConfidenceScore.score >= min_score),
             )
 
+        # Order by rank descending and limit results
+        items_query = items_query.order_by(text("rank DESC")).limit(limit)
+
         items = items_query.all()
         if not items:
-            return []
+            return {}
 
-        item_ids = {
-            sha256(item.content.lower().strip().encode("utf-8")).hexdigest(): item.id
-            for item in items
-            if item.content
-        }
-        corpus = [item.content.lower().strip() for item in items]
-
-        stemmer = Stemmer.Stemmer("english")
-        corpus_tokens = bm25s.tokenize(corpus, stopwords="en", stemmer=stemmer)
-        retriever = bm25s.BM25()
-        retriever.index(corpus_tokens)
-
-        query_tokens = bm25s.tokenize(query, stemmer=stemmer)
-        results, scores = retriever.retrieve(
-            query_tokens, k=min(limit, len(corpus)), corpus=corpus
-        )
-
-        item_scores = {
-            item_ids[sha256(doc.encode("utf-8")).hexdigest()]: score
-            for doc, score in zip(results[0], scores[0])
-        }
-        return list(item_scores.keys())
+        # Collect raw scores
+        raw_scores = {str(item.id): float(item.rank) for item in items if item.rank > 0}
+
+        if not raw_scores:
+            return {}
+
+        # Normalize scores to 0-1 range using min-max normalization
+        # This makes them comparable to embedding cosine similarity scores
+        min_score = min(raw_scores.values())
+        max_score = max(raw_scores.values())
+        score_range = max_score - min_score
+
+        if score_range > 0:
+            return {
+                chunk_id: (score - min_score) / score_range
+                for chunk_id, score in raw_scores.items()
+            }
+        else:
+            # All scores are equal, return 0.5 for all
+            return {chunk_id: 0.5 for chunk_id in raw_scores}
 
 
 async def search_bm25_chunks(
@@ -94,8 +165,14 @@ async def search_bm25_chunks(
     modalities: set[str] = set(),
     limit: int = 10,
     filters: SearchFilters = SearchFilters(),
-    timeout: int = 2,
-) -> list[str]:
+    timeout: int = 10,
+) -> dict[str, float]:
+    """
+    Search chunks using PostgreSQL full-text search.
+
+    Returns:
+    - Dictionary mapping chunk IDs to their normalized scores (0-1 range)
+    """
     query = " ".join([c for chunk in data for c in chunk.data if isinstance(c, str)])
     return await asyncio.wait_for(
         search_bm25(query, modalities, limit, filters),
src/memory/api/search/embeddings.py

@@ -141,18 +141,20 @@ async def search_chunks(
     min_score: float = 0.3,
     filters: SearchFilters = {},
     multimodal: bool = False,
-) -> list[str]:
+) -> dict[str, float]:
     """
     Search across knowledge base using text query and optional files.
 
     Parameters:
     - data: List of data to search in (e.g., text, images, files)
     - previews: Whether to include previews in the search results
     - modalities: List of modalities to search in (e.g., "text", "photo", "doc")
     - limit: Maximum number of results
     - min_score: Minimum score to include in the search results
     - filters: Filters to apply to the search results
     - multimodal: Whether to search in multimodal collections
 
+    Returns:
+    - Dictionary mapping chunk IDs to their similarity scores
     """
     search_filters = []
     for key, val in filters.items():
@@ -170,10 +172,14 @@ async def search_chunks(
     )
     search_results = {k: results.get(k, []) for k in modalities}
 
-    found_chunks = {
-        str(r.id): r for results in search_results.values() for r in results
-    }
-    return list(found_chunks.keys())
+    # Return chunk IDs with their scores (take max score if chunk appears multiple times)
+    found_chunks: dict[str, float] = {}
+    for collection_results in search_results.values():
+        for r in collection_results:
+            chunk_id = str(r.id)
+            if chunk_id not in found_chunks or r.score > found_chunks[chunk_id]:
+                found_chunks[chunk_id] = r.score
+    return found_chunks
 
 
 async def search_chunks_embeddings(
@@ -182,11 +188,17 @@ async def search_chunks_embeddings(
     limit: int = 10,
     filters: SearchFilters = SearchFilters(),
     timeout: int = 2,
-) -> list[str]:
+) -> dict[str, float]:
+    """
+    Search chunks using embeddings across text and multimodal collections.
+
+    Returns:
+    - Dictionary mapping chunk IDs to their similarity scores
+    """
     # Note: Multimodal embeddings typically produce higher similarity scores,
     # so we use a higher threshold (0.4) to maintain selectivity.
     # Text embeddings produce lower scores, so we use 0.25.
-    all_ids = await asyncio.gather(
+    all_results = await asyncio.gather(
         asyncio.wait_for(
             search_chunks(
                 data,
@@ -210,4 +222,10 @@ async def search_chunks_embeddings(
             timeout,
         ),
     )
-    return list({id for ids in all_ids for id in ids})
+    # Merge scores, taking max if chunk appears in both
+    merged_scores: dict[str, float] = {}
+    for result_dict in all_results:
+        for chunk_id, score in result_dict.items():
+            if chunk_id not in merged_scores or score > merged_scores[chunk_id]:
+                merged_scores[chunk_id] = score
+    return merged_scores
src/memory/api/search/search.py

@@ -21,6 +21,59 @@ from memory.api.search.types import SearchConfig, SearchFilters, SearchResult
 
 logger = logging.getLogger(__name__)
 
+# Weight for embedding scores vs BM25 scores in hybrid fusion
+# Higher values favor semantic similarity over keyword matching
+EMBEDDING_WEIGHT = 0.7
+BM25_WEIGHT = 0.3
+
+# Bonus for results that appear in both embedding and BM25 search
+# This rewards documents that match both semantically and lexically
+HYBRID_BONUS = 0.15
+
+# Multiplier for internal search limit before fusion
+# We search for more candidates than requested, fuse scores, then return top N
+# This helps find results that rank well in one method but not the other
+CANDIDATE_MULTIPLIER = 5
+
+
+def fuse_scores(
+    embedding_scores: dict[str, float],
+    bm25_scores: dict[str, float],
+) -> dict[str, float]:
+    """
+    Fuse embedding and BM25 scores using weighted combination with hybrid bonus.
+
+    Documents appearing in both search results get a bonus, as matching both
+    semantic similarity AND keyword relevance is a strong signal.
+
+    Args:
+        embedding_scores: Dict mapping chunk IDs to embedding similarity scores (0-1)
+        bm25_scores: Dict mapping chunk IDs to normalized BM25 scores (0-1)
+
+    Returns:
+        Dict mapping chunk IDs to fused scores (0-1 range)
+    """
+    all_ids = set(embedding_scores.keys()) | set(bm25_scores.keys())
+    fused: dict[str, float] = {}
+
+    for chunk_id in all_ids:
+        emb_score = embedding_scores.get(chunk_id, 0.0)
+        bm25_score = bm25_scores.get(chunk_id, 0.0)
+
+        # Check if result appears in both methods
+        in_both = chunk_id in embedding_scores and chunk_id in bm25_scores
+
+        # Weighted combination
+        combined = (EMBEDDING_WEIGHT * emb_score) + (BM25_WEIGHT * bm25_score)
+
+        # Add bonus for appearing in both (strong relevance signal)
+        if in_both:
+            combined = min(1.0, combined + HYBRID_BONUS)
+
+        fused[chunk_id] = combined
+
+    return fused
+
+
 async def search_chunks(
     data: list[extract.DataChunk],
@@ -29,14 +82,40 @@ async def search_chunks(
     filters: SearchFilters = {},
     timeout: int = 2,
 ) -> list[Chunk]:
-    funcs = [search_chunks_embeddings]
-    if settings.ENABLE_BM25_SEARCH:
-        funcs.append(search_bm25_chunks)
+    """
+    Search chunks using embedding similarity and optionally BM25.
 
-    all_ids = await asyncio.gather(
-        *[func(data, modalities, limit, filters, timeout) for func in funcs]
-    )
-    all_ids = {id for ids in all_ids for id in ids}
+    Combines results using weighted score fusion, giving bonus to documents
+    that match both semantically and lexically.
+    """
+    # Search for more candidates than requested, fuse scores, then return top N
+    # This helps find results that rank well in one method but not the other
+    internal_limit = limit * CANDIDATE_MULTIPLIER
+
+    # Run embedding search
+    embedding_scores = await search_chunks_embeddings(
+        data, modalities, internal_limit, filters, timeout
+    )
+
+    # Run BM25 search if enabled
+    bm25_scores: dict[str, float] = {}
+    if settings.ENABLE_BM25_SEARCH:
+        try:
+            bm25_scores = await search_bm25_chunks(
+                data, modalities, internal_limit, filters, timeout
+            )
+        except asyncio.TimeoutError:
+            logger.warning("BM25 search timed out, using embedding results only")
+
+    # Fuse scores from both methods
+    fused_scores = fuse_scores(embedding_scores, bm25_scores)
+
+    if not fused_scores:
+        return []
+
+    # Sort by score and take top results
+    sorted_ids = sorted(fused_scores.keys(), key=lambda x: fused_scores[x], reverse=True)
+    top_ids = sorted_ids[:limit]
 
     with make_session() as db:
         chunks = (
@@ -49,9 +128,14 @@ async def search_chunks(
                 Chunk.file_paths,  # type: ignore
             )
         )
-        .filter(Chunk.id.in_(all_ids))
+        .filter(Chunk.id.in_(top_ids))
         .all()
     )
 
+    # Set relevance_score on each chunk from the fused scores
+    for chunk in chunks:
+        chunk.relevance_score = fused_scores.get(str(chunk.id), 0.0)
+
     db.expunge_all()
     return chunks
src/memory/api/search/types.py

@@ -41,9 +41,11 @@ class SearchResult(BaseModel):
         metadata.pop("content", None)
         chunk_size = settings.DEFAULT_CHUNK_TOKENS * 4
 
-        # Use mean of chunk scores to avoid bias towards documents with more chunks
+        # Use max chunk score - we want to find documents with at least one
+        # highly relevant section, not penalize long documents with some irrelevant parts.
+        # This is better for "half-remembered" searches where users recall one specific detail.
         search_score = (
-            sum(chunk.relevance_score for chunk in chunks) / len(chunks)
+            max((chunk.relevance_score for chunk in chunks), default=0)
             if chunks
             else 0
         )
@@ -76,7 +78,7 @@ class SearchFilters(TypedDict):
 
 
 class SearchConfig(BaseModel):
-    limit: int = 10
+    limit: int = 20
     timeout: int = 20
     previews: bool = False
     useScores: bool = False
@@ -24,6 +24,7 @@ from sqlalchemy import (
     func,
     UniqueConstraint,
 )
+from sqlalchemy.dialects.postgresql import TSVECTOR
 from sqlalchemy.dialects.postgresql import BYTEA
 from sqlalchemy.orm import Session, relationship
 from sqlalchemy.types import Numeric
@@ -155,6 +156,7 @@ class Chunk(Base):
     collection_name = Column(Text)
     created_at = Column(DateTime(timezone=True), server_default=func.now())
     checked_at = Column(DateTime(timezone=True), server_default=func.now())
+    search_vector = Column(TSVECTOR)  # Full-text search index
 
     vector: list[float] = []
     item_metadata: dict[str, Any] = {}