Mirror of https://github.com/mruwnik/memory.git (synced 2026-01-02 09:12:58 +01:00)
Improve RAG search quality with PostgreSQL FTS and hybrid scoring
Major changes:
- Replace OOM-causing in-memory BM25 with PostgreSQL full-text search
- Add tsvector column and GIN index for fast keyword search
- Implement hybrid score fusion (70% embedding + 30% FTS + 15% bonus)
- Add CANDIDATE_MULTIPLIER (5x) to search more candidates before fusion
- Add stopword filtering to FTS queries for less strict matching
- Make search limit configurable (default 20, max 100)
- Propagate relevance scores through the search pipeline

Search improvements:
- "clowns iconoclasts" → finds target at rank 1 (score 0.815)
- "replacing words with definitions" → finds target at rank 1
- Vague queries now find results with limit=30 that were previously missed

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Commit e414c3311c (parent f2161e09f3)
db/migrations/versions/20251220_130000_add_chunk_fulltext_search.py (new file)
@@ -0,0 +1,79 @@
"""Add full-text search to chunks

Revision ID: a1b2c3d4e5f6
Revises: 89861d5f1102
Create Date: 2025-12-20 13:00:00.000000

"""

from typing import Sequence, Union

from alembic import op


# revision identifiers, used by Alembic.
revision: str = "a1b2c3d4e5f6"
down_revision: Union[str, None] = "89861d5f1102"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    # Add tsvector column for full-text search
    op.execute(
        """
        ALTER TABLE chunk
        ADD COLUMN IF NOT EXISTS search_vector tsvector
        """
    )

    # Create GIN index for fast full-text search
    op.execute(
        """
        CREATE INDEX IF NOT EXISTS chunk_search_idx
        ON chunk USING GIN (search_vector)
        """
    )

    # Create function to generate search vector from content
    op.execute(
        """
        CREATE OR REPLACE FUNCTION chunk_search_vector_update()
        RETURNS trigger AS $$
        BEGIN
            IF NEW.content IS NOT NULL THEN
                NEW.search_vector := to_tsvector('english', NEW.content);
            END IF;
            RETURN NEW;
        END;
        $$ LANGUAGE plpgsql
        """
    )

    # Create trigger to auto-update search_vector on insert/update
    op.execute(
        """
        DROP TRIGGER IF EXISTS chunk_search_vector_trigger ON chunk;
        CREATE TRIGGER chunk_search_vector_trigger
        BEFORE INSERT OR UPDATE OF content ON chunk
        FOR EACH ROW
        EXECUTE FUNCTION chunk_search_vector_update()
        """
    )

    # Populate search_vector for existing rows. Note: this is a single UPDATE
    # over the whole table, so it may take a while on large tables.
    op.execute(
        """
        UPDATE chunk
        SET search_vector = to_tsvector('english', content)
        WHERE content IS NOT NULL AND search_vector IS NULL
        """
    )


def downgrade() -> None:
    op.execute("DROP TRIGGER IF EXISTS chunk_search_vector_trigger ON chunk")
    op.execute("DROP FUNCTION IF EXISTS chunk_search_vector_update()")
    op.execute("DROP INDEX IF EXISTS chunk_search_idx")
    op.execute("ALTER TABLE chunk DROP COLUMN IF EXISTS search_vector")
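
A quick way to sanity-check the migration is to run a ranked FTS query by hand. A minimal sketch, assuming the `make_session` factory used elsewhere in this commit:

from sqlalchemy import text

from memory.common.db.connection import make_session

with make_session() as db:
    rows = db.execute(
        text(
            """
            SELECT id, ts_rank(search_vector, query) AS rank
            FROM chunk, to_tsquery('english', 'engineer:* & fail:*') AS query
            WHERE search_vector @@ query
            ORDER BY rank DESC
            LIMIT 5
            """
        )
    ).fetchall()
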
docs/SEARCH_INVESTIGATION.md (new file, 437 lines)
@@ -0,0 +1,437 @@
# RAG Search Quality Investigation

## Summary

Investigation into why RAG search results "often aren't that good" when trying to find things with partial or vague memories.

**Date:** 2025-12-20
**Status:** Significant Progress Made

### Key Findings

1. **BM25 keyword search was broken** - Caused OOM with 250K chunks. ✅ FIXED: Replaced with PostgreSQL full-text search.
2. **Embeddings can't find "mentioned in passing" content** - The query "engineer fail-safe" ranks an article about humility (which mentions engineers as an example) at position 140 out of 145K. Articles specifically about engineering rank higher.
3. **Score propagation was broken** - ✅ FIXED: Scores now flow through properly.
4. **Chunk sizes are inconsistent** - Some chunks are 3MB (books), some are 3 bytes. Large chunks have diluted embeddings.
5. **"Half-remembered" queries don't match article keywords** - The user describes a concept, but the article uses different terminology, e.g. "not using specific words" vs "taboo your words".

### What Works Now

- **Keyword-matching queries**: "clowns iconoclasts" → finds "Lonely Dissent" at rank 1 (score 0.815)
- **Direct concept queries**: "replacing words with definitions" → finds "Taboo Your Words" at rank 1
- **Hybrid search**: Results appearing in both embedding + FTS get a 15% bonus

### Remaining Challenges

- **Conceptual queries**: "saying what you mean not using specific words" → target ranks 23rd (needs top 10)
  - The query describes the *effect*, the article describes the *technique*
  - Needs query expansion (HyDE) to bridge the semantic gap

### Recommended Fix Priority

1. **Implement PostgreSQL full-text search** - ✅ DONE
2. **Add candidate pool multiplier** - ✅ DONE (5x internal limit)
3. **Add stopword filtering** - ✅ DONE
4. **Re-chunk oversized content** - Max 512 tokens, with context
5. **Implement HyDE query expansion** - For vague/conceptual queries

---

## PostgreSQL Full-Text Search Implementation (2025-12-20)

### Changes Made

1. **Created migration** `db/migrations/versions/20251220_130000_add_chunk_fulltext_search.py`
   - Added `search_vector` tsvector column to the chunk table
   - Created a GIN index for fast search
   - Added a trigger to auto-update `search_vector` on insert/update
   - Populated the existing 250K chunks with search vectors

2. **Rewrote bm25.py** to use PostgreSQL full-text search
   - Removed the in-memory BM25 that caused OOM
   - Uses `ts_rank()` for relevance scoring
   - Uses AND matching with prefix wildcards: `engineer:* & fail:* & safe:*`
   - Normalizes scores to the 0-1 range

3. **Added the `search_vector` column** to the Chunk model in SQLAlchemy

### Test Results

For the query "engineer fail safe":
- PostgreSQL FTS returns 100 results without OOM
- Source 157 (the humility article) chunks rank **25th and 26th** (vs not appearing at all before)
- Search completes in ~100ms (vs an OOM crash before)

### Hybrid Search Flow

With keyword search now working, hybrid search combines:
- Embedding search (70% weight) - finds semantically similar content
- Full-text search (30% weight) - finds exact keyword matches
- A +15% bonus for results appearing in both

This should significantly improve "half-remembered" searches where users recall specific words that appear in the article.
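
To make the weighting concrete, here is the fusion arithmetic as a small sketch (the constants match `fuse_scores` in search.py; the example scores are invented):

```python
EMBEDDING_WEIGHT = 0.7
BM25_WEIGHT = 0.3
HYBRID_BONUS = 0.15

def fused(emb: float | None, fts: float | None) -> float:
    score = EMBEDDING_WEIGHT * (emb or 0.0) + BM25_WEIGHT * (fts or 0.0)
    if emb is not None and fts is not None:  # found by both methods
        score = min(1.0, score + HYBRID_BONUS)
    return score

fused(0.62, 0.90)  # 0.434 + 0.270 + 0.15 bonus = 0.854
fused(0.62, None)  # embedding-only: 0.434
```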

---

## Issues Fixed (This Session)

### 1. Scores Were Being Discarded (CRITICAL)

**Problem:** Both embedding and BM25 searches computed relevance scores but threw them away, returning only chunk IDs.

**Files Changed:**
- `src/memory/api/search/embeddings.py` - Now returns `dict[str, float]` (chunk_id -> score)
- `src/memory/api/search/bm25.py` - Now returns normalized scores (0-1 range)
- `src/memory/api/search/search.py` - Added `fuse_scores()` for hybrid ranking
- `src/memory/api/search/types.py` - Changed from mean to max chunk score

**Before:** All `search_score` values were 0.000
**After:** Meaningful scores like 0.443, 0.503, etc.

### 2. Score Fusion Implemented

Added weighted combination of embedding (70%) + BM25 (30%) scores with a 15% bonus for results appearing in both searches.

```python
EMBEDDING_WEIGHT = 0.7
BM25_WEIGHT = 0.3
HYBRID_BONUS = 0.15
```

### 3. Changed from Mean to Max Chunk Score

**Before:** Documents with many chunks were penalized (averaging diluted scores)
**After:** Uses max chunk score - finds documents with at least one highly relevant section
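
For example, with invented scores for a four-chunk document where only one chunk is relevant:

```python
chunk_scores = [0.82, 0.31, 0.12, 0.09]

mean_score = sum(chunk_scores) / len(chunk_scores)  # 0.335 - buries the hit
max_score = max(chunk_scores)                       # 0.82  - surfaces it
```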

---

## Current Issues Identified

### Issue 1: BM25 is Disabled AND Causes OOM

**Finding:** `ENABLE_BM25_SEARCH=False` in docker-compose.yaml

**Impact:** Keyword matching doesn't work. Queries like "engineer fail-safe" won't find articles containing those exact words unless the embedding similarity is high enough.

**When Enabled:** BM25 causes an OOM crash!
- The database has 250,048 chunks total
- The forum collection alone has 147,546 chunks
- The BM25 implementation loads ALL chunks into memory and builds the index on each query
- The container is killed (exit code 137) when attempting a BM25 search

**Root Cause:** The current BM25 implementation in `bm25.py` is not scalable:

```python
items = items_query.all()  # Loads ALL chunks into memory
corpus = [item.content.lower().strip() for item in items]  # Copies all content
retriever.index(corpus_tokens)  # Builds index from scratch each query
```

**Recommendation:**
1. Build a persistent BM25 index (store on disk, load once)
2. Or use PostgreSQL full-text search instead
3. Or limit BM25 to smaller collections only

### Issue 2: Embeddings Capture Theme, Not Details

**Test Case:** Article 157, about "humility in science", contains an example about engineers designing fail-safe mechanisms.

| Query | Result |
|-------|--------|
| "humility in science creationist evolution" | Rank 1, score 0.475 |
| "types of humility epistemic" | Rank 1, score 0.443 |
| "being humble about scientific knowledge" | Rank 1, score 0.483 |
| "engineer fail-safe mechanisms humble design" | Not in top 10 |
| "student double-checks math test answers" | Not in top 10 |
| "creationism debate" | Not in top 10 |

**Analysis:**
- The query "engineer fail-safe" has 0.52 cosine similarity to the target chunks
- Other documents in the corpus have 0.61+ similarity to that query
- The embedding captures the article's main theme (humility) but not incidental details (the engineer example)

**Root Cause:** Embeddings are designed to capture the semantic meaning of the whole chunk. Brief examples or mentions don't dominate the embedding.

### Issue 3: Chunk Context May Be Insufficient

**Finding:** The article's "engineer fail-safe" example appears in chunks, but:
- Some chunks are cut mid-word (e.g., "fail-s" instead of "fail-safe")
- The engineer example may lack surrounding context

**Chunk Analysis for Article 157:**
- 7 chunks total
- Chunks containing "engineer": 2 (chunks 2 and 6)
- Chunk 2 ends with "fail-s" (word cut off)
- The engineer example is brief (~2 sentences) within larger chunks about humility

---

## Embedding Similarity Analysis

For the query "engineer fail-safe mechanisms humble design":

| Chunk | Similarity | Content Preview |
|-------|------------|-----------------|
| 3097f4d6 | 0.522 | "It is widely recognized that good science requires..." |
| db87f54d | 0.486 | "It is widely recognized that good science requires..." |
| f3e97d77 | 0.462 | "You'd still double-check your calculations..." |
| 9153d1f5 | 0.435 | "They ought to be more humble..." |
| 3375ae64 | 0.424 | "Dennett suggests that much 'religious belief'..." |
| 047e7a9a | 0.353 | Summary chunk |
| 80ff7a03 | 0.267 | References chunk |

**Problem:** Top results in the forum collection score 0.61+, so these 0.52 scores don't make the cut.

---

## Recommendations

### High Priority

1. **Enable BM25 Search**
   - Set `ENABLE_BM25_SEARCH=True`
   - This will find keyword matches that embeddings miss
   - Score fusion to combine results is already implemented

2. **Lower Embedding Threshold for Text Collections**
   - Current: 0.25 minimum score
   - Consider: 0.20 to catch more marginal matches
   - Trade-off: may increase noise

3. **Increase Search Limit Before Fusion**
   - Current: uses the same `limit` for both embedding and BM25
   - Consider: search for 2-3x more candidates, then fuse and return the top N

### Medium Priority

4. **Implement Query Expansion / HyDE**
   - For vague queries, generate a hypothetical answer and embed that (see the sketch below)
   - Example: "engineer fail-safe" -> generate "An article discussing how engineers design fail-safe mechanisms as an example of good humility..."

5. **Improve Chunking Overlap**
   - Ensure examples carry context from surrounding paragraphs
   - Consider semantic chunking (split on topic changes, not just size)

6. **Add Document-Level Context to Chunks**
   - Prepend the document title/summary to each chunk before embedding
   - Helps chunks maintain a connection to the main theme

### Lower Priority

7. **Tune Fusion Weights**
   - Current: 70% embedding, 30% BM25
   - May need adjustment based on use case

8. **Add Temporal Decay**
   - Prefer recent content for certain query types

---

## Architectural Issues

### Issue A: BM25 Implementation is Not Scalable

The current BM25 implementation cannot handle 250K chunks:

```python
# Current approach (in bm25.py):
items = items_query.all()  # Loads ALL matching chunks into memory
corpus = [item.content.lower().strip() for item in items]  # Makes copies
retriever.index(corpus_tokens)  # Rebuilds index from scratch per query
```

**Why this fails:**
- 147K forum chunks × ~3KB avg = ~440MB just for the text
- Plus tokenization and BM25 index structures → OOM

**Solutions (in order of recommendation):**

1. **PostgreSQL Full-Text Search** (recommended)
   - PostgreSQL is already in the stack
   - Add a `tsvector` column to the Chunk table
   - Create a GIN index for fast search
   - Use `ts_rank` for relevance scoring
   - No additional infrastructure needed

2. **Persistent BM25 Index**
   - Build the index once at ingestion time
   - Store it on disk, load once at startup
   - Update incrementally on new chunks
   - More complex to maintain

3. **External Search Engine**
   - Elasticsearch or Meilisearch
   - Adds operational complexity
   - May be overkill at the current scale

### Issue B: Chunk Size Variance

Chunks range from 3 bytes to 3.3MB. This causes:
- Large chunks to have diluted embeddings
- Small chunks to lack context
- Inconsistent search quality across collections

**Solution:** Re-chunk existing content with the following constraints (sketched below):
- Max ~512 tokens per chunk (optimal for embeddings)
- 50-100 tokens of overlap between chunks
- The document title/context prepended to each chunk
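
A minimal sketch of such a re-chunker (illustrative only, not part of this commit; whitespace tokens stand in for real tokenizer output):

```python
def rechunk(title: str, tokens: list[str], max_tokens: int = 512, overlap: int = 75) -> list[str]:
    """Split a token list into overlapping chunks, each prefixed with the doc title."""
    # Reserve room for the prepended title so chunks stay under max_tokens.
    body_budget = max_tokens - len(title.split()) - 1
    step = body_budget - overlap  # positive as long as the title is short
    chunks = []
    for start in range(0, len(tokens), step):
        window = tokens[start : start + body_budget]
        chunks.append(title + "\n" + " ".join(window))
        if start + body_budget >= len(tokens):
            break
    return chunks
```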

### Issue C: Search Timeout (2 seconds)

The default 2-second timeout is too aggressive for:
- Large collections (147K forum chunks)
- A cold Qdrant cache
- Network latency

**Solution:** Increase to 5-10 seconds for the initial search, with a progressive-loading UX.

---

## Test Queries for Validation

After making changes, test with these queries against article 157:

```python
# Should find article 157 (humility in science)
test_cases = [
    # Main topic - currently working
    ("humility in science", "main topic"),
    ("types of humility epistemic", "topic area"),

    # Specific examples - currently failing
    ("engineer fail-safe mechanisms", "specific example"),
    ("student double-checks math test", "specific example"),

    # Tangential mentions - currently failing
    ("creationism debate", "mentioned topic"),

    # Vague/half-remembered - currently failing
    ("checking your work", "vague concept"),
    ("when engineers make mistakes", "tangential"),
]
```
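
A possible harness around these cases (a sketch, assuming `search_knowledge_base` can be called directly despite the `@mcp.tool()` decorator and that each result dict's `id` is the source id; adjust to the actual return shape):

```python
import asyncio

async def run_validation(target_id: int = 157) -> None:
    for query, kind in test_cases:
        results = await search_knowledge_base(query=query, limit=30)
        rank = next(
            (i for i, r in enumerate(results, start=1) if r.get("id") == target_id),
            None,
        )
        print(f"{kind:20} {query!r}: rank {rank}")

asyncio.run(run_validation())
```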

---

## Session Log

### 2025-12-20

1. **Initial Investigation**
   - Found that scores were all 0.000
   - Traced it to embeddings.py and bm25.py discarding scores

2. **Fixed Score Propagation**
   - Modified 4 files to preserve and fuse scores
   - Rebuilt the Docker images
   - Verified scores now appear (0.4-0.5 range)

3. **Quality Testing**
   - Selected a random article (ID 157, humility in science)
   - Tested 10 query types from specific to vague
   - Found 3/10 queries succeed (main topic only)

4. **Root Cause Analysis**
   - BM25 disabled - no keyword matching
   - Embeddings capture theme, not details
   - Target chunks have 0.52 similarity vs 0.61 for top results

5. **Next Steps**
   - Enable BM25 and retest
   - Consider HyDE for query expansion
   - Investigate chunking improvements

6. **Deep Dive: Database Statistics**
   - Total chunks: 250,048
   - Forum: 147,546 (58.9%)
   - Blog: 46,159 (18.5%)
   - Book: 34,586 (13.8%)
   - Text: 10,823 (4.3%)

7. **Chunk Size Analysis (MAJOR ISSUE)**
   Found excessively large chunks that dilute embedding quality:

   | Collection | Avg Length | Max Length | Over 8KB | Over 128KB |
   |------------|------------|------------|----------|------------|
   | book | 15,487 | 3.3MB | 12,452 | 474 |
   | blog | 3,661 | 710KB | 2,874 | 19 |
   | forum | 3,514 | 341KB | 8,943 | 47 |

   Books have 36% of their chunks over 8KB - too large for good embedding quality.
   The Voyage embedding model has a 32K-token limit, but chunks over 8KB (~2K tokens)
   start to lose fine-grained detail in the embedding.

8. **Detailed Score Analysis for "engineer fail-safe mechanisms humble design"**
   - The query returns 145,632 results from the forum collection
   - Top results score 0.61, median 0.34
   - Source 157 (target article) chunks score:
     - 3097f4d6: 0.5222 (rank 140/145,632) - main content
     - db87f54d: 0.4863 (rank 710/145,632) - full text chunk
     - f3e97d77: 0.4622 (rank 1,952/145,632)
     - 047e7a9a: 0.3528 (rank 58,949/145,632) - summary

   **Key Finding:** Target chunks rank 140th-710th, but with limit=10
   they never appear. BM25 would find the exact keyword match "engineer fail-safe".

9. **Top Results Analysis**
   The chunks scoring 0.61 (beating our target) are about:
   - A CloudFlare incident (software failure)
   - AI safety testing (risk/mitigation mechanisms)
   - Generic "mechanisms to prevent failure" content

   These are semantically similar to "engineer fail-safe mechanisms"
   but NOT about humility. Embeddings capture the concept, not the context.

10. **Root Cause Confirmed**
    The fundamental problem is:
    1. Embeddings capture the semantic meaning of query concepts
    2. The query "engineer fail-safe" embeds as "engineering safety mechanisms"
    3. Articles specifically about engineering/failure rank higher
    4. The article about humility (which merely mentions engineers as an example) ranks lower
    5. Only keyword search (BM25) can find "mentioned in passing" content

11. **Implemented Candidate Pool Multiplier**
    Added `CANDIDATE_MULTIPLIER = 5` to search.py:
    - Internal searches now fetch 5x the requested limit
    - Results from both methods are fused, then the top N returned
    - This helps surface results that rank well in one method but not both

12. **Added Stopword Filtering to FTS**
    Updated bm25.py to filter common English stopwords before building the tsquery:
    - Words like "what", "you", "not", "the" are filtered out
    - This makes AND matching less strict
    - The query "saying what you mean" becomes "saying:* & mean:*" instead of 8 terms

13. **Testing: "Taboo Your Words" Query**
    Query: "saying what you mean not using specific words"
    Target: Source 735 (the "Taboo Your Words" article)

    Results:
    - Embedding search ranks the target at position 21 (score 0.606)
    - The top 10 results score 0.62-0.64 (about language/communication generally)
    - FTS doesn't match because the article lacks "saying" and "specific"
    - After fusion: the target ranks 23rd; the cutoff is 20th

    **Key Insight:** The query describes the *concept* ("not using specific words")
    but the article is about a *technique* ("taboo your words = replace with definitions").
    These are semantically adjacent but not equivalent.

    With the direct query "replacing words with their definitions" → ranks 1st!

14. **Testing: "Clowns Iconoclasts" Query**
    Query: "clowns being the real iconoclasts"
    Target: the "Lonely Dissent" article

    Results: Found at rank 1 with score 0.815 (hybrid boost!)
    - Both embedding AND FTS match
    - The 0.15 hybrid bonus is applied
    - This is an ideal case where the keywords match the content

15. **Remaining Challenges**
    - "Half-remembered" queries describing concepts vs the actual content
    - Need query expansion (HyDE) to bridge the semantic gap
    - Or return more results for the user to scan
    - Consider showing "You might also be looking for..." suggestions

@@ -105,9 +105,11 @@ def filter_source_ids(modalities: set[str], filters: SearchFilters) -> list[int]
 @mcp.tool()
 async def search_knowledge_base(
     query: str,
-    filters: SearchFilters,
-    config: SearchConfig = SearchConfig(),
+    filters: SearchFilters = {},
     modalities: set[str] = set(),
+    limit: int = 20,
+    previews: bool = False,
+    use_scores: bool = False,
 ) -> list[dict]:
     """
     Search user's stored content including emails, documents, articles, books.
@@ -120,22 +122,22 @@ async def search_knowledge_base(
     Args:
         query: Natural language search query - be descriptive about what you're looking for
         modalities: Filter by type: email, blog, book, forum, photo, comic, webpage (empty = all)
-        filters: a dictionary with the following keys:
+        limit: Maximum number of results to return (default 20, max 100). Use higher limits for vague queries.
+        previews: Whether to include the actual content in the results (up to MAX_PREVIEW_LENGTH characters)
+        use_scores: Whether to score the results with an LLM before returning - better results but slower
+        filters: Optional dictionary with:
             - tags: a list of tags to filter by
             - source_ids: a list of source ids to filter by
             - min_size: the minimum size of the content to filter by
             - max_size: the maximum size of the content to filter by
             - min_created_at: the minimum created_at date to filter by
             - max_created_at: the maximum created_at date to filter by
-        config: a dictionary with the following keys:
-            - limit: the maximum number of results to return
-            - previews: whether to include the actual content in the results (up to MAX_PREVIEW_LENGTH characters)
-            - useScores: whether to score the results with a LLM before returning - this results in better results but is slower

     Returns: List of search results with id, score, chunks, content, filename
         Higher scores (>0.7) indicate strong matches.
     """
     logger.info(f"MCP search for: {query}")
+    config = SearchConfig(limit=min(limit, 100), previews=previews, useScores=use_scores)
+
     if not modalities:
         modalities = set(ALL_COLLECTIONS.keys())
@@ -247,7 +249,7 @@ async def search_observations(
     tags: list[str] | None = None,
     observation_types: list[str] | None = None,
     min_confidences: dict[str, float] = {},
-    config: SearchConfig = SearchConfig(),
+    limit: int = 20,
 ) -> list[dict]:
     """
     Search recorded observations about the user.
@@ -260,12 +262,13 @@ async def search_observations(
         tags: Filter by tags (must have at least one matching tag)
         observation_types: Filter by: belief, preference, behavior, contradiction, general
         min_confidences: Minimum confidence thresholds, e.g. {"observation_accuracy": 0.8}
-        config: SearchConfig
+        limit: Maximum number of results to return (default 20, max 100)

     Returns: List with content, tags, created_at, metadata
         Results sorted by relevance to your query.
     """
     logger.info("MCP: Searching observations for %s", query)
+    config = SearchConfig(limit=min(limit, 100))
     semantic_text = observation.generate_semantic_text(
         subject=subject or "",
         observation_type="".join(observation_types or []),
--- a/src/memory/api/search/bm25.py
+++ b/src/memory/api/search/bm25.py
@@ -1,32 +1,102 @@
 """
-Search endpoints for the knowledge base API.
+Full-text search using PostgreSQL's built-in text search capabilities.
+
+This replaces the previous in-memory BM25 implementation which caused OOM
+with large collections (250K+ chunks).
 """

 import asyncio
-from hashlib import sha256
 import logging
+import re
+
+from sqlalchemy import func, text

-import bm25s
-import Stemmer
 from memory.api.search.types import SearchFilters
 from memory.common import extract
 from memory.common.db.connection import make_session
 from memory.common.db.models import Chunk, ConfidenceScore, SourceItem

 logger = logging.getLogger(__name__)

+# Pattern to remove special characters that confuse tsquery
+_TSQUERY_SPECIAL_CHARS = re.compile(r"[&|!():*<>'\"-]")
+
+# Common English stopwords to filter from queries
+# These are words that appear in most documents and don't help with search relevance
+_STOPWORDS = frozenset([
+    "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for",
+    "of", "with", "by", "from", "as", "is", "was", "are", "were", "been",
+    "be", "have", "has", "had", "do", "does", "did", "will", "would", "could",
+    "should", "may", "might", "must", "shall", "can", "need", "dare", "ought",
+    "used", "it", "its", "this", "that", "these", "those", "i", "you", "he",
+    "she", "we", "they", "what", "which", "who", "whom", "whose", "where",
+    "when", "why", "how", "all", "each", "every", "both", "few", "more",
+    "most", "other", "some", "such", "no", "nor", "not", "only", "own",
+    "same", "so", "than", "too", "very", "just", "about", "into", "through",
+    "during", "before", "after", "above", "below", "between", "under", "again",
+    "further", "then", "once", "here", "there", "any", "being", "doing",
+])
+
+
+def build_tsquery(query: str) -> str:
+    """
+    Convert a natural language query to a PostgreSQL tsquery.
+
+    Uses AND matching for multi-word queries to ensure all terms appear.
+    Also adds prefix matching with :* for partial word matches.
+    Filters out common stopwords that don't help with search relevance.
+    """
+    # Remove special characters that confuse tsquery
+    clean_query = _TSQUERY_SPECIAL_CHARS.sub(" ", query)
+
+    # Split query into words, filter stopwords and short words
+    words = [
+        w.strip().lower()
+        for w in clean_query.split()
+        if w.strip() and len(w.strip()) > 2 and w.strip().lower() not in _STOPWORDS
+    ]
+    if not words:
+        return ""
+
+    # Join words with & for AND matching (all terms must appear)
+    # Add :* for prefix matching to catch word variants
+    tsquery_parts = [f"{word}:*" for word in words]
+    return " & ".join(tsquery_parts)
+
+
 async def search_bm25(
     query: str,
     modalities: set[str],
     limit: int = 10,
     filters: SearchFilters = SearchFilters(),
-) -> list[str]:
+) -> dict[str, float]:
+    """
+    Search chunks using PostgreSQL full-text search.
+
+    Uses ts_rank for relevance scoring, normalized to 0-1 range.
+
+    Returns:
+        Dictionary mapping chunk IDs to their normalized scores (0-1 range)
+    """
+    tsquery = build_tsquery(query)
+    if not tsquery:
+        return {}
+
     with make_session() as db:
-        items_query = db.query(Chunk.id, Chunk.content).filter(
+        # Build the base query with full-text search
+        # ts_rank returns a relevance score based on term frequency
+        rank_expr = func.ts_rank(
+            Chunk.search_vector,
+            func.to_tsquery("english", tsquery),
+        )
+
+        items_query = db.query(
+            Chunk.id,
+            rank_expr.label("rank"),
+        ).filter(
             Chunk.collection_name.in_(modalities),
-            Chunk.content.isnot(None),
+            Chunk.search_vector.isnot(None),
+            Chunk.search_vector.op("@@")(func.to_tsquery("english", tsquery)),
         )

         # Join with SourceItem if we need size filters
@@ -61,32 +131,33 @@ async def search_bm25(
             & (ConfidenceScore.score >= min_score),
         )

+        # Order by rank descending and limit results
+        items_query = items_query.order_by(text("rank DESC")).limit(limit)
+
         items = items_query.all()
         if not items:
-            return []
+            return {}

-        item_ids = {
-            sha256(item.content.lower().strip().encode("utf-8")).hexdigest(): item.id
-            for item in items
-            if item.content
-        }
-        corpus = [item.content.lower().strip() for item in items]
-
-        stemmer = Stemmer.Stemmer("english")
-        corpus_tokens = bm25s.tokenize(corpus, stopwords="en", stemmer=stemmer)
-        retriever = bm25s.BM25()
-        retriever.index(corpus_tokens)
-
-        query_tokens = bm25s.tokenize(query, stemmer=stemmer)
-        results, scores = retriever.retrieve(
-            query_tokens, k=min(limit, len(corpus)), corpus=corpus
-        )
-
-        item_scores = {
-            item_ids[sha256(doc.encode("utf-8")).hexdigest()]: score
-            for doc, score in zip(results[0], scores[0])
-        }
-        return list(item_scores.keys())
+        # Collect raw scores
+        raw_scores = {str(item.id): float(item.rank) for item in items if item.rank > 0}
+
+        if not raw_scores:
+            return {}
+
+        # Normalize scores to 0-1 range using min-max normalization
+        # This makes them comparable to embedding cosine similarity scores
+        min_score = min(raw_scores.values())
+        max_score = max(raw_scores.values())
+        score_range = max_score - min_score
+
+        if score_range > 0:
+            return {
+                chunk_id: (score - min_score) / score_range
+                for chunk_id, score in raw_scores.items()
+            }
+        else:
+            # All scores are equal, return 0.5 for all
+            return {chunk_id: 0.5 for chunk_id in raw_scores}


 async def search_bm25_chunks(
@@ -94,8 +165,14 @@ async def search_bm25_chunks(
     modalities: set[str] = set(),
     limit: int = 10,
     filters: SearchFilters = SearchFilters(),
-    timeout: int = 2,
-) -> list[str]:
+    timeout: int = 10,
+) -> dict[str, float]:
+    """
+    Search chunks using PostgreSQL full-text search.
+
+    Returns:
+        Dictionary mapping chunk IDs to their normalized scores (0-1 range)
+    """
     query = " ".join([c for chunk in data for c in chunk.data if isinstance(c, str)])
     return await asyncio.wait_for(
         search_bm25(query, modalities, limit, filters),
--- a/src/memory/api/search/embeddings.py
+++ b/src/memory/api/search/embeddings.py
@@ -141,18 +141,20 @@ async def search_chunks(
     min_score: float = 0.3,
     filters: SearchFilters = {},
     multimodal: bool = False,
-) -> list[str]:
+) -> dict[str, float]:
     """
     Search across knowledge base using text query and optional files.

     Parameters:
     - data: List of data to search in (e.g., text, images, files)
-    - previews: Whether to include previews in the search results
     - modalities: List of modalities to search in (e.g., "text", "photo", "doc")
     - limit: Maximum number of results
     - min_score: Minimum score to include in the search results
     - filters: Filters to apply to the search results
     - multimodal: Whether to search in multimodal collections
+
+    Returns:
+    - Dictionary mapping chunk IDs to their similarity scores
     """
     search_filters = []
     for key, val in filters.items():
@@ -170,10 +172,14 @@ async def search_chunks(
     )
     search_results = {k: results.get(k, []) for k in modalities}

-    found_chunks = {
-        str(r.id): r for results in search_results.values() for r in results
-    }
-    return list(found_chunks.keys())
+    # Return chunk IDs with their scores (take max score if chunk appears multiple times)
+    found_chunks: dict[str, float] = {}
+    for collection_results in search_results.values():
+        for r in collection_results:
+            chunk_id = str(r.id)
+            if chunk_id not in found_chunks or r.score > found_chunks[chunk_id]:
+                found_chunks[chunk_id] = r.score
+    return found_chunks


 async def search_chunks_embeddings(
@@ -182,11 +188,17 @@ async def search_chunks_embeddings(
     limit: int = 10,
     filters: SearchFilters = SearchFilters(),
     timeout: int = 2,
-) -> list[str]:
+) -> dict[str, float]:
+    """
+    Search chunks using embeddings across text and multimodal collections.
+
+    Returns:
+    - Dictionary mapping chunk IDs to their similarity scores
+    """
     # Note: Multimodal embeddings typically produce higher similarity scores,
     # so we use a higher threshold (0.4) to maintain selectivity.
     # Text embeddings produce lower scores, so we use 0.25.
-    all_ids = await asyncio.gather(
+    all_results = await asyncio.gather(
         asyncio.wait_for(
             search_chunks(
                 data,
@@ -210,4 +222,10 @@ async def search_chunks_embeddings(
             timeout,
         ),
     )
-    return list({id for ids in all_ids for id in ids})
+    # Merge scores, taking max if chunk appears in both
+    merged_scores: dict[str, float] = {}
+    for result_dict in all_results:
+        for chunk_id, score in result_dict.items():
+            if chunk_id not in merged_scores or score > merged_scores[chunk_id]:
+                merged_scores[chunk_id] = score
+    return merged_scores
--- a/src/memory/api/search/search.py
+++ b/src/memory/api/search/search.py
@@ -21,6 +21,59 @@ from memory.api.search.types import SearchConfig, SearchFilters, SearchResult

 logger = logging.getLogger(__name__)

+# Weight for embedding scores vs BM25 scores in hybrid fusion
+# Higher values favor semantic similarity over keyword matching
+EMBEDDING_WEIGHT = 0.7
+BM25_WEIGHT = 0.3
+
+# Bonus for results that appear in both embedding and BM25 search
+# This rewards documents that match both semantically and lexically
+HYBRID_BONUS = 0.15
+
+# Multiplier for internal search limit before fusion
+# We search for more candidates than requested, fuse scores, then return top N
+# This helps find results that rank well in one method but not the other
+CANDIDATE_MULTIPLIER = 5
+
+
+def fuse_scores(
+    embedding_scores: dict[str, float],
+    bm25_scores: dict[str, float],
+) -> dict[str, float]:
+    """
+    Fuse embedding and BM25 scores using weighted combination with hybrid bonus.
+
+    Documents appearing in both search results get a bonus, as matching both
+    semantic similarity AND keyword relevance is a strong signal.
+
+    Args:
+        embedding_scores: Dict mapping chunk IDs to embedding similarity scores (0-1)
+        bm25_scores: Dict mapping chunk IDs to normalized BM25 scores (0-1)
+
+    Returns:
+        Dict mapping chunk IDs to fused scores (0-1 range)
+    """
+    all_ids = set(embedding_scores.keys()) | set(bm25_scores.keys())
+    fused: dict[str, float] = {}
+
+    for chunk_id in all_ids:
+        emb_score = embedding_scores.get(chunk_id, 0.0)
+        bm25_score = bm25_scores.get(chunk_id, 0.0)
+
+        # Check if result appears in both methods
+        in_both = chunk_id in embedding_scores and chunk_id in bm25_scores
+
+        # Weighted combination
+        combined = (EMBEDDING_WEIGHT * emb_score) + (BM25_WEIGHT * bm25_score)
+
+        # Add bonus for appearing in both (strong relevance signal)
+        if in_both:
+            combined = min(1.0, combined + HYBRID_BONUS)
+
+        fused[chunk_id] = combined
+
+    return fused
+
+
 async def search_chunks(
     data: list[extract.DataChunk],
@@ -29,14 +82,40 @@ async def search_chunks(
     filters: SearchFilters = {},
     timeout: int = 2,
 ) -> list[Chunk]:
-    funcs = [search_chunks_embeddings]
-    if settings.ENABLE_BM25_SEARCH:
-        funcs.append(search_bm25_chunks)
-
-    all_ids = await asyncio.gather(
-        *[func(data, modalities, limit, filters, timeout) for func in funcs]
-    )
-    all_ids = {id for ids in all_ids for id in ids}
+    """
+    Search chunks using embedding similarity and optionally BM25.
+
+    Combines results using weighted score fusion, giving bonus to documents
+    that match both semantically and lexically.
+    """
+    # Search for more candidates than requested, fuse scores, then return top N
+    # This helps find results that rank well in one method but not the other
+    internal_limit = limit * CANDIDATE_MULTIPLIER
+
+    # Run embedding search
+    embedding_scores = await search_chunks_embeddings(
+        data, modalities, internal_limit, filters, timeout
+    )
+
+    # Run BM25 search if enabled
+    bm25_scores: dict[str, float] = {}
+    if settings.ENABLE_BM25_SEARCH:
+        try:
+            bm25_scores = await search_bm25_chunks(
+                data, modalities, internal_limit, filters, timeout
+            )
+        except asyncio.TimeoutError:
+            logger.warning("BM25 search timed out, using embedding results only")
+
+    # Fuse scores from both methods
+    fused_scores = fuse_scores(embedding_scores, bm25_scores)
+
+    if not fused_scores:
+        return []
+
+    # Sort by score and take top results
+    sorted_ids = sorted(fused_scores.keys(), key=lambda x: fused_scores[x], reverse=True)
+    top_ids = sorted_ids[:limit]

     with make_session() as db:
         chunks = (
@@ -49,9 +128,14 @@ async def search_chunks(
                 Chunk.file_paths,  # type: ignore
             )
             )
-            .filter(Chunk.id.in_(all_ids))
+            .filter(Chunk.id.in_(top_ids))
             .all()
         )
+
+        # Set relevance_score on each chunk from the fused scores
+        for chunk in chunks:
+            chunk.relevance_score = fused_scores.get(str(chunk.id), 0.0)
+
         db.expunge_all()
         return chunks
--- a/src/memory/api/search/types.py
+++ b/src/memory/api/search/types.py
@@ -41,9 +41,11 @@ class SearchResult(BaseModel):
         metadata.pop("content", None)
         chunk_size = settings.DEFAULT_CHUNK_TOKENS * 4

-        # Use mean of chunk scores to avoid bias towards documents with more chunks
+        # Use max chunk score - we want to find documents with at least one
+        # highly relevant section, not penalize long documents with some irrelevant parts.
+        # This is better for "half-remembered" searches where users recall one specific detail.
         search_score = (
-            sum(chunk.relevance_score for chunk in chunks) / len(chunks)
+            max((chunk.relevance_score for chunk in chunks), default=0)
             if chunks
             else 0
         )
@@ -76,7 +78,7 @@ class SearchFilters(TypedDict):


 class SearchConfig(BaseModel):
-    limit: int = 10
+    limit: int = 20
     timeout: int = 20
     previews: bool = False
     useScores: bool = False
@@ -24,6 +24,7 @@ from sqlalchemy import (
     func,
     UniqueConstraint,
 )
+from sqlalchemy.dialects.postgresql import TSVECTOR
 from sqlalchemy.dialects.postgresql import BYTEA
 from sqlalchemy.orm import Session, relationship
 from sqlalchemy.types import Numeric
@@ -155,6 +156,7 @@ class Chunk(Base):
     collection_name = Column(Text)
     created_at = Column(DateTime(timezone=True), server_default=func.now())
     checked_at = Column(DateTime(timezone=True), server_default=func.now())
+    search_vector = Column(TSVECTOR)  # Full-text search index

     vector: list[float] = []
     item_metadata: dict[str, Any] = {}