diff --git a/src/memory/api/search/search.py b/src/memory/api/search/search.py
index ba7f796..32b63dc 100644
--- a/src/memory/api/search/search.py
+++ b/src/memory/api/search/search.py
@@ -20,60 +20,204 @@ if settings.ENABLE_BM25_SEARCH:
 
 if settings.ENABLE_HYDE_EXPANSION:
     from memory.api.search.hyde import expand_query_hyde
 
+if settings.ENABLE_RERANKING:
+    from memory.api.search.rerank import rerank_chunks
+
 from memory.api.search.types import SearchConfig, SearchFilters, SearchResult
 
 logger = logging.getLogger(__name__)
 
-# Weight for embedding scores vs BM25 scores in hybrid fusion
-# Higher values favor semantic similarity over keyword matching
-EMBEDDING_WEIGHT = 0.7
-BM25_WEIGHT = 0.3
-
-# Bonus for results that appear in both embedding and BM25 search
-# This rewards documents that match both semantically and lexically
-HYBRID_BONUS = 0.15
+# Reciprocal Rank Fusion constant (k parameter)
+# Higher values reduce the influence of top-ranked documents
+# 60 is the standard value from the original RRF paper
+RRF_K = 60
 
 # Multiplier for internal search limit before fusion
 # We search for more candidates than requested, fuse scores, then return top N
 # This helps find results that rank well in one method but not the other
 CANDIDATE_MULTIPLIER = 5
 
+# How many candidates to pass to reranker (multiplier of final limit)
+# Higher = more accurate but slower and more expensive
+RERANK_CANDIDATE_MULTIPLIER = 3
 
-def fuse_scores(
+# Bonus for chunks containing query terms (added to RRF score)
+QUERY_TERM_BOOST = 0.005
+
+# Bonus when query terms match the source title (stronger signal)
+TITLE_MATCH_BOOST = 0.01
+
+# Bonus multiplier for popularity (applied as: score * (1 + POPULARITY_BOOST * (popularity - 1)))
+# This gives a small boost to popular items without dominating relevance
+POPULARITY_BOOST = 0.02
+
+# Common words to ignore when checking for query term presence
+STOPWORDS = {
+    "a", "an", "the", "is", "are", "was", "were", "be", "been", "being",
+    "have", "has", "had", "do", "does", "did", "will", "would", "could",
+    "should", "may", "might", "must", "shall", "can", "need", "dare",
+    "ought", "used", "to", "of", "in", "for", "on", "with", "at", "by",
+    "from", "as", "into", "through", "during", "before", "after", "above",
+    "below", "between", "under", "again", "further", "then", "once", "here",
+    "there", "when", "where", "why", "how", "all", "each", "few", "more",
+    "most", "other", "some", "such", "no", "nor", "not", "only", "own",
+    "same", "so", "than", "too", "very", "just", "and", "but", "if", "or",
+    "because", "until", "while", "although", "though",
+    "what", "which", "who", "whom", "this", "that", "these", "those", "i",
+    "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your",
+    "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she",
+    "her", "hers", "herself", "it", "its", "itself", "they", "them", "their",
+    "theirs", "themselves", "about", "get", "got", "getting", "like", "also",
+}
+
+
+def extract_query_terms(query: str) -> set[str]:
+    """Extract meaningful terms from query, filtering stopwords."""
+    words = query.lower().split()
+    return {w for w in words if w not in STOPWORDS and len(w) > 2}
+
+
+def apply_query_term_boost(
+    chunks: list[Chunk],
+    query_terms: set[str],
+) -> None:
+    """
+    Boost chunk scores when query terms appear in content.
+
+    This helps surface chunks that contain exact query words even if
+    embedding similarity is lower.
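+
+    For example, with QUERY_TERM_BOOST = 0.005, a chunk matching 2 of 3
+    query terms gets 0.005 * (2 / 3) ≈ 0.0033 added to its relevance score.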
+ """ + if not query_terms: + return + + for chunk in chunks: + content = (chunk.content or "").lower() + matches = sum(1 for term in query_terms if term in content) + if matches > 0: + # Boost proportional to fraction of query terms matched + boost = QUERY_TERM_BOOST * (matches / len(query_terms)) + chunk.relevance_score = (chunk.relevance_score or 0) + boost + + +def deduplicate_by_source(chunks: list[Chunk]) -> list[Chunk]: + """ + Keep only the highest-scoring chunk per source. + + This prevents multiple chunks from the same article from crowding out + other potentially relevant sources. + """ + best_by_source: dict[int, Chunk] = {} + for chunk in chunks: + source_id = chunk.source_id + if source_id not in best_by_source: + best_by_source[source_id] = chunk + elif (chunk.relevance_score or 0) > (best_by_source[source_id].relevance_score or 0): + best_by_source[source_id] = chunk + return list(best_by_source.values()) + + +def apply_title_boost( + chunks: list[Chunk], + query_terms: set[str], +) -> None: + """ + Boost chunks when query terms match the source title. + + Title matches are a strong signal since titles summarize content. + """ + if not query_terms or not chunks: + return + + # Get unique source IDs + source_ids = list({chunk.source_id for chunk in chunks}) + + # Fetch full source items (polymorphic) to access title attribute + with make_session() as db: + sources = db.query(SourceItem).filter( + SourceItem.id.in_(source_ids) + ).all() + titles = {s.id: (getattr(s, 'title', None) or "").lower() for s in sources} + + # Apply boost to chunks whose source title matches query terms + for chunk in chunks: + title = titles.get(chunk.source_id, "") + if not title: + continue + + matches = sum(1 for term in query_terms if term in title) + if matches > 0: + boost = TITLE_MATCH_BOOST * (matches / len(query_terms)) + chunk.relevance_score = (chunk.relevance_score or 0) + boost + + +def apply_popularity_boost(chunks: list[Chunk]) -> None: + """ + Boost chunks based on source popularity. + + Uses the popularity property from SourceItem subclasses. + ForumPost uses karma, others default to 1.0. + """ + if not chunks: + return + + source_ids = list({chunk.source_id for chunk in chunks}) + + with make_session() as db: + sources = db.query(SourceItem).filter( + SourceItem.id.in_(source_ids) + ).all() + popularity_map = {s.id: s.popularity for s in sources} + + for chunk in chunks: + popularity = popularity_map.get(chunk.source_id, 1.0) + if popularity != 1.0: + # Apply boost: score * (1 + POPULARITY_BOOST * (popularity - 1)) + # For popularity=2.0: multiplier = 1.02 + # For popularity=0.5: multiplier = 0.99 + multiplier = 1.0 + POPULARITY_BOOST * (popularity - 1.0) + chunk.relevance_score = (chunk.relevance_score or 0) * multiplier + + +def fuse_scores_rrf( embedding_scores: dict[str, float], bm25_scores: dict[str, float], ) -> dict[str, float]: """ - Fuse embedding and BM25 scores using weighted combination with hybrid bonus. + Fuse embedding and BM25 scores using Reciprocal Rank Fusion (RRF). - Documents appearing in both search results get a bonus, as matching both - semantic similarity AND keyword relevance is a strong signal. + RRF is more robust than weighted score combination because it uses ranks + rather than raw scores, making it insensitive to score scale differences. 
+
+    Formula: score(d) = Σ 1/(k + rank_i(d))
 
     Args:
-        embedding_scores: Dict mapping chunk IDs to embedding similarity scores (0-1)
-        bm25_scores: Dict mapping chunk IDs to normalized BM25 scores (0-1)
+        embedding_scores: Dict mapping chunk IDs to embedding similarity scores
+        bm25_scores: Dict mapping chunk IDs to BM25 scores
 
     Returns:
-        Dict mapping chunk IDs to fused scores (0-1 range)
+        Dict mapping chunk IDs to RRF scores
     """
+    # Convert scores to ranks (1-indexed)
+    emb_ranked = sorted(embedding_scores.keys(), key=lambda x: embedding_scores[x], reverse=True)
+    bm25_ranked = sorted(bm25_scores.keys(), key=lambda x: bm25_scores[x], reverse=True)
+
+    emb_ranks = {chunk_id: rank + 1 for rank, chunk_id in enumerate(emb_ranked)}
+    bm25_ranks = {chunk_id: rank + 1 for rank, chunk_id in enumerate(bm25_ranked)}
+
+    # Compute RRF scores
     all_ids = set(embedding_scores.keys()) | set(bm25_scores.keys())
 
     fused: dict[str, float] = {}
     for chunk_id in all_ids:
-        emb_score = embedding_scores.get(chunk_id, 0.0)
-        bm25_score = bm25_scores.get(chunk_id, 0.0)
+        rrf_score = 0.0
 
-        # Check if result appears in both methods
-        in_both = chunk_id in embedding_scores and chunk_id in bm25_scores
+        if chunk_id in emb_ranks:
+            rrf_score += 1.0 / (RRF_K + emb_ranks[chunk_id])
 
-        # Weighted combination
-        combined = (EMBEDDING_WEIGHT * emb_score) + (BM25_WEIGHT * bm25_score)
+        if chunk_id in bm25_ranks:
+            rrf_score += 1.0 / (RRF_K + bm25_ranks[chunk_id])
 
-        # Add bonus for appearing in both (strong relevance signal)
-        if in_both:
-            combined = min(1.0, combined + HYBRID_BONUS)
-
-        fused[chunk_id] = combined
+        fused[chunk_id] = rrf_score
 
     return fused
 
@@ -131,15 +275,20 @@ async def search_chunks(
         except asyncio.TimeoutError:
             logger.warning("BM25 search timed out, using embedding results only")
 
-    # Fuse scores from both methods
-    fused_scores = fuse_scores(embedding_scores, bm25_scores)
+    # Fuse scores from both methods using Reciprocal Rank Fusion
+    fused_scores = fuse_scores_rrf(embedding_scores, bm25_scores)
 
     if not fused_scores:
         return []
 
     # Sort by score and take top results
+    # If reranking is enabled, fetch more candidates for the reranker to work with
     sorted_ids = sorted(fused_scores.keys(), key=lambda x: fused_scores[x], reverse=True)
-    top_ids = sorted_ids[:limit]
+    if settings.ENABLE_RERANKING:
+        fetch_limit = limit * RERANK_CANDIDATE_MULTIPLIER
+    else:
+        fetch_limit = limit
+    top_ids = sorted_ids[:fetch_limit]
 
     with make_session() as db:
         chunks = (
@@ -161,7 +310,32 @@
             chunk.relevance_score = fused_scores.get(str(chunk.id), 0.0)
         db.expunge_all()
 
-    return chunks
+
+    # Extract query text for boosting and reranking
+    query_text = " ".join(
+        c for chunk in data for c in chunk.data if isinstance(c, str)
+    )
+
+    # Apply query term presence boost and title boost
+    if chunks and query_text.strip():
+        query_terms = extract_query_terms(query_text)
+        apply_query_term_boost(chunks, query_terms)
+        apply_title_boost(chunks, query_terms)
+
+    # Apply popularity boost (karma-based for forum posts)
+    if chunks:
+        apply_popularity_boost(chunks)
+
+    # Rerank using cross-encoder for better precision
+    if settings.ENABLE_RERANKING and chunks and query_text.strip():
+        try:
+            chunks = await rerank_chunks(
+                query_text, chunks, model=settings.RERANK_MODEL, top_k=limit
+            )
+        except Exception as e:
+            logger.warning(f"Reranking failed, using RRF order: {e}")
+
+    return chunks
 
 
 async def search_sources(
diff --git a/src/memory/common/db/models/source_item.py b/src/memory/common/db/models/source_item.py
index 0e8ab83..71cc19c 100644
--- a/src/memory/common/db/models/source_item.py
+++ b/src/memory/common/db/models/source_item.py
@@ -371,6 +371,16 @@ class SourceItem(Base):
         """Return the list of Qdrant collections this SourceItem type can be stored in."""
         return [cls.__tablename__]
 
+    @property
+    def popularity(self) -> float:
+        """
+        Return a popularity score for this item.
+
+        Default is 1.0. Subclasses can override to provide custom popularity
+        metrics (e.g., karma, view count, citations).
+        """
+        return 1.0
+
     @property
     def display_contents(self) -> dict | None:
         payload = self.as_payload()
diff --git a/src/memory/common/db/models/source_items.py b/src/memory/common/db/models/source_items.py
index 84de0d0..d51c002 100644
--- a/src/memory/common/db/models/source_items.py
+++ b/src/memory/common/db/models/source_items.py
@@ -751,6 +751,43 @@ class ForumPost(SourceItem):
         # Very sad that I didn't keep the names consistent... Qdrant doesn't allow renaming collections
         return ["forum"]
 
+    # Karma reference values for different forum sources.
+    # Maps URL substring to karma value representing "very popular" (~90th percentile).
+    # Posts at this karma get popularity=2.0; above caps at 2.5.
+    # Based on actual LW data: 90th %ile ≈ 100, 95th ≈ 144, 99th ≈ 275
+    KARMA_REFERENCES: dict[str, int] = {
+        "lesswrong.com": 100,  # 90th percentile from data
+        "greaterwrong.com": 100,  # LW mirror
+        "alignmentforum.org": 50,  # Smaller community
+        "forum.effectivealtruism.org": 75,
+    }
+    DEFAULT_KARMA_REFERENCE: int = 50
+
+    @property
+    def karma_reference(self) -> int:
+        """Get the karma reference for this post based on its URL."""
+        url = self.url or ""
+        for pattern, ref in self.KARMA_REFERENCES.items():
+            if pattern in url:
+                return ref
+        return self.DEFAULT_KARMA_REFERENCE
+
+    @property
+    def popularity(self) -> float:
+        """
+        Return popularity based on karma, normalized to karma_reference.
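+        For a lesswrong.com post (karma_reference = 100), karma 50 maps to
+        1.5, karma 100 to 2.0, and karma 300 is capped at 2.5.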
+
+        - karma <= 0: returns 0.5 to 1.0
+        - karma = karma_reference: returns 2.0
+        - karma > karma_reference: capped at 2.5
+        """
+        karma = self.karma or 0
+        if karma <= 0:
+            # Downvoted or zero karma: scale between 0.5 and 1.0
+            return max(0.5, 1.0 - abs(karma) / 100)
+        # Positive karma: linear scale up to reference, then cap
+        return min(2.5, 1.0 + karma / self.karma_reference)
+
 
 class MiscDoc(SourceItem):
     __tablename__ = "misc_doc"
diff --git a/tests/memory/api/search/test_search.py b/tests/memory/api/search/test_search.py
index 15b2359..1fec142 100644
--- a/tests/memory/api/search/test_search.py
+++ b/tests/memory/api/search/test_search.py
@@ -11,10 +11,12 @@ from memory.api.search.search import (
     apply_query_term_boost,
     deduplicate_by_source,
     apply_title_boost,
+    apply_popularity_boost,
     fuse_scores_rrf,
     STOPWORDS,
     QUERY_TERM_BOOST,
     TITLE_MATCH_BOOST,
+    POPULARITY_BOOST,
     RRF_K,
 )
 
@@ -273,6 +275,76 @@ def test_apply_title_boost_none_title(mock_make_session):
     assert chunks[0].relevance_score == 0.5
 
 
+# ============================================================================
+# apply_popularity_boost tests
+# ============================================================================
+
+
+def _make_pop_chunk(source_id: int, score: float = 0.5):
+    """Create a mock chunk for popularity boost tests."""
+    chunk = MagicMock()
+    chunk.source_id = source_id
+    chunk.relevance_score = score
+    return chunk
+
+
+@pytest.mark.parametrize(
+    "popularity,initial_score,expected_multiplier",
+    [
+        (1.0, 0.5, 1.0),  # Default popularity, no change
+        (2.0, 0.5, 1.0 + POPULARITY_BOOST),  # High popularity
+        (0.5, 0.5, 1.0 - POPULARITY_BOOST * 0.5),  # Low popularity
+        (1.5, 1.0, 1.0 + POPULARITY_BOOST * 0.5),  # Moderate popularity
+    ],
+)
+@patch("memory.api.search.search.make_session")
+def test_apply_popularity_boost(mock_make_session, popularity, initial_score, expected_multiplier):
+    """Should boost chunks based on source popularity."""
+    mock_session = MagicMock()
+    mock_make_session.return_value.__enter__ = MagicMock(return_value=mock_session)
+    mock_make_session.return_value.__exit__ = MagicMock(return_value=None)
+
+    mock_source = MagicMock()
+    mock_source.id = 1
+    mock_source.popularity = popularity
+    mock_session.query.return_value.filter.return_value.all.return_value = [mock_source]
+
+    chunks = [_make_pop_chunk(1, initial_score)]
+    apply_popularity_boost(chunks)
+
+    expected = initial_score * expected_multiplier
+    assert chunks[0].relevance_score == pytest.approx(expected)
+
+
+def test_apply_popularity_boost_empty_chunks():
+    """Should handle empty chunks list."""
+    apply_popularity_boost([])  # Should not raise
+
+
+@patch("memory.api.search.search.make_session")
+def test_apply_popularity_boost_multiple_sources(mock_make_session):
+    """Should apply different boosts per source."""
+    mock_session = MagicMock()
+    mock_make_session.return_value.__enter__ = MagicMock(return_value=mock_session)
+    mock_make_session.return_value.__exit__ = MagicMock(return_value=None)
+
+    source1 = MagicMock()
+    source1.id = 1
+    source1.popularity = 2.0  # High karma
+    source2 = MagicMock()
+    source2.id = 2
+    source2.popularity = 1.0  # Default
+    mock_session.query.return_value.filter.return_value.all.return_value = [source1, source2]
+
+    chunks = [_make_pop_chunk(1, 0.5), _make_pop_chunk(2, 0.5)]
+    apply_popularity_boost(chunks)
+
+    # Source 1 should be boosted
+    assert chunks[0].relevance_score == pytest.approx(0.5 * (1.0 + POPULARITY_BOOST))
+    # Source 2 should be unchanged (popularity = 1.0)
+    assert chunks[1].relevance_score == 0.5
+
+
 # ============================================================================
 # fuse_scores_rrf tests
 # ============================================================================
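+
+# A minimal illustrative sketch of the RRF arithmetic, using only names
+# imported above (fuse_scores_rrf, RRF_K, pytest): ranks are 1-indexed,
+# so a document ranked 1st by both methods scores 2/(RRF_K + 1).
+def test_fuse_scores_rrf_worked_example():
+    fused = fuse_scores_rrf({"a": 0.9, "b": 0.5}, {"a": 0.8, "c": 0.7})
+    assert fused["a"] == pytest.approx(2 / (RRF_K + 1))  # rank 1 in both lists
+    assert fused["b"] == pytest.approx(1 / (RRF_K + 2))  # rank 2, embeddings only
+    assert fused["c"] == pytest.approx(1 / (RRF_K + 2))  # rank 2, BM25 only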