Add popularity boosting to search based on karma

- Add `popularity` property to SourceItem base class (default 1.0)
- Override in ForumPost with karma-based calculation:
  - Uses KARMA_REFERENCES dict mapping URL patterns to reference values
  - LessWrong: 100 (90th percentile from actual data)
  - Reference karma gives popularity=2.0, caps at 2.5
- Add apply_popularity_boost() to search pipeline
- POPULARITY_BOOST = 0.02 (2% score adjustment per popularity unit)
- Add comprehensive tests for popularity boost

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in: parent 09215adf9a, commit f3d8b6602b
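For orientation before the diff: the popularity value described above reaches the final relevance score through a small multiplier. The sketch below only restates the formula from the commit with illustrative numbers; `boosted()` is a hypothetical helper, not code from the diff (POPULARITY_BOOST and apply_popularity_boost() are).

POPULARITY_BOOST = 0.02  # 2% score adjustment per popularity unit above/below 1.0

def boosted(score: float, popularity: float) -> float:
    # score * (1 + POPULARITY_BOOST * (popularity - 1)), as in apply_popularity_boost()
    return score * (1.0 + POPULARITY_BOOST * (popularity - 1.0))

print(boosted(0.50, 2.0))   # 0.51  -- a reference-karma post gets a 2% bump
print(boosted(0.50, 2.5))   # 0.515 -- the cap keeps even viral posts to +3%
print(boosted(0.50, 0.5))   # 0.495 -- heavily downvoted posts lose 1%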
@@ -20,60 +20,204 @@ if settings.ENABLE_BM25_SEARCH:
 if settings.ENABLE_HYDE_EXPANSION:
     from memory.api.search.hyde import expand_query_hyde

+if settings.ENABLE_RERANKING:
+    from memory.api.search.rerank import rerank_chunks
+
 from memory.api.search.types import SearchConfig, SearchFilters, SearchResult

 logger = logging.getLogger(__name__)

-# Weight for embedding scores vs BM25 scores in hybrid fusion
-# Higher values favor semantic similarity over keyword matching
-EMBEDDING_WEIGHT = 0.7
-BM25_WEIGHT = 0.3
-
-# Bonus for results that appear in both embedding and BM25 search
-# This rewards documents that match both semantically and lexically
-HYBRID_BONUS = 0.15
+# Reciprocal Rank Fusion constant (k parameter)
+# Higher values reduce the influence of top-ranked documents
+# 60 is the standard value from the original RRF paper
+RRF_K = 60

 # Multiplier for internal search limit before fusion
 # We search for more candidates than requested, fuse scores, then return top N
 # This helps find results that rank well in one method but not the other
 CANDIDATE_MULTIPLIER = 5

+# How many candidates to pass to reranker (multiplier of final limit)
+# Higher = more accurate but slower and more expensive
+RERANK_CANDIDATE_MULTIPLIER = 3
+
+# Bonus for chunks containing query terms (added to RRF score)
+QUERY_TERM_BOOST = 0.005
+
+# Bonus when query terms match the source title (stronger signal)
+TITLE_MATCH_BOOST = 0.01
+
+# Bonus multiplier for popularity (applied as: score * (1 + POPULARITY_BOOST * (popularity - 1)))
+# This gives a small boost to popular items without dominating relevance
+POPULARITY_BOOST = 0.02
+
+# Common words to ignore when checking for query term presence
+STOPWORDS = {
+    "a", "an", "the", "is", "are", "was", "were", "be", "been", "being",
+    "have", "has", "had", "do", "does", "did", "will", "would", "could",
+    "should", "may", "might", "must", "shall", "can", "need", "dare",
+    "ought", "used", "to", "of", "in", "for", "on", "with", "at", "by",
+    "from", "as", "into", "through", "during", "before", "after", "above",
+    "below", "between", "under", "again", "further", "then", "once", "here",
+    "there", "when", "where", "why", "how", "all", "each", "few", "more",
+    "most", "other", "some", "such", "no", "nor", "not", "only", "own",
+    "same", "so", "than", "too", "very", "just", "and", "but", "if", "or",
+    "because", "until", "while", "although", "though", "after", "before",
+    "what", "which", "who", "whom", "this", "that", "these", "those", "i",
+    "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your",
+    "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she",
+    "her", "hers", "herself", "it", "its", "itself", "they", "them", "their",
+    "theirs", "themselves", "about", "get", "got", "getting", "like", "also",
+}
+
+
+def extract_query_terms(query: str) -> set[str]:
+    """Extract meaningful terms from query, filtering stopwords."""
+    words = query.lower().split()
+    return {w for w in words if w not in STOPWORDS and len(w) > 2}
+
+
+def apply_query_term_boost(
+    chunks: list[Chunk],
+    query_terms: set[str],
+) -> None:
+    """
+    Boost chunk scores when query terms appear in content.
+
+    This helps surface chunks that contain exact query words even if
+    embedding similarity is lower.
+    """
+    if not query_terms:
+        return
+
+    for chunk in chunks:
+        content = (chunk.content or "").lower()
+        matches = sum(1 for term in query_terms if term in content)
+        if matches > 0:
+            # Boost proportional to fraction of query terms matched
+            boost = QUERY_TERM_BOOST * (matches / len(query_terms))
+            chunk.relevance_score = (chunk.relevance_score or 0) + boost
+
+
+def deduplicate_by_source(chunks: list[Chunk]) -> list[Chunk]:
+    """
+    Keep only the highest-scoring chunk per source.
+
+    This prevents multiple chunks from the same article from crowding out
+    other potentially relevant sources.
+    """
+    best_by_source: dict[int, Chunk] = {}
+    for chunk in chunks:
+        source_id = chunk.source_id
+        if source_id not in best_by_source:
+            best_by_source[source_id] = chunk
+        elif (chunk.relevance_score or 0) > (best_by_source[source_id].relevance_score or 0):
+            best_by_source[source_id] = chunk
+    return list(best_by_source.values())
+
+
+def apply_title_boost(
+    chunks: list[Chunk],
+    query_terms: set[str],
+) -> None:
+    """
+    Boost chunks when query terms match the source title.
+
+    Title matches are a strong signal since titles summarize content.
+    """
+    if not query_terms or not chunks:
+        return
+
+    # Get unique source IDs
+    source_ids = list({chunk.source_id for chunk in chunks})
+
+    # Fetch full source items (polymorphic) to access title attribute
+    with make_session() as db:
+        sources = db.query(SourceItem).filter(
+            SourceItem.id.in_(source_ids)
+        ).all()
+        titles = {s.id: (getattr(s, 'title', None) or "").lower() for s in sources}
+
+    # Apply boost to chunks whose source title matches query terms
+    for chunk in chunks:
+        title = titles.get(chunk.source_id, "")
+        if not title:
+            continue
+
+        matches = sum(1 for term in query_terms if term in title)
+        if matches > 0:
+            boost = TITLE_MATCH_BOOST * (matches / len(query_terms))
+            chunk.relevance_score = (chunk.relevance_score or 0) + boost
+
+
+def apply_popularity_boost(chunks: list[Chunk]) -> None:
+    """
+    Boost chunks based on source popularity.
+
+    Uses the popularity property from SourceItem subclasses.
+    ForumPost uses karma, others default to 1.0.
+    """
+    if not chunks:
+        return
+
+    source_ids = list({chunk.source_id for chunk in chunks})
+
+    with make_session() as db:
+        sources = db.query(SourceItem).filter(
+            SourceItem.id.in_(source_ids)
+        ).all()
+        popularity_map = {s.id: s.popularity for s in sources}
+
+    for chunk in chunks:
+        popularity = popularity_map.get(chunk.source_id, 1.0)
+        if popularity != 1.0:
+            # Apply boost: score * (1 + POPULARITY_BOOST * (popularity - 1))
+            # For popularity=2.0: multiplier = 1.02
+            # For popularity=0.5: multiplier = 0.99
+            multiplier = 1.0 + POPULARITY_BOOST * (popularity - 1.0)
+            chunk.relevance_score = (chunk.relevance_score or 0) * multiplier
+
+
-def fuse_scores(
+def fuse_scores_rrf(
     embedding_scores: dict[str, float],
     bm25_scores: dict[str, float],
 ) -> dict[str, float]:
     """
-    Fuse embedding and BM25 scores using weighted combination with hybrid bonus.
+    Fuse embedding and BM25 scores using Reciprocal Rank Fusion (RRF).

-    Documents appearing in both search results get a bonus, as matching both
-    semantic similarity AND keyword relevance is a strong signal.
+    RRF is more robust than weighted score combination because it uses ranks
+    rather than raw scores, making it insensitive to score scale differences.
+
+    Formula: score(d) = Σ 1/(k + rank_i(d))

     Args:
-        embedding_scores: Dict mapping chunk IDs to embedding similarity scores (0-1)
-        bm25_scores: Dict mapping chunk IDs to normalized BM25 scores (0-1)
+        embedding_scores: Dict mapping chunk IDs to embedding similarity scores
+        bm25_scores: Dict mapping chunk IDs to BM25 scores

     Returns:
-        Dict mapping chunk IDs to fused scores (0-1 range)
+        Dict mapping chunk IDs to RRF scores
     """
+    # Convert scores to ranks (1-indexed)
+    emb_ranked = sorted(embedding_scores.keys(), key=lambda x: embedding_scores[x], reverse=True)
+    bm25_ranked = sorted(bm25_scores.keys(), key=lambda x: bm25_scores[x], reverse=True)
+
+    emb_ranks = {chunk_id: rank + 1 for rank, chunk_id in enumerate(emb_ranked)}
+    bm25_ranks = {chunk_id: rank + 1 for rank, chunk_id in enumerate(bm25_ranked)}
+
+    # Compute RRF scores
     all_ids = set(embedding_scores.keys()) | set(bm25_scores.keys())
     fused: dict[str, float] = {}

     for chunk_id in all_ids:
-        emb_score = embedding_scores.get(chunk_id, 0.0)
-        bm25_score = bm25_scores.get(chunk_id, 0.0)
-
-        # Check if result appears in both methods
-        in_both = chunk_id in embedding_scores and chunk_id in bm25_scores
-
-        # Weighted combination
-        combined = (EMBEDDING_WEIGHT * emb_score) + (BM25_WEIGHT * bm25_score)
-
-        # Add bonus for appearing in both (strong relevance signal)
-        if in_both:
-            combined = min(1.0, combined + HYBRID_BONUS)
-
-        fused[chunk_id] = combined
+        rrf_score = 0.0
+
+        if chunk_id in emb_ranks:
+            rrf_score += 1.0 / (RRF_K + emb_ranks[chunk_id])
+
+        if chunk_id in bm25_ranks:
+            rrf_score += 1.0 / (RRF_K + bm25_ranks[chunk_id])
+
+        fused[chunk_id] = rrf_score

     return fused
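For intuition, here is a toy invocation of the new fuse_scores_rrf with made-up chunk IDs and scores (not part of the diff); it shows why rank-based fusion tolerates the very different scales of cosine similarities and BM25 scores.

embedding_scores = {"a": 0.92, "b": 0.85, "c": 0.40}
bm25_scores = {"b": 7.3, "d": 5.1}

fused = fuse_scores_rrf(embedding_scores, bm25_scores)
# "a": 1/(60+1)            ≈ 0.0164  (embedding rank 1 only)
# "b": 1/(60+2) + 1/(60+1) ≈ 0.0325  (embedding rank 2 + BM25 rank 1 -> highest)
# "c": 1/(60+3)            ≈ 0.0159
# "d": 1/(60+2)            ≈ 0.0161
# "b" wins because it ranks near the top in both methods, even though the raw
# BM25 scores live on a completely different scale from the similarities.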
@@ -131,15 +275,20 @@ async def search_chunks(
     except asyncio.TimeoutError:
         logger.warning("BM25 search timed out, using embedding results only")

-    # Fuse scores from both methods
-    fused_scores = fuse_scores(embedding_scores, bm25_scores)
+    # Fuse scores from both methods using Reciprocal Rank Fusion
+    fused_scores = fuse_scores_rrf(embedding_scores, bm25_scores)

     if not fused_scores:
         return []

     # Sort by score and take top results
+    # If reranking is enabled, fetch more candidates for the reranker to work with
     sorted_ids = sorted(fused_scores.keys(), key=lambda x: fused_scores[x], reverse=True)
-    top_ids = sorted_ids[:limit]
+    if settings.ENABLE_RERANKING:
+        fetch_limit = limit * RERANK_CANDIDATE_MULTIPLIER
+    else:
+        fetch_limit = limit
+    top_ids = sorted_ids[:fetch_limit]

     with make_session() as db:
         chunks = (
@@ -161,6 +310,31 @@ async def search_chunks(
             chunk.relevance_score = fused_scores.get(str(chunk.id), 0.0)

         db.expunge_all()

+    # Extract query text for boosting and reranking
+    query_text = " ".join(
+        c for chunk in data for c in chunk.data if isinstance(c, str)
+    )
+
+    # Apply query term presence boost and title boost
+    if chunks and query_text.strip():
+        query_terms = extract_query_terms(query_text)
+        apply_query_term_boost(chunks, query_terms)
+        apply_title_boost(chunks, query_terms)
+
+    # Apply popularity boost (karma-based for forum posts)
+    if chunks:
+        apply_popularity_boost(chunks)
+
+    # Rerank using cross-encoder for better precision
+    if settings.ENABLE_RERANKING and chunks and query_text.strip():
+        try:
+            chunks = await rerank_chunks(
+                query_text, chunks, model=settings.RERANK_MODEL, top_k=limit
+            )
+        except Exception as e:
+            logger.warning(f"Reranking failed, using RRF order: {e}")
+
     return chunks
@@ -371,6 +371,16 @@ class SourceItem(Base):
         """Return the list of Qdrant collections this SourceItem type can be stored in."""
         return [cls.__tablename__]

+    @property
+    def popularity(self) -> float:
+        """
+        Return a popularity score for this item.
+
+        Default is 1.0. Subclasses can override to provide custom popularity
+        metrics (e.g., karma, view count, citations).
+        """
+        return 1.0
+
     @property
     def display_contents(self) -> dict | None:
         payload = self.as_payload()
@@ -751,6 +751,43 @@ class ForumPost(SourceItem):
         # Very sad that I didn't keep the names consistent... Qdrant doesn't allow renaming collections
         return ["forum"]

+    # Karma reference values for different forum sources.
+    # Maps URL substring to karma value representing "very popular" (~90th percentile).
+    # Posts at this karma get popularity=2.0; above caps at 2.5.
+    # Based on actual LW data: 90th %ile ≈ 100, 95th ≈ 144, 99th ≈ 275
+    KARMA_REFERENCES: dict[str, int] = {
+        "lesswrong.com": 100,  # 90th percentile from data
+        "greaterwrong.com": 100,  # LW mirror
+        "alignmentforum.org": 50,  # Smaller community
+        "forum.effectivealtruism.org": 75,
+    }
+    DEFAULT_KARMA_REFERENCE: int = 50
+
+    @property
+    def karma_reference(self) -> int:
+        """Get the karma reference for this post based on its URL."""
+        url = self.url or ""
+        for pattern, ref in self.KARMA_REFERENCES.items():
+            if pattern in url:
+                return ref
+        return self.DEFAULT_KARMA_REFERENCE
+
+    @property
+    def popularity(self) -> float:
+        """
+        Return popularity based on karma, normalized to karma_reference.
+
+        - karma <= 0: returns 0.5 to 1.0
+        - karma = karma_reference: returns 2.0
+        - karma > karma_reference: capped at 2.5
+        """
+        karma = self.karma or 0
+        if karma <= 0:
+            # Downvoted or zero karma: scale between 0.5 and 1.0
+            return max(0.5, 1.0 - abs(karma) / 100)
+        # Positive karma: linear scale up to reference, then cap
+        return min(2.5, 1.0 + karma / self.karma_reference)
+
+
 class MiscDoc(SourceItem):
     __tablename__ = "misc_doc"
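To make the karma-to-popularity mapping above concrete, a few illustrative values for a lesswrong.com post (karma_reference = 100), computed with the same formulas; the karma numbers are made up for the example.

# karma = -80 -> max(0.5, 1.0 - 80/100)  = 0.5   (floor for heavily downvoted posts)
# karma =   0 -> max(0.5, 1.0 - 0/100)   = 1.0   (neutral)
# karma =  50 -> min(2.5, 1.0 + 50/100)  = 1.5
# karma = 100 -> min(2.5, 1.0 + 100/100) = 2.0   (reference karma)
# karma = 300 -> min(2.5, 1.0 + 300/100) = 2.5   (capped)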
@@ -11,10 +11,12 @@ from memory.api.search.search import (
     apply_query_term_boost,
     deduplicate_by_source,
     apply_title_boost,
+    apply_popularity_boost,
     fuse_scores_rrf,
     STOPWORDS,
     QUERY_TERM_BOOST,
     TITLE_MATCH_BOOST,
+    POPULARITY_BOOST,
     RRF_K,
 )
@@ -273,6 +275,76 @@ def test_apply_title_boost_none_title(mock_make_session):
     assert chunks[0].relevance_score == 0.5


+# ============================================================================
+# apply_popularity_boost tests
+# ============================================================================
+
+
+def _make_pop_chunk(source_id: int, score: float = 0.5):
+    """Create a mock chunk for popularity boost tests."""
+    chunk = MagicMock()
+    chunk.source_id = source_id
+    chunk.relevance_score = score
+    return chunk
+
+
+@pytest.mark.parametrize(
+    "popularity,initial_score,expected_multiplier",
+    [
+        (1.0, 0.5, 1.0),  # Default popularity, no change
+        (2.0, 0.5, 1.0 + POPULARITY_BOOST),  # High popularity
+        (0.5, 0.5, 1.0 - POPULARITY_BOOST * 0.5),  # Low popularity
+        (1.5, 1.0, 1.0 + POPULARITY_BOOST * 0.5),  # Moderate popularity
+    ],
+)
+@patch("memory.api.search.search.make_session")
+def test_apply_popularity_boost(mock_make_session, popularity, initial_score, expected_multiplier):
+    """Should boost chunks based on source popularity."""
+    mock_session = MagicMock()
+    mock_make_session.return_value.__enter__ = MagicMock(return_value=mock_session)
+    mock_make_session.return_value.__exit__ = MagicMock(return_value=None)
+
+    mock_source = MagicMock()
+    mock_source.id = 1
+    mock_source.popularity = popularity
+    mock_session.query.return_value.filter.return_value.all.return_value = [mock_source]
+
+    chunks = [_make_pop_chunk(1, initial_score)]
+    apply_popularity_boost(chunks)
+
+    expected = initial_score * expected_multiplier
+    assert chunks[0].relevance_score == pytest.approx(expected)
+
+
+def test_apply_popularity_boost_empty_chunks():
+    """Should handle empty chunks list."""
+    apply_popularity_boost([])  # Should not raise
+
+
+@patch("memory.api.search.search.make_session")
+def test_apply_popularity_boost_multiple_sources(mock_make_session):
+    """Should apply different boosts per source."""
+    mock_session = MagicMock()
+    mock_make_session.return_value.__enter__ = MagicMock(return_value=mock_session)
+    mock_make_session.return_value.__exit__ = MagicMock(return_value=None)
+
+    source1 = MagicMock()
+    source1.id = 1
+    source1.popularity = 2.0  # High karma
+    source2 = MagicMock()
+    source2.id = 2
+    source2.popularity = 1.0  # Default
+    mock_session.query.return_value.filter.return_value.all.return_value = [source1, source2]
+
+    chunks = [_make_pop_chunk(1, 0.5), _make_pop_chunk(2, 0.5)]
+    apply_popularity_boost(chunks)
+
+    # Source 1 should be boosted
+    assert chunks[0].relevance_score == pytest.approx(0.5 * (1.0 + POPULARITY_BOOST))
+    # Source 2 should be unchanged (popularity = 1.0)
+    assert chunks[1].relevance_score == 0.5
+
+
 # ============================================================================
 # fuse_scores_rrf tests
 # ============================================================================