Add popularity boosting to search based on karma

- Add `popularity` property to SourceItem base class (default 1.0)
- Override in ForumPost with karma-based calculation:
  - Uses KARMA_REFERENCES dict mapping URL patterns to reference values
  - LessWrong: 100 (90th percentile from actual data)
  - Reference karma gives popularity=2.0, caps at 2.5
- Add apply_popularity_boost() to search pipeline
- POPULARITY_BOOST = 0.02 (2% score adjustment per popularity unit; see the worked example below)
- Add comprehensive tests for popularity boost
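
Worked example of the boost math (illustrative numbers, not program output):
  - LessWrong post at reference karma: popularity = min(2.5, 1 + 100/100) = 2.0
  - score multiplier = 1 + 0.02 * (2.0 - 1.0) = 1.02
  - a chunk scored 0.500 becomes 0.510, so popularity breaks ties without overriding relevance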

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
mruwnik 2025-12-20 22:44:06 +00:00
parent 09215adf9a
commit f3d8b6602b
4 changed files with 323 additions and 30 deletions


@@ -20,60 +20,204 @@ if settings.ENABLE_BM25_SEARCH:
if settings.ENABLE_HYDE_EXPANSION:
    from memory.api.search.hyde import expand_query_hyde
if settings.ENABLE_RERANKING:
    from memory.api.search.rerank import rerank_chunks
from memory.api.search.types import SearchConfig, SearchFilters, SearchResult

logger = logging.getLogger(__name__)
# Weight for embedding scores vs BM25 scores in hybrid fusion
# Higher values favor semantic similarity over keyword matching
EMBEDDING_WEIGHT = 0.7
BM25_WEIGHT = 0.3
# Bonus for results that appear in both embedding and BM25 search
# This rewards documents that match both semantically and lexically
HYBRID_BONUS = 0.15
# Reciprocal Rank Fusion constant (k parameter)
# Higher values reduce the influence of top-ranked documents
# 60 is the standard value from the original RRF paper
RRF_K = 60
# Multiplier for internal search limit before fusion
# We search for more candidates than requested, fuse scores, then return top N
# This helps find results that rank well in one method but not the other
CANDIDATE_MULTIPLIER = 5
# How many candidates to pass to reranker (multiplier of final limit)
# Higher = more accurate but slower and more expensive
RERANK_CANDIDATE_MULTIPLIER = 3

# Bonus for chunks containing query terms (added to RRF score)
QUERY_TERM_BOOST = 0.005
# Bonus when query terms match the source title (stronger signal)
TITLE_MATCH_BOOST = 0.01
# Bonus multiplier for popularity (applied as: score * (1 + POPULARITY_BOOST * (popularity - 1)))
# This gives a small boost to popular items without dominating relevance
POPULARITY_BOOST = 0.02
# Common words to ignore when checking for query term presence
STOPWORDS = {
    "a", "an", "the", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "do", "does", "did", "will", "would", "could",
    "should", "may", "might", "must", "shall", "can", "need", "dare",
    "ought", "used", "to", "of", "in", "for", "on", "with", "at", "by",
    "from", "as", "into", "through", "during", "before", "after", "above",
    "below", "between", "under", "again", "further", "then", "once", "here",
    "there", "when", "where", "why", "how", "all", "each", "few", "more",
    "most", "other", "some", "such", "no", "nor", "not", "only", "own",
    "same", "so", "than", "too", "very", "just", "and", "but", "if", "or",
    "because", "until", "while", "although", "though", "after", "before",
    "what", "which", "who", "whom", "this", "that", "these", "those", "i",
    "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your",
    "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she",
    "her", "hers", "herself", "it", "its", "itself", "they", "them", "their",
    "theirs", "themselves", "about", "get", "got", "getting", "like", "also",
}


def extract_query_terms(query: str) -> set[str]:
    """Extract meaningful terms from query, filtering stopwords."""
    words = query.lower().split()
    return {w for w in words if w not in STOPWORDS and len(w) > 2}
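
# Illustrative (not part of this commit's diff): stopwords and tokens of one
# or two characters are dropped, keeping only content-bearing words.
#   extract_query_terms("how does popularity boosting work")
#   -> {"popularity", "boosting", "work"}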


def apply_query_term_boost(
    chunks: list[Chunk],
    query_terms: set[str],
) -> None:
    """
    Boost chunk scores when query terms appear in content.

    This helps surface chunks that contain exact query words even if
    embedding similarity is lower.
    """
    if not query_terms:
        return
    for chunk in chunks:
        content = (chunk.content or "").lower()
        matches = sum(1 for term in query_terms if term in content)
        if matches > 0:
            # Boost proportional to fraction of query terms matched
            boost = QUERY_TERM_BOOST * (matches / len(query_terms))
            chunk.relevance_score = (chunk.relevance_score or 0) + boost
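
# Sketch of the arithmetic on a hypothetical chunk (SimpleNamespace stands in
# for the ORM Chunk, which this function only duck-types):
#   from types import SimpleNamespace
#   chunk = SimpleNamespace(content="Karma boosting for popular posts", relevance_score=0.0163)
#   apply_query_term_boost([chunk], {"karma", "boosting", "decay"})
#   # 2 of 3 terms match: 0.0163 + 0.005 * (2 / 3) ≈ 0.0196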


def deduplicate_by_source(chunks: list[Chunk]) -> list[Chunk]:
    """
    Keep only the highest-scoring chunk per source.

    This prevents multiple chunks from the same article from crowding out
    other potentially relevant sources.
    """
    best_by_source: dict[int, Chunk] = {}
    for chunk in chunks:
        source_id = chunk.source_id
        if source_id not in best_by_source:
            best_by_source[source_id] = chunk
        elif (chunk.relevance_score or 0) > (best_by_source[source_id].relevance_score or 0):
            best_by_source[source_id] = chunk
    return list(best_by_source.values())


def apply_title_boost(
    chunks: list[Chunk],
    query_terms: set[str],
) -> None:
    """
    Boost chunks when query terms match the source title.

    Title matches are a strong signal since titles summarize content.
    """
    if not query_terms or not chunks:
        return

    # Get unique source IDs
    source_ids = list({chunk.source_id for chunk in chunks})

    # Fetch full source items (polymorphic) to access title attribute
    with make_session() as db:
        sources = db.query(SourceItem).filter(
            SourceItem.id.in_(source_ids)
        ).all()
        titles = {s.id: (getattr(s, 'title', None) or "").lower() for s in sources}

    # Apply boost to chunks whose source title matches query terms
    for chunk in chunks:
        title = titles.get(chunk.source_id, "")
        if not title:
            continue
        matches = sum(1 for term in query_terms if term in title)
        if matches > 0:
            boost = TITLE_MATCH_BOOST * (matches / len(query_terms))
            chunk.relevance_score = (chunk.relevance_score or 0) + boost


def apply_popularity_boost(chunks: list[Chunk]) -> None:
    """
    Boost chunks based on source popularity.

    Uses the popularity property from SourceItem subclasses.
    ForumPost uses karma, others default to 1.0.
    """
    if not chunks:
        return

    source_ids = list({chunk.source_id for chunk in chunks})
    with make_session() as db:
        sources = db.query(SourceItem).filter(
            SourceItem.id.in_(source_ids)
        ).all()
        popularity_map = {s.id: s.popularity for s in sources}

    for chunk in chunks:
        popularity = popularity_map.get(chunk.source_id, 1.0)
        if popularity != 1.0:
            # Apply boost: score * (1 + POPULARITY_BOOST * (popularity - 1))
            # For popularity=2.0: multiplier = 1.02
            # For popularity=0.5: multiplier = 0.99
            multiplier = 1.0 + POPULARITY_BOOST * (popularity - 1.0)
            chunk.relevance_score = (chunk.relevance_score or 0) * multiplier


def fuse_scores_rrf(
    embedding_scores: dict[str, float],
    bm25_scores: dict[str, float],
) -> dict[str, float]:
    """
    Fuse embedding and BM25 scores using Reciprocal Rank Fusion (RRF).

    RRF is more robust than weighted score combination because it uses ranks
    rather than raw scores, making it insensitive to score scale differences.

    Formula: score(d) = Σ 1/(k + rank_i(d))

    Args:
        embedding_scores: Dict mapping chunk IDs to embedding similarity scores
        bm25_scores: Dict mapping chunk IDs to BM25 scores

    Returns:
        Dict mapping chunk IDs to RRF scores
    """
    # Convert scores to ranks (1-indexed)
    emb_ranked = sorted(embedding_scores.keys(), key=lambda x: embedding_scores[x], reverse=True)
    bm25_ranked = sorted(bm25_scores.keys(), key=lambda x: bm25_scores[x], reverse=True)
    emb_ranks = {chunk_id: rank + 1 for rank, chunk_id in enumerate(emb_ranked)}
    bm25_ranks = {chunk_id: rank + 1 for rank, chunk_id in enumerate(bm25_ranked)}

    # Compute RRF scores
    all_ids = set(embedding_scores.keys()) | set(bm25_scores.keys())
    fused: dict[str, float] = {}
    for chunk_id in all_ids:
        rrf_score = 0.0
        if chunk_id in emb_ranks:
            rrf_score += 1.0 / (RRF_K + emb_ranks[chunk_id])
        if chunk_id in bm25_ranks:
            rrf_score += 1.0 / (RRF_K + bm25_ranks[chunk_id])
        fused[chunk_id] = rrf_score
    return fused
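
# The rank arithmetic, made concrete (illustrative scores, not from the diff):
#   emb = {"a": 0.91, "b": 0.80}  # embedding ranks: a=1, b=2
#   bm25 = {"b": 7.2, "c": 3.1}   # BM25 ranks: b=1, c=2 (raw scale is irrelevant)
#   fuse_scores_rrf(emb, bm25)
#   # a: 1/61 ≈ 0.0164, c: 1/62 ≈ 0.0161
#   # b: 1/61 + 1/62 ≈ 0.0325 -- appearing in both lists dominates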

@@ -131,15 +275,20 @@ async def search_chunks(
    except asyncio.TimeoutError:
        logger.warning("BM25 search timed out, using embedding results only")

    # Fuse scores from both methods using Reciprocal Rank Fusion
    fused_scores = fuse_scores_rrf(embedding_scores, bm25_scores)

    if not fused_scores:
        return []

    # Sort by score and take top results
    # If reranking is enabled, fetch more candidates for the reranker to work with
    sorted_ids = sorted(fused_scores.keys(), key=lambda x: fused_scores[x], reverse=True)
    if settings.ENABLE_RERANKING:
        fetch_limit = limit * RERANK_CANDIDATE_MULTIPLIER
    else:
        fetch_limit = limit
    top_ids = sorted_ids[:fetch_limit]

    with make_session() as db:
        chunks = (

@@ -161,7 +310,32 @@ async def search_chunks(
            chunk.relevance_score = fused_scores.get(str(chunk.id), 0.0)
        db.expunge_all()

    # Extract query text for boosting and reranking
    query_text = " ".join(
        c for chunk in data for c in chunk.data if isinstance(c, str)
    )

    # Apply query term presence boost and title boost
    if chunks and query_text.strip():
        query_terms = extract_query_terms(query_text)
        apply_query_term_boost(chunks, query_terms)
        apply_title_boost(chunks, query_terms)

    # Apply popularity boost (karma-based for forum posts)
    if chunks:
        apply_popularity_boost(chunks)

    # Rerank using cross-encoder for better precision
    if settings.ENABLE_RERANKING and chunks and query_text.strip():
        try:
            chunks = await rerank_chunks(
                query_text, chunks, model=settings.RERANK_MODEL, top_k=limit
            )
        except Exception as e:
            logger.warning(f"Reranking failed, using RRF order: {e}")

    return chunks
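
# How the boosts compose for a single chunk (hypothetical values):
#   score = 0.0325                                # RRF score from fusion
#   score += QUERY_TERM_BOOST * (2 / 3)           # 2 of 3 query terms in the body
#   score += TITLE_MATCH_BOOST * (1 / 3)          # 1 of 3 query terms in the title
#   score *= 1 + POPULARITY_BOOST * (2.0 - 1.0)   # source popularity 2.0
#   # ≈ 0.0400 before optional cross-encoder reranking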
async def search_sources(


@@ -371,6 +371,16 @@ class SourceItem(Base):
        """Return the list of Qdrant collections this SourceItem type can be stored in."""
        return [cls.__tablename__]

    @property
    def popularity(self) -> float:
        """
        Return a popularity score for this item.

        Default is 1.0. Subclasses can override to provide custom popularity
        metrics (e.g., karma, view count, citations).
        """
        return 1.0

    @property
    def display_contents(self) -> dict | None:
        payload = self.as_payload()


@@ -751,6 +751,43 @@ class ForumPost(SourceItem):
        # Very sad that I didn't keep the names consistent... Qdrant doesn't allow renaming collections
        return ["forum"]

    # Karma reference values for different forum sources.
    # Maps URL substring to karma value representing "very popular" (~90th percentile).
    # Posts at this karma get popularity=2.0; above caps at 2.5.
    # Based on actual LW data: 90th %ile ≈ 100, 95th ≈ 144, 99th ≈ 275
    KARMA_REFERENCES: dict[str, int] = {
        "lesswrong.com": 100,  # 90th percentile from data
        "greaterwrong.com": 100,  # LW mirror
        "alignmentforum.org": 50,  # Smaller community
        "forum.effectivealtruism.org": 75,
    }
    DEFAULT_KARMA_REFERENCE: int = 50

    @property
    def karma_reference(self) -> int:
        """Get the karma reference for this post based on its URL."""
        url = self.url or ""
        for pattern, ref in self.KARMA_REFERENCES.items():
            if pattern in url:
                return ref
        return self.DEFAULT_KARMA_REFERENCE

    @property
    def popularity(self) -> float:
        """
        Return popularity based on karma, normalized to karma_reference.

        - karma <= 0: returns 0.5 to 1.0
        - karma == karma_reference: returns 2.0
        - karma > karma_reference: capped at 2.5
        """
        karma = self.karma or 0
        if karma <= 0:
            # Downvoted or zero karma: scale between 0.5 and 1.0
            return max(0.5, 1.0 - abs(karma) / 100)
        # Positive karma: linear scale up to reference, then cap
        return min(2.5, 1.0 + karma / self.karma_reference)
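
    # Spot-checking the curve (illustrative; assumes a LessWrong URL, so the
    # reference is 100, and a detached instance built with column kwargs):
    #   post = ForumPost(url="https://www.lesswrong.com/posts/x", karma=50)
    #   post.popularity  # 1.5 -- halfway to the reference
    #   # karma -100 -> 0.5 (floor)    karma 0 -> 1.0
    #   # karma 100 -> 2.0 (reference) karma 500 -> 2.5 (cap)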


class MiscDoc(SourceItem):
    __tablename__ = "misc_doc"


@@ -11,10 +11,12 @@ from memory.api.search.search import (
    apply_query_term_boost,
    deduplicate_by_source,
    apply_title_boost,
    apply_popularity_boost,
    fuse_scores_rrf,
    STOPWORDS,
    QUERY_TERM_BOOST,
    TITLE_MATCH_BOOST,
    POPULARITY_BOOST,
    RRF_K,
)

@@ -273,6 +275,76 @@ def test_apply_title_boost_none_title(mock_make_session):
    assert chunks[0].relevance_score == 0.5


# ============================================================================
# apply_popularity_boost tests
# ============================================================================


def _make_pop_chunk(source_id: int, score: float = 0.5):
    """Create a mock chunk for popularity boost tests."""
    chunk = MagicMock()
    chunk.source_id = source_id
    chunk.relevance_score = score
    return chunk


@pytest.mark.parametrize(
    "popularity,initial_score,expected_multiplier",
    [
        (1.0, 0.5, 1.0),  # Default popularity, no change
        (2.0, 0.5, 1.0 + POPULARITY_BOOST),  # High popularity
        (0.5, 0.5, 1.0 - POPULARITY_BOOST * 0.5),  # Low popularity
        (1.5, 1.0, 1.0 + POPULARITY_BOOST * 0.5),  # Moderate popularity
    ],
)
@patch("memory.api.search.search.make_session")
def test_apply_popularity_boost(mock_make_session, popularity, initial_score, expected_multiplier):
    """Should boost chunks based on source popularity."""
    mock_session = MagicMock()
    mock_make_session.return_value.__enter__ = MagicMock(return_value=mock_session)
    mock_make_session.return_value.__exit__ = MagicMock(return_value=None)

    mock_source = MagicMock()
    mock_source.id = 1
    mock_source.popularity = popularity
    mock_session.query.return_value.filter.return_value.all.return_value = [mock_source]

    chunks = [_make_pop_chunk(1, initial_score)]
    apply_popularity_boost(chunks)

    expected = initial_score * expected_multiplier
    assert chunks[0].relevance_score == pytest.approx(expected)


def test_apply_popularity_boost_empty_chunks():
    """Should handle empty chunks list."""
    apply_popularity_boost([])  # Should not raise


@patch("memory.api.search.search.make_session")
def test_apply_popularity_boost_multiple_sources(mock_make_session):
    """Should apply different boosts per source."""
    mock_session = MagicMock()
    mock_make_session.return_value.__enter__ = MagicMock(return_value=mock_session)
    mock_make_session.return_value.__exit__ = MagicMock(return_value=None)

    source1 = MagicMock()
    source1.id = 1
    source1.popularity = 2.0  # High karma
    source2 = MagicMock()
    source2.id = 2
    source2.popularity = 1.0  # Default

    mock_session.query.return_value.filter.return_value.all.return_value = [source1, source2]

    chunks = [_make_pop_chunk(1, 0.5), _make_pop_chunk(2, 0.5)]
    apply_popularity_boost(chunks)

    # Source 1 should be boosted
    assert chunks[0].relevance_score == pytest.approx(0.5 * (1.0 + POPULARITY_BOOST))
    # Source 2 should be unchanged (popularity = 1.0)
    assert chunks[1].relevance_score == 0.5


# ============================================================================
# fuse_scores_rrf tests
# ============================================================================