Add popularity boosting to search based on karma

- Add `popularity` property to SourceItem base class (default 1.0)
- Override in ForumPost with karma-based calculation:
  - Uses KARMA_REFERENCES dict mapping URL patterns to reference values
  - LessWrong: 100 (90th percentile from actual data)
  - Reference karma gives popularity=2.0, caps at 2.5
- Add apply_popularity_boost() to search pipeline
- POPULARITY_BOOST = 0.02 (2% score adjustment per popularity unit; see the worked example below)
- Add comprehensive tests for popularity boost
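
Worked example of the boost math (illustrative numbers, not program output):
  - LessWrong post at reference karma: popularity = min(2.5, 1 + 100/100) = 2.0
  - score multiplier = 1 + 0.02 * (2.0 - 1.0) = 1.02
  - a chunk scored 0.500 becomes 0.510, so popularity breaks ties without overriding relevance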

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
mruwnik 2025-12-20 22:44:06 +00:00
parent 09215adf9a
commit f3d8b6602b
4 changed files with 323 additions and 30 deletions


@@ -20,60 +20,204 @@ if settings.ENABLE_BM25_SEARCH:
if settings.ENABLE_HYDE_EXPANSION:
    from memory.api.search.hyde import expand_query_hyde
if settings.ENABLE_RERANKING:
    from memory.api.search.rerank import rerank_chunks
from memory.api.search.types import SearchConfig, SearchFilters, SearchResult

logger = logging.getLogger(__name__)
# Weight for embedding scores vs BM25 scores in hybrid fusion
# Higher values favor semantic similarity over keyword matching
EMBEDDING_WEIGHT = 0.7
BM25_WEIGHT = 0.3
# Bonus for results that appear in both embedding and BM25 search
# This rewards documents that match both semantically and lexically
HYBRID_BONUS = 0.15
# Reciprocal Rank Fusion constant (k parameter)
# Higher values reduce the influence of top-ranked documents
# 60 is the standard value from the original RRF paper
RRF_K = 60
# Multiplier for internal search limit before fusion
# We search for more candidates than requested, fuse scores, then return top N
# This helps find results that rank well in one method but not the other
CANDIDATE_MULTIPLIER = 5
# How many candidates to pass to reranker (multiplier of final limit)
# Higher = more accurate but slower and more expensive
RERANK_CANDIDATE_MULTIPLIER = 3

# Bonus for chunks containing query terms (added to RRF score)
QUERY_TERM_BOOST = 0.005
# Bonus when query terms match the source title (stronger signal)
TITLE_MATCH_BOOST = 0.01
# Bonus multiplier for popularity (applied as: score * (1 + POPULARITY_BOOST * (popularity - 1)))
# This gives a small boost to popular items without dominating relevance
POPULARITY_BOOST = 0.02
# Common words to ignore when checking for query term presence
STOPWORDS = {
    "a", "an", "the", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "do", "does", "did", "will", "would", "could",
    "should", "may", "might", "must", "shall", "can", "need", "dare",
    "ought", "used", "to", "of", "in", "for", "on", "with", "at", "by",
    "from", "as", "into", "through", "during", "before", "after", "above",
    "below", "between", "under", "again", "further", "then", "once", "here",
    "there", "when", "where", "why", "how", "all", "each", "few", "more",
    "most", "other", "some", "such", "no", "nor", "not", "only", "own",
    "same", "so", "than", "too", "very", "just", "and", "but", "if", "or",
    "because", "until", "while", "although", "though", "after", "before",
    "what", "which", "who", "whom", "this", "that", "these", "those", "i",
    "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your",
    "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she",
    "her", "hers", "herself", "it", "its", "itself", "they", "them", "their",
    "theirs", "themselves", "about", "get", "got", "getting", "like", "also",
}


def extract_query_terms(query: str) -> set[str]:
    """Extract meaningful terms from query, filtering stopwords."""
    words = query.lower().split()
    return {w for w in words if w not in STOPWORDS and len(w) > 2}
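
# Illustrative (not part of this commit's diff): stopwords and tokens of one
# or two characters are dropped, keeping only content-bearing words.
#   extract_query_terms("how does popularity boosting work")
#   -> {"popularity", "boosting", "work"}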


def apply_query_term_boost(
    chunks: list[Chunk],
    query_terms: set[str],
) -> None:
    """
    Boost chunk scores when query terms appear in content.

    This helps surface chunks that contain exact query words even if
    embedding similarity is lower.
    """
    if not query_terms:
        return
    for chunk in chunks:
        content = (chunk.content or "").lower()
        matches = sum(1 for term in query_terms if term in content)
        if matches > 0:
            # Boost proportional to fraction of query terms matched
            boost = QUERY_TERM_BOOST * (matches / len(query_terms))
            chunk.relevance_score = (chunk.relevance_score or 0) + boost
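
# Sketch of the arithmetic on a hypothetical chunk (SimpleNamespace stands in
# for the ORM Chunk, which this function only duck-types):
#   from types import SimpleNamespace
#   chunk = SimpleNamespace(content="Karma boosting for popular posts", relevance_score=0.0163)
#   apply_query_term_boost([chunk], {"karma", "boosting", "decay"})
#   # 2 of 3 terms match: 0.0163 + 0.005 * (2 / 3) ≈ 0.0196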


def deduplicate_by_source(chunks: list[Chunk]) -> list[Chunk]:
    """
    Keep only the highest-scoring chunk per source.

    This prevents multiple chunks from the same article from crowding out
    other potentially relevant sources.
    """
    best_by_source: dict[int, Chunk] = {}
    for chunk in chunks:
        source_id = chunk.source_id
        if source_id not in best_by_source:
            best_by_source[source_id] = chunk
        elif (chunk.relevance_score or 0) > (best_by_source[source_id].relevance_score or 0):
            best_by_source[source_id] = chunk
    return list(best_by_source.values())


def apply_title_boost(
    chunks: list[Chunk],
    query_terms: set[str],
) -> None:
    """
    Boost chunks when query terms match the source title.

    Title matches are a strong signal since titles summarize content.
    """
    if not query_terms or not chunks:
        return

    # Get unique source IDs
    source_ids = list({chunk.source_id for chunk in chunks})

    # Fetch full source items (polymorphic) to access title attribute
    with make_session() as db:
        sources = db.query(SourceItem).filter(
            SourceItem.id.in_(source_ids)
        ).all()
        titles = {s.id: (getattr(s, 'title', None) or "").lower() for s in sources}

    # Apply boost to chunks whose source title matches query terms
    for chunk in chunks:
        title = titles.get(chunk.source_id, "")
        if not title:
            continue
        matches = sum(1 for term in query_terms if term in title)
        if matches > 0:
            boost = TITLE_MATCH_BOOST * (matches / len(query_terms))
            chunk.relevance_score = (chunk.relevance_score or 0) + boost


def apply_popularity_boost(chunks: list[Chunk]) -> None:
    """
    Boost chunks based on source popularity.

    Uses the popularity property from SourceItem subclasses.
    ForumPost uses karma, others default to 1.0.
    """
    if not chunks:
        return

    source_ids = list({chunk.source_id for chunk in chunks})
    with make_session() as db:
        sources = db.query(SourceItem).filter(
            SourceItem.id.in_(source_ids)
        ).all()
        popularity_map = {s.id: s.popularity for s in sources}

    for chunk in chunks:
        popularity = popularity_map.get(chunk.source_id, 1.0)
        if popularity != 1.0:
            # Apply boost: score * (1 + POPULARITY_BOOST * (popularity - 1))
            # For popularity=2.0: multiplier = 1.02
            # For popularity=0.5: multiplier = 0.99
            multiplier = 1.0 + POPULARITY_BOOST * (popularity - 1.0)
            chunk.relevance_score = (chunk.relevance_score or 0) * multiplier


def fuse_scores_rrf(
    embedding_scores: dict[str, float],
    bm25_scores: dict[str, float],
) -> dict[str, float]:
    """
    Fuse embedding and BM25 scores using Reciprocal Rank Fusion (RRF).

    RRF is more robust than weighted score combination because it uses ranks
    rather than raw scores, making it insensitive to score scale differences.

    Formula: score(d) = Σ 1/(k + rank_i(d))

    Args:
        embedding_scores: Dict mapping chunk IDs to embedding similarity scores
        bm25_scores: Dict mapping chunk IDs to BM25 scores

    Returns:
        Dict mapping chunk IDs to RRF scores
    """
    # Convert scores to ranks (1-indexed)
    emb_ranked = sorted(embedding_scores.keys(), key=lambda x: embedding_scores[x], reverse=True)
    bm25_ranked = sorted(bm25_scores.keys(), key=lambda x: bm25_scores[x], reverse=True)
    emb_ranks = {chunk_id: rank + 1 for rank, chunk_id in enumerate(emb_ranked)}
    bm25_ranks = {chunk_id: rank + 1 for rank, chunk_id in enumerate(bm25_ranked)}

    # Compute RRF scores
    all_ids = set(embedding_scores.keys()) | set(bm25_scores.keys())
    fused: dict[str, float] = {}
    for chunk_id in all_ids:
        rrf_score = 0.0
        if chunk_id in emb_ranks:
            rrf_score += 1.0 / (RRF_K + emb_ranks[chunk_id])
        if chunk_id in bm25_ranks:
            rrf_score += 1.0 / (RRF_K + bm25_ranks[chunk_id])
        fused[chunk_id] = rrf_score
    return fused
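
# The rank arithmetic, made concrete (illustrative scores, not from the diff):
#   emb = {"a": 0.91, "b": 0.80}  # embedding ranks: a=1, b=2
#   bm25 = {"b": 7.2, "c": 3.1}   # BM25 ranks: b=1, c=2 (raw scale is irrelevant)
#   fuse_scores_rrf(emb, bm25)
#   # a: 1/61 ≈ 0.0164, c: 1/62 ≈ 0.0161
#   # b: 1/61 + 1/62 ≈ 0.0325 -- appearing in both lists dominates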

@@ -131,15 +275,20 @@ async def search_chunks(
    except asyncio.TimeoutError:
        logger.warning("BM25 search timed out, using embedding results only")

    # Fuse scores from both methods using Reciprocal Rank Fusion
    fused_scores = fuse_scores_rrf(embedding_scores, bm25_scores)

    if not fused_scores:
        return []

    # Sort by score and take top results
    # If reranking is enabled, fetch more candidates for the reranker to work with
    sorted_ids = sorted(fused_scores.keys(), key=lambda x: fused_scores[x], reverse=True)
    if settings.ENABLE_RERANKING:
        fetch_limit = limit * RERANK_CANDIDATE_MULTIPLIER
    else:
        fetch_limit = limit
    top_ids = sorted_ids[:fetch_limit]

    with make_session() as db:
        chunks = (

@@ -161,7 +310,32 @@ async def search_chunks(
            chunk.relevance_score = fused_scores.get(str(chunk.id), 0.0)
        db.expunge_all()

    # Extract query text for boosting and reranking
    query_text = " ".join(
        c for chunk in data for c in chunk.data if isinstance(c, str)
    )

    # Apply query term presence boost and title boost
    if chunks and query_text.strip():
        query_terms = extract_query_terms(query_text)
        apply_query_term_boost(chunks, query_terms)
        apply_title_boost(chunks, query_terms)

    # Apply popularity boost (karma-based for forum posts)
    if chunks:
        apply_popularity_boost(chunks)

    # Rerank using cross-encoder for better precision
    if settings.ENABLE_RERANKING and chunks and query_text.strip():
        try:
            chunks = await rerank_chunks(
                query_text, chunks, model=settings.RERANK_MODEL, top_k=limit
            )
        except Exception as e:
            logger.warning(f"Reranking failed, using RRF order: {e}")

    return chunks
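
# How the boosts compose for a single chunk (hypothetical values):
#   score = 0.0325                                # RRF score from fusion
#   score += QUERY_TERM_BOOST * (2 / 3)           # 2 of 3 query terms in the body
#   score += TITLE_MATCH_BOOST * (1 / 3)          # 1 of 3 query terms in the title
#   score *= 1 + POPULARITY_BOOST * (2.0 - 1.0)   # source popularity 2.0
#   # ≈ 0.0400 before optional cross-encoder reranking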
async def search_sources(


@@ -371,6 +371,16 @@ class SourceItem(Base):
        """Return the list of Qdrant collections this SourceItem type can be stored in."""
        return [cls.__tablename__]

    @property
    def popularity(self) -> float:
        """
        Return a popularity score for this item.

        Default is 1.0. Subclasses can override to provide custom popularity
        metrics (e.g., karma, view count, citations).
        """
        return 1.0

    @property
    def display_contents(self) -> dict | None:
        payload = self.as_payload()


@@ -751,6 +751,43 @@ class ForumPost(SourceItem):
        # Very sad that I didn't keep the names consistent... Qdrant doesn't allow renaming collections
        return ["forum"]

    # Karma reference values for different forum sources.
    # Maps URL substring to karma value representing "very popular" (~90th percentile).
    # Posts at this karma get popularity=2.0; above caps at 2.5.
    # Based on actual LW data: 90th %ile ≈ 100, 95th ≈ 144, 99th ≈ 275
    KARMA_REFERENCES: dict[str, int] = {
        "lesswrong.com": 100,  # 90th percentile from data
        "greaterwrong.com": 100,  # LW mirror
        "alignmentforum.org": 50,  # Smaller community
        "forum.effectivealtruism.org": 75,
    }
    DEFAULT_KARMA_REFERENCE: int = 50

    @property
    def karma_reference(self) -> int:
        """Get the karma reference for this post based on its URL."""
        url = self.url or ""
        for pattern, ref in self.KARMA_REFERENCES.items():
            if pattern in url:
                return ref
        return self.DEFAULT_KARMA_REFERENCE

    @property
    def popularity(self) -> float:
        """
        Return popularity based on karma, normalized to karma_reference.

        - karma <= 0: returns 0.5 to 1.0
        - karma == karma_reference: returns 2.0
        - karma > karma_reference: capped at 2.5
        """
        karma = self.karma or 0
        if karma <= 0:
            # Downvoted or zero karma: scale between 0.5 and 1.0
            return max(0.5, 1.0 - abs(karma) / 100)
        # Positive karma: linear scale up to reference, then cap
        return min(2.5, 1.0 + karma / self.karma_reference)
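
    # Spot-checking the curve (illustrative; assumes a LessWrong URL, so the
    # reference is 100, and a detached instance built with column kwargs):
    #   post = ForumPost(url="https://www.lesswrong.com/posts/x", karma=50)
    #   post.popularity  # 1.5 -- halfway to the reference
    #   # karma -100 -> 0.5 (floor)    karma 0 -> 1.0
    #   # karma 100 -> 2.0 (reference) karma 500 -> 2.5 (cap)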


class MiscDoc(SourceItem):
    __tablename__ = "misc_doc"


@@ -11,10 +11,12 @@ from memory.api.search.search import (
    apply_query_term_boost,
    deduplicate_by_source,
    apply_title_boost,
    apply_popularity_boost,
    fuse_scores_rrf,
    STOPWORDS,
    QUERY_TERM_BOOST,
    TITLE_MATCH_BOOST,
    POPULARITY_BOOST,
    RRF_K,
)

@@ -273,6 +275,76 @@ def test_apply_title_boost_none_title(mock_make_session):
    assert chunks[0].relevance_score == 0.5


# ============================================================================
# apply_popularity_boost tests
# ============================================================================


def _make_pop_chunk(source_id: int, score: float = 0.5):
    """Create a mock chunk for popularity boost tests."""
    chunk = MagicMock()
    chunk.source_id = source_id
    chunk.relevance_score = score
    return chunk


@pytest.mark.parametrize(
    "popularity,initial_score,expected_multiplier",
    [
        (1.0, 0.5, 1.0),  # Default popularity, no change
        (2.0, 0.5, 1.0 + POPULARITY_BOOST),  # High popularity
        (0.5, 0.5, 1.0 - POPULARITY_BOOST * 0.5),  # Low popularity
        (1.5, 1.0, 1.0 + POPULARITY_BOOST * 0.5),  # Moderate popularity
    ],
)
@patch("memory.api.search.search.make_session")
def test_apply_popularity_boost(mock_make_session, popularity, initial_score, expected_multiplier):
    """Should boost chunks based on source popularity."""
    mock_session = MagicMock()
    mock_make_session.return_value.__enter__ = MagicMock(return_value=mock_session)
    mock_make_session.return_value.__exit__ = MagicMock(return_value=None)

    mock_source = MagicMock()
    mock_source.id = 1
    mock_source.popularity = popularity
    mock_session.query.return_value.filter.return_value.all.return_value = [mock_source]

    chunks = [_make_pop_chunk(1, initial_score)]
    apply_popularity_boost(chunks)

    expected = initial_score * expected_multiplier
    assert chunks[0].relevance_score == pytest.approx(expected)


def test_apply_popularity_boost_empty_chunks():
    """Should handle empty chunks list."""
    apply_popularity_boost([])  # Should not raise


@patch("memory.api.search.search.make_session")
def test_apply_popularity_boost_multiple_sources(mock_make_session):
    """Should apply different boosts per source."""
    mock_session = MagicMock()
    mock_make_session.return_value.__enter__ = MagicMock(return_value=mock_session)
    mock_make_session.return_value.__exit__ = MagicMock(return_value=None)

    source1 = MagicMock()
    source1.id = 1
    source1.popularity = 2.0  # High karma
    source2 = MagicMock()
    source2.id = 2
    source2.popularity = 1.0  # Default

    mock_session.query.return_value.filter.return_value.all.return_value = [source1, source2]

    chunks = [_make_pop_chunk(1, 0.5), _make_pop_chunk(2, 0.5)]
    apply_popularity_boost(chunks)

    # Source 1 should be boosted
    assert chunks[0].relevance_score == pytest.approx(0.5 * (1.0 + POPULARITY_BOOST))
    # Source 2 should be unchanged (popularity = 1.0)
    assert chunks[1].relevance_score == 0.5


# ============================================================================
# fuse_scores_rrf tests
# ============================================================================