Add popularity boosting to search based on karma

- Add `popularity` property to SourceItem base class (default 1.0)
- Override in ForumPost with karma-based calculation:
  - Uses KARMA_REFERENCES dict mapping URL patterns to reference values
  - LessWrong: 100 (90th percentile from actual data)
  - Reference karma gives popularity=2.0, caps at 2.5
- Add apply_popularity_boost() to search pipeline
- POPULARITY_BOOST = 0.02 (2% score adjustment per popularity unit)
- Add comprehensive tests for popularity boost

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in: parent 09215adf9a, commit f3d8b6602b
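For orientation before the diff: the popularity value described above reaches the final relevance score through a small multiplier. The sketch below only restates the formula from the commit with illustrative numbers; `boosted()` is a hypothetical helper, not code from the diff (POPULARITY_BOOST and apply_popularity_boost() are).

POPULARITY_BOOST = 0.02  # 2% score adjustment per popularity unit above/below 1.0

def boosted(score: float, popularity: float) -> float:
    # score * (1 + POPULARITY_BOOST * (popularity - 1)), as in apply_popularity_boost()
    return score * (1.0 + POPULARITY_BOOST * (popularity - 1.0))

print(boosted(0.50, 2.0))   # 0.51  -- a reference-karma post gets a 2% bump
print(boosted(0.50, 2.5))   # 0.515 -- the cap keeps even viral posts to +3%
print(boosted(0.50, 0.5))   # 0.495 -- heavily downvoted posts lose 1%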
@@ -20,60 +20,204 @@ if settings.ENABLE_BM25_SEARCH:
 if settings.ENABLE_HYDE_EXPANSION:
     from memory.api.search.hyde import expand_query_hyde

+if settings.ENABLE_RERANKING:
+    from memory.api.search.rerank import rerank_chunks
+
 from memory.api.search.types import SearchConfig, SearchFilters, SearchResult

 logger = logging.getLogger(__name__)

-# Weight for embedding scores vs BM25 scores in hybrid fusion
-# Higher values favor semantic similarity over keyword matching
-EMBEDDING_WEIGHT = 0.7
-BM25_WEIGHT = 0.3
-
-# Bonus for results that appear in both embedding and BM25 search
-# This rewards documents that match both semantically and lexically
-HYBRID_BONUS = 0.15
+# Reciprocal Rank Fusion constant (k parameter)
+# Higher values reduce the influence of top-ranked documents
+# 60 is the standard value from the original RRF paper
+RRF_K = 60

 # Multiplier for internal search limit before fusion
 # We search for more candidates than requested, fuse scores, then return top N
 # This helps find results that rank well in one method but not the other
 CANDIDATE_MULTIPLIER = 5

+# How many candidates to pass to reranker (multiplier of final limit)
+# Higher = more accurate but slower and more expensive
+RERANK_CANDIDATE_MULTIPLIER = 3
+
+# Bonus for chunks containing query terms (added to RRF score)
+QUERY_TERM_BOOST = 0.005
+
+# Bonus when query terms match the source title (stronger signal)
+TITLE_MATCH_BOOST = 0.01
+
+# Bonus multiplier for popularity (applied as: score * (1 + POPULARITY_BOOST * (popularity - 1)))
+# This gives a small boost to popular items without dominating relevance
+POPULARITY_BOOST = 0.02
+
+# Common words to ignore when checking for query term presence
+STOPWORDS = {
+    "a", "an", "the", "is", "are", "was", "were", "be", "been", "being",
+    "have", "has", "had", "do", "does", "did", "will", "would", "could",
+    "should", "may", "might", "must", "shall", "can", "need", "dare",
+    "ought", "used", "to", "of", "in", "for", "on", "with", "at", "by",
+    "from", "as", "into", "through", "during", "before", "after", "above",
+    "below", "between", "under", "again", "further", "then", "once", "here",
+    "there", "when", "where", "why", "how", "all", "each", "few", "more",
+    "most", "other", "some", "such", "no", "nor", "not", "only", "own",
+    "same", "so", "than", "too", "very", "just", "and", "but", "if", "or",
+    "because", "until", "while", "although", "though", "after", "before",
+    "what", "which", "who", "whom", "this", "that", "these", "those", "i",
+    "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your",
+    "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she",
+    "her", "hers", "herself", "it", "its", "itself", "they", "them", "their",
+    "theirs", "themselves", "about", "get", "got", "getting", "like", "also",
+}
+
+
+def extract_query_terms(query: str) -> set[str]:
+    """Extract meaningful terms from query, filtering stopwords."""
+    words = query.lower().split()
+    return {w for w in words if w not in STOPWORDS and len(w) > 2}
+
+
+def apply_query_term_boost(
+    chunks: list[Chunk],
+    query_terms: set[str],
+) -> None:
+    """
+    Boost chunk scores when query terms appear in content.
+
+    This helps surface chunks that contain exact query words even if
+    embedding similarity is lower.
+    """
+    if not query_terms:
+        return
+
+    for chunk in chunks:
+        content = (chunk.content or "").lower()
+        matches = sum(1 for term in query_terms if term in content)
+        if matches > 0:
+            # Boost proportional to fraction of query terms matched
+            boost = QUERY_TERM_BOOST * (matches / len(query_terms))
+            chunk.relevance_score = (chunk.relevance_score or 0) + boost
+
+
+def deduplicate_by_source(chunks: list[Chunk]) -> list[Chunk]:
+    """
+    Keep only the highest-scoring chunk per source.
+
+    This prevents multiple chunks from the same article from crowding out
+    other potentially relevant sources.
+    """
+    best_by_source: dict[int, Chunk] = {}
+    for chunk in chunks:
+        source_id = chunk.source_id
+        if source_id not in best_by_source:
+            best_by_source[source_id] = chunk
+        elif (chunk.relevance_score or 0) > (best_by_source[source_id].relevance_score or 0):
+            best_by_source[source_id] = chunk
+    return list(best_by_source.values())
+
+
+def apply_title_boost(
+    chunks: list[Chunk],
+    query_terms: set[str],
+) -> None:
+    """
+    Boost chunks when query terms match the source title.
+
+    Title matches are a strong signal since titles summarize content.
+    """
+    if not query_terms or not chunks:
+        return
+
+    # Get unique source IDs
+    source_ids = list({chunk.source_id for chunk in chunks})
+
+    # Fetch full source items (polymorphic) to access title attribute
+    with make_session() as db:
+        sources = db.query(SourceItem).filter(
+            SourceItem.id.in_(source_ids)
+        ).all()
+        titles = {s.id: (getattr(s, 'title', None) or "").lower() for s in sources}
+
+    # Apply boost to chunks whose source title matches query terms
+    for chunk in chunks:
+        title = titles.get(chunk.source_id, "")
+        if not title:
+            continue
+
+        matches = sum(1 for term in query_terms if term in title)
+        if matches > 0:
+            boost = TITLE_MATCH_BOOST * (matches / len(query_terms))
+            chunk.relevance_score = (chunk.relevance_score or 0) + boost
+
+
+def apply_popularity_boost(chunks: list[Chunk]) -> None:
+    """
+    Boost chunks based on source popularity.
+
+    Uses the popularity property from SourceItem subclasses.
+    ForumPost uses karma, others default to 1.0.
+    """
+    if not chunks:
+        return
+
+    source_ids = list({chunk.source_id for chunk in chunks})
+
+    with make_session() as db:
+        sources = db.query(SourceItem).filter(
+            SourceItem.id.in_(source_ids)
+        ).all()
+        popularity_map = {s.id: s.popularity for s in sources}
+
+    for chunk in chunks:
+        popularity = popularity_map.get(chunk.source_id, 1.0)
+        if popularity != 1.0:
+            # Apply boost: score * (1 + POPULARITY_BOOST * (popularity - 1))
+            # For popularity=2.0: multiplier = 1.02
+            # For popularity=0.5: multiplier = 0.99
+            multiplier = 1.0 + POPULARITY_BOOST * (popularity - 1.0)
+            chunk.relevance_score = (chunk.relevance_score or 0) * multiplier
+
+
-def fuse_scores(
+def fuse_scores_rrf(
     embedding_scores: dict[str, float],
     bm25_scores: dict[str, float],
 ) -> dict[str, float]:
     """
-    Fuse embedding and BM25 scores using weighted combination with hybrid bonus.
+    Fuse embedding and BM25 scores using Reciprocal Rank Fusion (RRF).

-    Documents appearing in both search results get a bonus, as matching both
-    semantic similarity AND keyword relevance is a strong signal.
+    RRF is more robust than weighted score combination because it uses ranks
+    rather than raw scores, making it insensitive to score scale differences.
+
+    Formula: score(d) = Σ 1/(k + rank_i(d))

     Args:
-        embedding_scores: Dict mapping chunk IDs to embedding similarity scores (0-1)
-        bm25_scores: Dict mapping chunk IDs to normalized BM25 scores (0-1)
+        embedding_scores: Dict mapping chunk IDs to embedding similarity scores
+        bm25_scores: Dict mapping chunk IDs to BM25 scores

     Returns:
-        Dict mapping chunk IDs to fused scores (0-1 range)
+        Dict mapping chunk IDs to RRF scores
     """
+    # Convert scores to ranks (1-indexed)
+    emb_ranked = sorted(embedding_scores.keys(), key=lambda x: embedding_scores[x], reverse=True)
+    bm25_ranked = sorted(bm25_scores.keys(), key=lambda x: bm25_scores[x], reverse=True)
+
+    emb_ranks = {chunk_id: rank + 1 for rank, chunk_id in enumerate(emb_ranked)}
+    bm25_ranks = {chunk_id: rank + 1 for rank, chunk_id in enumerate(bm25_ranked)}
+
+    # Compute RRF scores
     all_ids = set(embedding_scores.keys()) | set(bm25_scores.keys())
     fused: dict[str, float] = {}

     for chunk_id in all_ids:
-        emb_score = embedding_scores.get(chunk_id, 0.0)
-        bm25_score = bm25_scores.get(chunk_id, 0.0)
-
-        # Check if result appears in both methods
-        in_both = chunk_id in embedding_scores and chunk_id in bm25_scores
-
-        # Weighted combination
-        combined = (EMBEDDING_WEIGHT * emb_score) + (BM25_WEIGHT * bm25_score)
-
-        # Add bonus for appearing in both (strong relevance signal)
-        if in_both:
-            combined = min(1.0, combined + HYBRID_BONUS)
-
-        fused[chunk_id] = combined
+        rrf_score = 0.0
+
+        if chunk_id in emb_ranks:
+            rrf_score += 1.0 / (RRF_K + emb_ranks[chunk_id])
+
+        if chunk_id in bm25_ranks:
+            rrf_score += 1.0 / (RRF_K + bm25_ranks[chunk_id])
+
+        fused[chunk_id] = rrf_score

     return fused
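For intuition, here is a toy invocation of the new fuse_scores_rrf with made-up chunk IDs and scores (not part of the diff); it shows why rank-based fusion tolerates the very different scales of cosine similarities and BM25 scores.

embedding_scores = {"a": 0.92, "b": 0.85, "c": 0.40}
bm25_scores = {"b": 7.3, "d": 5.1}

fused = fuse_scores_rrf(embedding_scores, bm25_scores)
# "a": 1/(60+1)            ≈ 0.0164  (embedding rank 1 only)
# "b": 1/(60+2) + 1/(60+1) ≈ 0.0325  (embedding rank 2 + BM25 rank 1 -> highest)
# "c": 1/(60+3)            ≈ 0.0159
# "d": 1/(60+2)            ≈ 0.0161
# "b" wins because it ranks near the top in both methods, even though the raw
# BM25 scores live on a completely different scale from the similarities.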
@@ -131,15 +275,20 @@ async def search_chunks(
     except asyncio.TimeoutError:
         logger.warning("BM25 search timed out, using embedding results only")

-    # Fuse scores from both methods
-    fused_scores = fuse_scores(embedding_scores, bm25_scores)
+    # Fuse scores from both methods using Reciprocal Rank Fusion
+    fused_scores = fuse_scores_rrf(embedding_scores, bm25_scores)

     if not fused_scores:
         return []

     # Sort by score and take top results
+    # If reranking is enabled, fetch more candidates for the reranker to work with
     sorted_ids = sorted(fused_scores.keys(), key=lambda x: fused_scores[x], reverse=True)
-    top_ids = sorted_ids[:limit]
+    if settings.ENABLE_RERANKING:
+        fetch_limit = limit * RERANK_CANDIDATE_MULTIPLIER
+    else:
+        fetch_limit = limit
+    top_ids = sorted_ids[:fetch_limit]

     with make_session() as db:
         chunks = (
@@ -161,6 +310,31 @@ async def search_chunks(
             chunk.relevance_score = fused_scores.get(str(chunk.id), 0.0)

         db.expunge_all()

+    # Extract query text for boosting and reranking
+    query_text = " ".join(
+        c for chunk in data for c in chunk.data if isinstance(c, str)
+    )
+
+    # Apply query term presence boost and title boost
+    if chunks and query_text.strip():
+        query_terms = extract_query_terms(query_text)
+        apply_query_term_boost(chunks, query_terms)
+        apply_title_boost(chunks, query_terms)
+
+    # Apply popularity boost (karma-based for forum posts)
+    if chunks:
+        apply_popularity_boost(chunks)
+
+    # Rerank using cross-encoder for better precision
+    if settings.ENABLE_RERANKING and chunks and query_text.strip():
+        try:
+            chunks = await rerank_chunks(
+                query_text, chunks, model=settings.RERANK_MODEL, top_k=limit
+            )
+        except Exception as e:
+            logger.warning(f"Reranking failed, using RRF order: {e}")
+
     return chunks
@@ -371,6 +371,16 @@ class SourceItem(Base):
         """Return the list of Qdrant collections this SourceItem type can be stored in."""
         return [cls.__tablename__]

+    @property
+    def popularity(self) -> float:
+        """
+        Return a popularity score for this item.
+
+        Default is 1.0. Subclasses can override to provide custom popularity
+        metrics (e.g., karma, view count, citations).
+        """
+        return 1.0
+
     @property
     def display_contents(self) -> dict | None:
         payload = self.as_payload()
@@ -751,6 +751,43 @@ class ForumPost(SourceItem):
         # Very sad that I didn't keep the names consistent... Qdrant doesn't allow renaming collections
         return ["forum"]

+    # Karma reference values for different forum sources.
+    # Maps URL substring to karma value representing "very popular" (~90th percentile).
+    # Posts at this karma get popularity=2.0; above caps at 2.5.
+    # Based on actual LW data: 90th %ile ≈ 100, 95th ≈ 144, 99th ≈ 275
+    KARMA_REFERENCES: dict[str, int] = {
+        "lesswrong.com": 100,  # 90th percentile from data
+        "greaterwrong.com": 100,  # LW mirror
+        "alignmentforum.org": 50,  # Smaller community
+        "forum.effectivealtruism.org": 75,
+    }
+    DEFAULT_KARMA_REFERENCE: int = 50
+
+    @property
+    def karma_reference(self) -> int:
+        """Get the karma reference for this post based on its URL."""
+        url = self.url or ""
+        for pattern, ref in self.KARMA_REFERENCES.items():
+            if pattern in url:
+                return ref
+        return self.DEFAULT_KARMA_REFERENCE
+
+    @property
+    def popularity(self) -> float:
+        """
+        Return popularity based on karma, normalized to karma_reference.
+
+        - karma <= 0: returns 0.5 to 1.0
+        - karma = karma_reference: returns 2.0
+        - karma > karma_reference: capped at 2.5
+        """
+        karma = self.karma or 0
+        if karma <= 0:
+            # Downvoted or zero karma: scale between 0.5 and 1.0
+            return max(0.5, 1.0 - abs(karma) / 100)
+        # Positive karma: linear scale up to reference, then cap
+        return min(2.5, 1.0 + karma / self.karma_reference)
+
+
 class MiscDoc(SourceItem):
     __tablename__ = "misc_doc"
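To make the karma-to-popularity mapping above concrete, a few illustrative values for a lesswrong.com post (karma_reference = 100), computed with the same formulas; the karma numbers are made up for the example.

# karma = -80 -> max(0.5, 1.0 - 80/100)  = 0.5   (floor for heavily downvoted posts)
# karma =   0 -> max(0.5, 1.0 - 0/100)   = 1.0   (neutral)
# karma =  50 -> min(2.5, 1.0 + 50/100)  = 1.5
# karma = 100 -> min(2.5, 1.0 + 100/100) = 2.0   (reference karma)
# karma = 300 -> min(2.5, 1.0 + 300/100) = 2.5   (capped)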
@@ -11,10 +11,12 @@ from memory.api.search.search import (
     apply_query_term_boost,
     deduplicate_by_source,
     apply_title_boost,
+    apply_popularity_boost,
     fuse_scores_rrf,
     STOPWORDS,
     QUERY_TERM_BOOST,
     TITLE_MATCH_BOOST,
+    POPULARITY_BOOST,
     RRF_K,
 )
@@ -273,6 +275,76 @@ def test_apply_title_boost_none_title(mock_make_session):
     assert chunks[0].relevance_score == 0.5


+# ============================================================================
+# apply_popularity_boost tests
+# ============================================================================
+
+
+def _make_pop_chunk(source_id: int, score: float = 0.5):
+    """Create a mock chunk for popularity boost tests."""
+    chunk = MagicMock()
+    chunk.source_id = source_id
+    chunk.relevance_score = score
+    return chunk
+
+
+@pytest.mark.parametrize(
+    "popularity,initial_score,expected_multiplier",
+    [
+        (1.0, 0.5, 1.0),  # Default popularity, no change
+        (2.0, 0.5, 1.0 + POPULARITY_BOOST),  # High popularity
+        (0.5, 0.5, 1.0 - POPULARITY_BOOST * 0.5),  # Low popularity
+        (1.5, 1.0, 1.0 + POPULARITY_BOOST * 0.5),  # Moderate popularity
+    ],
+)
+@patch("memory.api.search.search.make_session")
+def test_apply_popularity_boost(mock_make_session, popularity, initial_score, expected_multiplier):
+    """Should boost chunks based on source popularity."""
+    mock_session = MagicMock()
+    mock_make_session.return_value.__enter__ = MagicMock(return_value=mock_session)
+    mock_make_session.return_value.__exit__ = MagicMock(return_value=None)
+
+    mock_source = MagicMock()
+    mock_source.id = 1
+    mock_source.popularity = popularity
+    mock_session.query.return_value.filter.return_value.all.return_value = [mock_source]
+
+    chunks = [_make_pop_chunk(1, initial_score)]
+    apply_popularity_boost(chunks)
+
+    expected = initial_score * expected_multiplier
+    assert chunks[0].relevance_score == pytest.approx(expected)
+
+
+def test_apply_popularity_boost_empty_chunks():
+    """Should handle empty chunks list."""
+    apply_popularity_boost([])  # Should not raise
+
+
+@patch("memory.api.search.search.make_session")
+def test_apply_popularity_boost_multiple_sources(mock_make_session):
+    """Should apply different boosts per source."""
+    mock_session = MagicMock()
+    mock_make_session.return_value.__enter__ = MagicMock(return_value=mock_session)
+    mock_make_session.return_value.__exit__ = MagicMock(return_value=None)
+
+    source1 = MagicMock()
+    source1.id = 1
+    source1.popularity = 2.0  # High karma
+    source2 = MagicMock()
+    source2.id = 2
+    source2.popularity = 1.0  # Default
+    mock_session.query.return_value.filter.return_value.all.return_value = [source1, source2]
+
+    chunks = [_make_pop_chunk(1, 0.5), _make_pop_chunk(2, 0.5)]
+    apply_popularity_boost(chunks)
+
+    # Source 1 should be boosted
+    assert chunks[0].relevance_score == pytest.approx(0.5 * (1.0 + POPULARITY_BOOST))
+    # Source 2 should be unchanged (popularity = 1.0)
+    assert chunks[1].relevance_score == 0.5
+
+
 # ============================================================================
 # fuse_scores_rrf tests
 # ============================================================================