# memory/tests/memory/api/search/test_search.py
"""
Tests for search module functions including RRF fusion, query term boosting,
title boosting, and source deduplication.
"""
from datetime import datetime, timedelta, timezone
from unittest.mock import MagicMock, patch

import pytest

from memory.api.search.search import (
extract_query_terms,
apply_query_term_boost,
deduplicate_by_source,
apply_source_boosts,
fuse_scores_rrf,
)
from memory.api.search.constants import (
STOPWORDS,
QUERY_TERM_BOOST,
TITLE_MATCH_BOOST,
POPULARITY_BOOST,
RECENCY_BOOST_MAX,
RECENCY_HALF_LIFE_DAYS,
RRF_K,
)


# ============================================================================
# extract_query_terms tests
# ============================================================================

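# For reference, a minimal sketch of the behaviour these tests pin down.
# This is an assumption inferred from the assertions below, NOT the actual
# implementation in memory.api.search.search:
def _extract_query_terms_sketch(query: str) -> set[str]:
    import re

    words = re.findall(r"[a-z0-9]+", query.lower())
    # Keep words of 2+ characters (so "ai"/"ml" survive) that aren't stopwords.
    return {word for word in words if len(word) >= 2 and word not in STOPWORDS}

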
@pytest.mark.parametrize(
"query,expected",
[
("machine learning algorithms", {"machine", "learning", "algorithms"}),
("MACHINE Learning ALGORITHMS", {"machine", "learning", "algorithms"}),
("", set()),
("the is a an of to", set()), # Only stopwords
],
)
def test_extract_query_terms_basic(query, expected):
"""Should extract meaningful terms, lowercase them, and filter stopwords."""
    assert extract_query_terms(query) == expected


@pytest.mark.parametrize(
"query,must_include,must_exclude",
[
(
"the quick brown fox jumps with the lazy dog",
{"quick", "brown", "jumps", "lazy", "fox", "dog"},
{"the", "with"},
),
(
"what is the best approach for neural networks",
{"best", "approach", "neural", "networks"},
{"what", "the", "for"},
),
],
)
def test_extract_query_terms_filtering(query, must_include, must_exclude):
"""Should filter stopwords while keeping meaningful terms."""
terms = extract_query_terms(query)
for term in must_include:
assert term in terms, f"'{term}' should be in terms"
for term in must_exclude:
        assert term not in terms, f"'{term}' should not be in terms"


@pytest.mark.parametrize(
"query,included,excluded",
[
# 2-letter terms like "ai", "ml" should be INCLUDED (important acronyms)
# 1-letter words like "a" and stopwords like "is" should be excluded
("AI is a new ML model", {"ai", "ml", "new", "model"}, {"is", "a"}),
],
)
def test_extract_query_terms_short_words(query, included, excluded):
"""Should include 2-letter words but filter 1-letter words and stopwords."""
terms = extract_query_terms(query)
for term in included:
assert term in terms, f"'{term}' should be in terms"
for term in excluded:
        assert term not in terms, f"'{term}' should not be in terms"


@pytest.mark.parametrize(
"word",
["the", "is", "are", "was", "were", "be", "been", "have", "has", "had",
"do", "does", "did", "to", "of", "in", "for", "on", "with", "at", "by"],
)
def test_common_stopwords_in_set(word):
"""Verify common stopwords are in the STOPWORDS set."""
assert word in STOPWORDS


# ============================================================================
# apply_query_term_boost tests
# ============================================================================

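# A minimal sketch of the boost rule the tests below encode (an assumption,
# not the real implementation): each chunk gains QUERY_TERM_BOOST scaled by
# the fraction of query terms found in its content, tolerating None values.
def _apply_query_term_boost_sketch(chunks, query_terms):
    if not chunks or not query_terms:
        return
    for chunk in chunks:
        content = (chunk.content or "").lower()
        matched = sum(1 for term in query_terms if term in content)
        boost = QUERY_TERM_BOOST * matched / len(query_terms)
        chunk.relevance_score = (chunk.relevance_score or 0.0) + boost

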
def _make_chunk(content: str, source_id: int = 1, score: float = 0.5):
"""Create a mock chunk with given content and score."""
chunk = MagicMock()
chunk.content = content
chunk.source_id = source_id
chunk.relevance_score = score
    return chunk


@pytest.mark.parametrize(
"content,query_terms,initial_score,expected_boost_fraction",
[
("machine learning is powerful", {"machine", "learning"}, 0.5, 1.0), # Both match
("machine vision systems", {"machine", "learning"}, 0.5, 0.5), # One of two
("deep neural networks", {"machine", "learning"}, 0.5, 0.0), # No match
("MACHINE Learning AlGoRiThMs", {"machine", "learning", "algorithms"}, 0.5, 1.0), # Case insensitive
],
)
def test_apply_query_term_boost(
    content, query_terms, initial_score, expected_boost_fraction
):
"""Should boost chunks based on query term matches."""
chunks = [_make_chunk(content, score=initial_score)]
apply_query_term_boost(chunks, query_terms)
expected = initial_score + QUERY_TERM_BOOST * expected_boost_fraction
    assert chunks[0].relevance_score == pytest.approx(expected)


def test_apply_query_term_boost_empty_inputs():
"""Should handle empty query_terms or chunks."""
chunks = [_make_chunk("machine learning", score=0.5)]
apply_query_term_boost(chunks, set())
assert chunks[0].relevance_score == 0.5
    apply_query_term_boost([], {"machine"})  # Should not raise


def test_apply_query_term_boost_none_values():
"""Should handle None content and relevance_score."""
chunk_none_content = MagicMock()
chunk_none_content.content = None
chunk_none_content.relevance_score = 0.5
apply_query_term_boost([chunk_none_content], {"machine"})
assert chunk_none_content.relevance_score == 0.5
chunk_none_score = MagicMock()
chunk_none_score.content = "machine learning"
chunk_none_score.relevance_score = None
apply_query_term_boost([chunk_none_score], {"machine", "learning"})
    assert chunk_none_score.relevance_score == pytest.approx(QUERY_TERM_BOOST)


def test_apply_query_term_boost_multiple_chunks():
"""Should boost each chunk independently."""
chunks = [
_make_chunk("machine learning", score=0.5),
_make_chunk("deep networks", score=0.6),
_make_chunk("machine vision", score=0.4),
]
query_terms = {"machine", "learning"}
apply_query_term_boost(chunks, query_terms)
assert chunks[0].relevance_score == pytest.approx(0.5 + QUERY_TERM_BOOST)
assert chunks[1].relevance_score == 0.6 # No match
assert chunks[2].relevance_score == pytest.approx(0.4 + QUERY_TERM_BOOST * 0.5)


# ============================================================================
# deduplicate_by_source tests
# ============================================================================

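# Sketch of the dedup rule under test (an assumption inferred from the tests):
# keep, per source_id, the chunk object with the highest relevance_score,
# treating a None score as 0.
def _deduplicate_by_source_sketch(chunks):
    best: dict = {}
    for chunk in chunks:
        kept = best.get(chunk.source_id)
        if kept is None or (chunk.relevance_score or 0.0) > (kept.relevance_score or 0.0):
            best[chunk.source_id] = chunk
    return list(best.values())

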
def _make_source_chunk(source_id: int, score: float):
"""Create a mock chunk with given source_id and score."""
chunk = MagicMock()
chunk.source_id = source_id
chunk.relevance_score = score
    return chunk


@pytest.mark.parametrize(
"chunks_data,expected_count,expected_scores",
[
# Multiple chunks per source - keep highest
([(1, 0.5), (1, 0.8), (1, 0.3), (2, 0.6)], 2, {1: 0.8, 2: 0.6}),
# Single chunk per source - keep all
([(1, 0.5), (2, 0.6), (3, 0.7)], 3, {1: 0.5, 2: 0.6, 3: 0.7}),
# Empty list
([], 0, {}),
],
)
def test_deduplicate_by_source(chunks_data, expected_count, expected_scores):
"""Should keep only highest scoring chunk per source."""
chunks = [_make_source_chunk(sid, score) for sid, score in chunks_data]
result = deduplicate_by_source(chunks)
assert len(result) == expected_count
for chunk in result:
        assert chunk.relevance_score == expected_scores[chunk.source_id]


def test_deduplicate_by_source_preserves_objects():
"""Should return the actual chunk objects, not copies."""
chunk1 = _make_source_chunk(1, 0.5)
chunk2 = _make_source_chunk(1, 0.8)
result = deduplicate_by_source([chunk1, chunk2])
    assert result[0] is chunk2


def test_deduplicate_by_source_none_scores():
"""Should handle None relevance_score as 0."""
chunk1 = _make_source_chunk(1, None)
chunk2 = _make_source_chunk(1, 0.5)
result = deduplicate_by_source([chunk1, chunk2])
assert result[0].relevance_score == 0.5


# ============================================================================
# apply_source_boosts tests (title + popularity + recency)
# ============================================================================

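# Sketch of the combined per-source boost these tests exercise. Assumptions
# inferred from the assertions below; the relative ordering of the three
# boosts is not fully pinned down by the tests, and the real function loads
# sources in bulk via make_session rather than taking one as an argument:
def _apply_source_boost_sketch(chunk, source, query_terms):
    score = chunk.relevance_score or 0.0
    # Title boost: additive, scaled by the fraction of matching query terms.
    if query_terms and source.title:
        title = source.title.lower()
        matched = sum(1 for term in query_terms if term in title)
        score += TITLE_MATCH_BOOST * matched / len(query_terms)
    # Popularity boost: multiplicative around the neutral popularity of 1.0.
    score *= 1.0 + POPULARITY_BOOST * ((source.popularity or 1.0) - 1.0)
    # Recency boost: additive, exponential half-life decay with age.
    if source.inserted_at is not None:
        inserted = source.inserted_at
        if inserted.tzinfo is None:
            inserted = inserted.replace(tzinfo=timezone.utc)  # assumed UTC
        age_days = (datetime.now(timezone.utc) - inserted).total_seconds() / 86400
        score += RECENCY_BOOST_MAX * 0.5 ** (age_days / RECENCY_HALF_LIFE_DAYS)
    chunk.relevance_score = score

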
def _make_boost_chunk(source_id: int, score: float = 0.5):
"""Create a mock chunk for boost tests."""
chunk = MagicMock()
chunk.source_id = source_id
chunk.relevance_score = score
    return chunk


@pytest.mark.parametrize(
"title,query_terms,initial_score,expected_boost_fraction",
[
("Machine Learning Tutorial", {"machine", "learning"}, 0.5, 1.0),
("Machine Vision Systems", {"machine", "learning"}, 0.5, 0.5),
("Deep Neural Networks", {"machine", "learning"}, 0.5, 0.0),
("MACHINE LEARNING Tutorial", {"machine", "learning"}, 0.5, 1.0), # Case insensitive
],
)
@patch("memory.api.search.search.make_session")
def test_apply_source_boosts_title(
    mock_make_session, title, query_terms, initial_score, expected_boost_fraction
):
"""Should boost chunks when title matches query terms."""
mock_session = MagicMock()
mock_make_session.return_value.__enter__ = MagicMock(return_value=mock_session)
mock_make_session.return_value.__exit__ = MagicMock(return_value=None)
mock_source = MagicMock()
mock_source.id = 1
mock_source.title = title
mock_source.popularity = 1.0 # Default popularity, no boost
mock_source.inserted_at = None # No recency boost
mock_session.query.return_value.filter.return_value.all.return_value = [mock_source]
chunks = [_make_boost_chunk(1, initial_score)]
apply_source_boosts(chunks, query_terms)
expected = initial_score + TITLE_MATCH_BOOST * expected_boost_fraction
assert chunks[0].relevance_score == pytest.approx(expected)


@patch("memory.api.search.search.make_session")
def test_apply_source_boosts_empty_inputs(mock_make_session):
    """Should leave scores unchanged when no sources are found, and not
    raise on empty chunks."""
    mock_session = MagicMock()
    mock_make_session.return_value.__enter__ = MagicMock(return_value=mock_session)
    mock_make_session.return_value.__exit__ = MagicMock(return_value=None)
    mock_session.query.return_value.filter.return_value.all.return_value = []
    chunks = [_make_boost_chunk(1, 0.5)]
    apply_source_boosts(chunks, set())
    assert chunks[0].relevance_score == 0.5
    apply_source_boosts([], {"machine"})  # Should not raise


@patch("memory.api.search.search.make_session")
def test_apply_source_boosts_none_title(mock_make_session):
"""Should handle sources with None or missing title."""
mock_session = MagicMock()
mock_make_session.return_value.__enter__ = MagicMock(return_value=mock_session)
mock_make_session.return_value.__exit__ = MagicMock(return_value=None)
# Source with None title
mock_source = MagicMock()
mock_source.id = 1
mock_source.title = None
mock_source.popularity = 1.0 # Default popularity, no boost
mock_source.inserted_at = None # No recency boost
mock_session.query.return_value.filter.return_value.all.return_value = [mock_source]
chunks = [_make_boost_chunk(1, 0.5)]
apply_source_boosts(chunks, {"machine"})
    assert chunks[0].relevance_score == 0.5


@pytest.mark.parametrize(
"popularity,initial_score,expected_multiplier",
[
(1.0, 0.5, 1.0), # Default popularity, no change
(2.0, 0.5, 1.0 + POPULARITY_BOOST), # High popularity
(0.5, 0.5, 1.0 - POPULARITY_BOOST * 0.5), # Low popularity
(1.5, 1.0, 1.0 + POPULARITY_BOOST * 0.5), # Moderate popularity
],
)
@patch("memory.api.search.search.make_session")
def test_apply_source_boosts_popularity(
    mock_make_session, popularity, initial_score, expected_multiplier
):
"""Should boost chunks based on source popularity."""
mock_session = MagicMock()
mock_make_session.return_value.__enter__ = MagicMock(return_value=mock_session)
mock_make_session.return_value.__exit__ = MagicMock(return_value=None)
mock_source = MagicMock()
mock_source.id = 1
mock_source.popularity = popularity
mock_source.inserted_at = None # No recency boost
mock_session.query.return_value.filter.return_value.all.return_value = [mock_source]
chunks = [_make_boost_chunk(1, initial_score)]
apply_source_boosts(chunks, set()) # No query terms, just popularity
expected = initial_score * expected_multiplier
    assert chunks[0].relevance_score == pytest.approx(expected)


def test_apply_source_boosts_empty_chunks():
"""Should handle empty chunks list."""
    apply_source_boosts([], set())  # Should not raise


@patch("memory.api.search.search.make_session")
def test_apply_source_boosts_multiple_sources(mock_make_session):
"""Should apply different boosts per source."""
mock_session = MagicMock()
mock_make_session.return_value.__enter__ = MagicMock(return_value=mock_session)
mock_make_session.return_value.__exit__ = MagicMock(return_value=None)
source1 = MagicMock()
source1.id = 1
source1.popularity = 2.0 # High karma
source1.inserted_at = None # No recency boost
source2 = MagicMock()
source2.id = 2
source2.popularity = 1.0 # Default
source2.inserted_at = None # No recency boost
mock_session.query.return_value.filter.return_value.all.return_value = [source1, source2]
chunks = [_make_boost_chunk(1, 0.5), _make_boost_chunk(2, 0.5)]
apply_source_boosts(chunks, set())
# Source 1 should be boosted
assert chunks[0].relevance_score == pytest.approx(0.5 * (1.0 + POPULARITY_BOOST))
# Source 2 should be unchanged (popularity = 1.0)
assert chunks[1].relevance_score == 0.5


# ============================================================================
# fuse_scores_rrf tests
# ============================================================================

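# Sketch of reciprocal rank fusion as these tests pin it down (an assumption):
# each ranker contributes 1 / (RRF_K + rank) for every item it returns, with
# 1-based ranks from sorting scores descending; contributions are summed, so
# items missing from one ranker simply get no contribution from it.
def _fuse_scores_rrf_sketch(embedding_scores, bm25_scores, k=RRF_K):
    fused: dict = {}
    for scores in (embedding_scores, bm25_scores):
        ranked = sorted(scores, key=scores.__getitem__, reverse=True)
        for rank, key in enumerate(ranked, start=1):
            fused[key] = fused.get(key, 0.0) + 1.0 / (k + rank)
    return fused

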
@pytest.mark.parametrize(
"embedding_scores,bm25_scores,expected_key,expected_score",
[
# Both sources have same ranking
({"a": 0.9, "b": 0.7}, {"a": 0.8, "b": 0.6}, "a", 2 / (RRF_K + 1)),
# Item only in embeddings
({"a": 0.9, "b": 0.7}, {"a": 0.8}, "b", 1 / (RRF_K + 2)),
# Item only in BM25
({"a": 0.9}, {"a": 0.8, "b": 0.7}, "b", 1 / (RRF_K + 2)),
# Single item in both
({"a": 0.9}, {"a": 0.8}, "a", 2 / (RRF_K + 1)),
],
)
def test_fuse_scores_rrf_basic(embedding_scores, bm25_scores, expected_key, expected_score):
"""Should compute RRF scores correctly."""
result = fuse_scores_rrf(embedding_scores, bm25_scores)
    assert result[expected_key] == pytest.approx(expected_score)


def test_fuse_scores_rrf_different_rankings():
"""Should handle items ranked differently in each source."""
embedding_scores = {"a": 0.9, "b": 0.5} # a=1, b=2
bm25_scores = {"a": 0.3, "b": 0.8} # b=1, a=2
result = fuse_scores_rrf(embedding_scores, bm25_scores)
    # Both should end up with the same RRF score: 1 / (RRF_K + 1) + 1 / (RRF_K + 2)
    expected = 1 / (RRF_K + 1) + 1 / (RRF_K + 2)
assert result["a"] == pytest.approx(expected)
assert result["b"] == pytest.approx(expected)
@pytest.mark.parametrize(
"embedding_scores,bm25_scores,expected_len",
[
({}, {}, 0),
({}, {"a": 0.8, "b": 0.6}, 2),
({"a": 0.9, "b": 0.7}, {}, 2),
],
)
def test_fuse_scores_rrf_empty_inputs(embedding_scores, bm25_scores, expected_len):
"""Should handle empty inputs gracefully."""
result = fuse_scores_rrf(embedding_scores, bm25_scores)
    assert len(result) == expected_len


def test_fuse_scores_rrf_many_items():
"""Should handle many items correctly."""
embedding_scores = {str(i): 1.0 - i * 0.01 for i in range(100)}
bm25_scores = {str(i): 1.0 - i * 0.01 for i in range(100)}
result = fuse_scores_rrf(embedding_scores, bm25_scores)
assert len(result) == 100
assert result["0"] > result["99"] # First should have highest score
def test_fuse_scores_rrf_only_ranks_matter():
"""RRF should only care about ranks, not score magnitudes."""
# Same ranking, different score scales
result1 = fuse_scores_rrf(
{"a": 0.99, "b": 0.98, "c": 0.97},
{"a": 100, "b": 50, "c": 1},
)
result2 = fuse_scores_rrf(
{"a": 0.5, "b": 0.4, "c": 0.3},
{"a": 0.9, "b": 0.8, "c": 0.7},
)
# RRF scores should be identical since rankings are the same
assert result1["a"] == pytest.approx(result2["a"])
assert result1["b"] == pytest.approx(result2["b"])
assert result1["c"] == pytest.approx(result2["c"])


# ============================================================================
# apply_source_boosts recency tests
# ============================================================================

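# The decay curve the tests below rely on (half-life form, an assumption
# consistent with the assertions): age 0 gives the full RECENCY_BOOST_MAX,
# age == RECENCY_HALF_LIFE_DAYS gives half of it, and each further half-life
# halves it again, so year-old content retains only a small fraction.
def _recency_boost_sketch(age_days: float) -> float:
    return RECENCY_BOOST_MAX * 0.5 ** (age_days / RECENCY_HALF_LIFE_DAYS)

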
def _make_recency_chunk(source_id: int, score: float = 0.5):
"""Create a mock chunk for recency boost tests."""
chunk = MagicMock()
chunk.source_id = source_id
chunk.relevance_score = score
    return chunk


@patch("memory.api.search.search.make_session")
def test_recency_boost_new_content(mock_make_session):
"""Brand new content should get full recency boost."""
mock_session = MagicMock()
mock_make_session.return_value.__enter__ = MagicMock(return_value=mock_session)
mock_make_session.return_value.__exit__ = MagicMock(return_value=None)
now = datetime.now(timezone.utc)
mock_source = MagicMock()
mock_source.id = 1
mock_source.title = None
mock_source.popularity = 1.0
mock_source.inserted_at = now # Just inserted
mock_session.query.return_value.filter.return_value.all.return_value = [mock_source]
chunks = [_make_recency_chunk(1, 0.5)]
apply_source_boosts(chunks, set())
# Should get nearly full recency boost
expected = 0.5 + RECENCY_BOOST_MAX
    assert chunks[0].relevance_score == pytest.approx(expected, rel=0.01)


@patch("memory.api.search.search.make_session")
def test_recency_boost_half_life_decay(mock_make_session):
"""Content at half-life age should get half the boost."""
mock_session = MagicMock()
mock_make_session.return_value.__enter__ = MagicMock(return_value=mock_session)
mock_make_session.return_value.__exit__ = MagicMock(return_value=None)
now = datetime.now(timezone.utc)
mock_source = MagicMock()
mock_source.id = 1
mock_source.title = None
mock_source.popularity = 1.0
mock_source.inserted_at = now - timedelta(days=RECENCY_HALF_LIFE_DAYS)
mock_session.query.return_value.filter.return_value.all.return_value = [mock_source]
chunks = [_make_recency_chunk(1, 0.5)]
apply_source_boosts(chunks, set())
# Should get half the recency boost
expected = 0.5 + RECENCY_BOOST_MAX * 0.5
    assert chunks[0].relevance_score == pytest.approx(expected, rel=0.01)


@patch("memory.api.search.search.make_session")
def test_recency_boost_old_content(mock_make_session):
"""Very old content should get minimal recency boost."""
mock_session = MagicMock()
mock_make_session.return_value.__enter__ = MagicMock(return_value=mock_session)
mock_make_session.return_value.__exit__ = MagicMock(return_value=None)
now = datetime.now(timezone.utc)
mock_source = MagicMock()
mock_source.id = 1
mock_source.title = None
mock_source.popularity = 1.0
mock_source.inserted_at = now - timedelta(days=365) # 1 year old
mock_session.query.return_value.filter.return_value.all.return_value = [mock_source]
chunks = [_make_recency_chunk(1, 0.5)]
apply_source_boosts(chunks, set())
    # After roughly 365 / RECENCY_HALF_LIFE_DAYS half-lives of decay (about
    # 0.5 ** 4 ≈ 0.06 of the max for a ~90-day half-life), only a sliver of
    # the boost should remain.
    assert chunks[0].relevance_score > 0.5
    assert chunks[0].relevance_score < 0.5 + RECENCY_BOOST_MAX * 0.1


@patch("memory.api.search.search.make_session")
def test_recency_boost_none_timestamp(mock_make_session):
"""Should handle None inserted_at gracefully."""
mock_session = MagicMock()
mock_make_session.return_value.__enter__ = MagicMock(return_value=mock_session)
mock_make_session.return_value.__exit__ = MagicMock(return_value=None)
mock_source = MagicMock()
mock_source.id = 1
mock_source.title = None
mock_source.popularity = 1.0
mock_source.inserted_at = None
mock_session.query.return_value.filter.return_value.all.return_value = [mock_source]
chunks = [_make_recency_chunk(1, 0.5)]
apply_source_boosts(chunks, set())
# No recency boost applied
    assert chunks[0].relevance_score == 0.5


@patch("memory.api.search.search.make_session")
def test_recency_boost_timezone_naive(mock_make_session):
"""Should handle timezone-naive timestamps."""
mock_session = MagicMock()
mock_make_session.return_value.__enter__ = MagicMock(return_value=mock_session)
mock_make_session.return_value.__exit__ = MagicMock(return_value=None)
# Timezone-naive timestamp
    naive_dt = datetime.now()  # datetime.now() without a tz is already naive
mock_source = MagicMock()
mock_source.id = 1
mock_source.title = None
mock_source.popularity = 1.0
mock_source.inserted_at = naive_dt
mock_session.query.return_value.filter.return_value.all.return_value = [mock_source]
chunks = [_make_recency_chunk(1, 0.5)]
apply_source_boosts(chunks, set()) # Should not raise
# Should get nearly full boost since it's very recent
    assert chunks[0].relevance_score > 0.5


@patch("memory.api.search.search.make_session")
def test_recency_boost_ordering(mock_make_session):
"""Newer content should rank higher than older content."""
mock_session = MagicMock()
mock_make_session.return_value.__enter__ = MagicMock(return_value=mock_session)
mock_make_session.return_value.__exit__ = MagicMock(return_value=None)
now = datetime.now(timezone.utc)
source_new = MagicMock()
source_new.id = 1
source_new.title = None
source_new.popularity = 1.0
source_new.inserted_at = now - timedelta(days=1)
source_old = MagicMock()
source_old.id = 2
source_old.title = None
source_old.popularity = 1.0
source_old.inserted_at = now - timedelta(days=180)
mock_session.query.return_value.filter.return_value.all.return_value = [source_new, source_old]
chunks = [_make_recency_chunk(1, 0.5), _make_recency_chunk(2, 0.5)]
apply_source_boosts(chunks, set())
# Newer content should have higher score
assert chunks[0].relevance_score > chunks[1].relevance_score