diff --git a/INVESTIGATION.md b/INVESTIGATION.md index f79c68f..4b32618 100644 --- a/INVESTIGATION.md +++ b/INVESTIGATION.md @@ -40,13 +40,14 @@ This investigation identified **100+ issues** across 7 areas of the memory syste - **Impact:** Arbitrary file read on server filesystem - **Fix:** Add path resolution validation with `.resolve()` and prefix check -### BUG-002: Collection Mismatch (1,338 items) -- **Severity:** CRITICAL +### BUG-002: Collection Mismatch ✅ INVESTIGATED & FIXED +- **Severity:** MEDIUM (not as critical as originally thought) - **Area:** Data/Embedding Pipeline -- **Description:** Mail items have chunks with `collection_name='text'` but vectors stored in Qdrant's `mail` collection -- **Impact:** Items completely unsearchable -- **Evidence:** 1,338 orphaned vectors in mail, 1,338 missing in text -- **Fix:** Re-sync vectors or update chunk collection_name +- **Description:** BookSection._chunk_contents() called extract_text() without specifying modality, defaulting to "text" +- **Impact:** 9,370 book chunks stored in text collection instead of book +- **Root Cause:** `extract_text()` defaults to `modality="text"` but BookSection didn't override it +- **Fix Applied:** Passed `modality="book"` to the `extract_text()` call and to both `DataChunk` constructions in BookSection._chunk_contents() +- **Note:** The original investigation of 1,338 mail items is outdated; the current mismatch is 24 mail->text chunks, which are actually email attachments (correct behavior) ### BUG-003: BM25 Filters Completely Ignored - **Severity:** CRITICAL diff --git a/src/memory/common/db/models/source_items.py b/src/memory/common/db/models/source_items.py index a6d1ba9..84de0d0 100644 --- a/src/memory/common/db/models/source_items.py +++ b/src/memory/common/db/models/source_items.py @@ -606,7 +606,9 @@ class BookSection(SourceItem): return [] if len([p for p in self.pages if p.strip()]) == 1: - chunks = extract.extract_text(content, metadata={"type": "page"}) + chunks = extract.extract_text( + content, metadata={"type": "page"}, 
modality="book" + ) if len(chunks) > 1: chunks[-1].metadata["type"] = "summary" return chunks @@ -614,10 +616,10 @@ class BookSection(SourceItem): summary, tags = summarizer.summarize(content) return [ extract.DataChunk( - data=[content], metadata={"type": "section", "tags": tags} + data=[content], metadata={"type": "section", "tags": tags}, modality="book" ), extract.DataChunk( - data=[summary], metadata={"type": "summary", "tags": tags} + data=[summary], metadata={"type": "summary", "tags": tags}, modality="book" ), ]