diff --git a/requirements-api.txt b/requirements-api.txt
index fa6732f..aba6011 100644
--- a/requirements-api.txt
+++ b/requirements-api.txt
@@ -3,4 +3,5 @@ uvicorn==0.29.0
python-jose==3.3.0
python-multipart==0.0.9
sqladmin
-mcp==1.9.2
\ No newline at end of file
+mcp==1.9.2
+bm25s[full]==0.2.13
\ No newline at end of file
diff --git a/requirements-common.txt b/requirements-common.txt
index dbf0da5..d07028b 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -6,5 +6,5 @@ dotenv==0.9.9
voyageai==0.3.2
qdrant-client==1.9.0
anthropic==0.18.1
-
-bm25s[full]==0.2.13
\ No newline at end of file
+# Pin the httpx version, as newer versions break the anthropic client
+httpx==0.27.0
\ No newline at end of file
diff --git a/run_celery_task.py b/run_celery_task.py
index e9a13f2..146c230 100644
--- a/run_celery_task.py
+++ b/run_celery_task.py
@@ -38,6 +38,7 @@ from memory.workers.tasks.maintenance import (
CLEAN_COLLECTION,
REINGEST_CHUNK,
REINGEST_EMPTY_SOURCE_ITEMS,
+ REINGEST_ALL_EMPTY_SOURCE_ITEMS,
REINGEST_ITEM,
REINGEST_MISSING_CHUNKS,
UPDATE_METADATA_FOR_ITEM,
@@ -67,6 +68,7 @@ TASK_MAPPINGS = {
"reingest_chunk": REINGEST_CHUNK,
"reingest_item": REINGEST_ITEM,
"reingest_empty_source_items": REINGEST_EMPTY_SOURCE_ITEMS,
+ "reingest_all_empty_source_items": REINGEST_ALL_EMPTY_SOURCE_ITEMS,
"update_metadata_for_item": UPDATE_METADATA_FOR_ITEM,
"update_metadata_for_source_items": UPDATE_METADATA_FOR_SOURCE_ITEMS,
},
@@ -316,6 +318,13 @@ def maintenance_reingest_empty_source_items(ctx, item_type):
execute_task(ctx, "maintenance", "reingest_empty_source_items", item_type=item_type)
+@maintenance.command("reingest-all-empty-source-items")
+@click.pass_context
+def maintenance_reingest_all_empty_source_items(ctx):
+ """Reingest all empty source items."""
+ execute_task(ctx, "maintenance", "reingest_all_empty_source_items")
+
+
@maintenance.command("reingest-chunk")
@click.option("--chunk-id", required=True, help="Chunk ID to reingest")
@click.pass_context
diff --git a/src/memory/api/MCP/tools.py b/src/memory/api/MCP/tools.py
index 77ba76b..f68533c 100644
--- a/src/memory/api/MCP/tools.py
+++ b/src/memory/api/MCP/tools.py
@@ -15,14 +15,60 @@ from memory.common.db.connection import make_session
from memory.common import extract
from memory.common.db.models import AgentObservation
-from memory.api.search import search, SearchFilters
+from memory.api.search.search import search, SearchFilters
from memory.common.formatters import observation
from memory.workers.tasks.content_processing import process_content_item
+from memory.common.collections import ALL_COLLECTIONS, OBSERVATION_COLLECTIONS
logger = logging.getLogger(__name__)
# Create MCP server instance
-mcp = FastMCP("memory", stateless=True)
+mcp = FastMCP("memory", stateless_http=True)
+
+
+def filter_observation_source_ids(
+ tags: list[str] | None = None, observation_types: list[str] | None = None
+) -> list[int] | None:
+ if not tags and not observation_types:
+ return None
+
+ with make_session() as session:
+ items_query = session.query(AgentObservation.id)
+
+ if tags:
+ # Use PostgreSQL array overlap operator with proper array casting
+ items_query = items_query.filter(
+ AgentObservation.tags.op("&&")(sql_cast(tags, ARRAY(Text))),
+ )
+ if observation_types:
+ items_query = items_query.filter(
+ AgentObservation.observation_type.in_(observation_types)
+ )
+ source_ids = [item.id for item in items_query.all()]
+
+ return source_ids
+
+
+def filter_source_ids(
+ modalities: set[str],
+ tags: list[str] | None = None,
+) -> list[int] | None:
+ if not tags:
+ return None
+
+ with make_session() as session:
+ items_query = session.query(SourceItem.id)
+
+ if tags:
+ # Use PostgreSQL array overlap operator with proper array casting
+ items_query = items_query.filter(
+ SourceItem.tags.op("&&")(sql_cast(tags, ARRAY(Text))),
+ )
+ if modalities:
+ items_query = items_query.filter(SourceItem.modality.in_(modalities))
+ source_ids = [item.id for item in items_query.all()]
+
+ return source_ids
@mcp.tool()
@@ -48,20 +94,6 @@ async def get_all_tags() -> list[str]:
- Projects: "project:website-redesign"
- Contexts: "context:work", "context:late-night"
- Domains: "domain:finance"
-
- Example:
- # Get all tags to ensure consistency
- tags = await get_all_tags()
- # Returns: ["ai-safety", "context:work", "functional-programming",
- # "machine-learning", "project:thesis", ...]
-
- # Use to check if a topic has been discussed before
- if "quantum-computing" in tags:
- # Search for related observations
- observations = await search_observations(
- query="quantum computing",
- tags=["quantum-computing"]
- )
"""
with make_session() as session:
tags_query = session.query(func.unnest(SourceItem.tags)).distinct()
@@ -93,26 +125,6 @@ async def get_all_subjects() -> list[str]:
- "ai_beliefs", "ai_safety_beliefs"
- "learning_preferences"
- "communication_style"
-
- Example:
- # Get all subjects to ensure consistency
- subjects = await get_all_subjects()
- # Returns: ["ai_safety_beliefs", "architecture_preferences",
- # "programming_philosophy", "work_schedule", ...]
-
- # Use to check what we know about the user
- if "programming_style" in subjects:
- # Get all programming-related observations
- observations = await search_observations(
- query="programming",
- subject="programming_style"
- )
-
- Best practices:
- - Always check existing subjects before creating new ones
- - Use snake_case for consistency
- - Be specific but not too granular
- - Group related observations under same subject
"""
with make_session() as session:
return sorted(
@@ -146,20 +158,6 @@ async def get_all_observation_types() -> list[str]:
Returns:
List of observation types that have actually been used in the system.
-
- Example:
- # Check what types of observations exist
- types = await get_all_observation_types()
- # Returns: ["behavior", "belief", "contradiction", "preference"]
-
- # Use to analyze observation distribution
- for obs_type in types:
- observations = await search_observations(
- query="",
- observation_types=[obs_type],
- limit=100
- )
- print(f"{obs_type}: {len(observations)} observations")
"""
with make_session() as session:
return sorted(
@@ -173,7 +171,11 @@ async def get_all_observation_types() -> list[str]:
@mcp.tool()
async def search_knowledge_base(
- query: str, previews: bool = False, modalities: list[str] = [], limit: int = 10
+ query: str,
+ previews: bool = False,
+ modalities: set[str] = set(),
+ tags: list[str] = [],
+ limit: int = 10,
) -> list[dict]:
"""
Search through the user's stored knowledge and content.
@@ -283,14 +285,22 @@ async def search_knowledge_base(
"""
logger.info(f"MCP search for: {query}")
+ if not modalities:
+ modalities = set(ALL_COLLECTIONS.keys())
+    modalities = (set(modalities) & ALL_COLLECTIONS.keys()) - OBSERVATION_COLLECTIONS
+
upload_data = extract.extract_text(query)
results = await search(
upload_data,
previews=previews,
modalities=modalities,
limit=limit,
- min_text_score=0.3,
- min_multimodal_score=0.3,
+ min_text_score=0.4,
+ min_multimodal_score=0.25,
+ filters=SearchFilters(
+ tags=tags,
+ source_ids=filter_source_ids(tags=tags, modalities=modalities),
+ ),
)
# Convert SearchResult objects to dictionaries for MCP
@@ -456,12 +466,13 @@ async def observe(
mime_type="text/plain",
sha256=sha256(f"{content}{subject}{observation_type}".encode("utf-8")).digest(),
inserted_at=datetime.now(timezone.utc),
+ modality="observation",
)
try:
with make_session() as session:
process_content_item(observation, session)
- if not observation.id:
+ if not cast(int | None, observation.id):
raise ValueError("Observation not created")
logger.info(
@@ -600,24 +611,6 @@ async def search_observations(
- Higher confidence observations are more reliable
- Recent observations may override older ones on same topic
"""
- source_ids = None
- if tags or observation_types:
- with make_session() as session:
- items_query = session.query(AgentObservation.id)
-
- if tags:
- # Use PostgreSQL array overlap operator with proper array casting
- items_query = items_query.filter(
- AgentObservation.tags.op("&&")(sql_cast(tags, ARRAY(Text))),
- )
- if observation_types:
- items_query = items_query.filter(
- AgentObservation.observation_type.in_(observation_types)
- )
- source_ids = [item.id for item in items_query.all()]
- if not source_ids:
- return []
-
semantic_text = observation.generate_semantic_text(
subject=subject or "",
observation_type="".join(observation_types or []),
@@ -637,18 +630,24 @@ async def search_observations(
extract.DataChunk(data=[temporal]),
],
previews=True,
- modalities=["semantic", "temporal"],
+ modalities={"semantic", "temporal"},
limit=limit,
- min_text_score=0.8,
filters=SearchFilters(
subject=subject,
confidence=min_confidence,
tags=tags,
observation_types=observation_types,
- source_ids=source_ids,
+ source_ids=filter_observation_source_ids(tags=tags),
),
+ timeout=2,
)
return [
- cast(dict, cast(dict, result.model_dump()).get("content")) for result in results
+ {
+ "content": r.content,
+ "tags": r.tags,
+ "created_at": r.created_at.isoformat() if r.created_at else None,
+ "metadata": r.metadata,
+ }
+ for r in results
]
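Both helpers above lean on the same PostgreSQL array-overlap trick. A minimal standalone sketch of that pattern, assuming `SourceItem.tags` is a PostgreSQL `text[]` column; the helper name `ids_with_any_tag` is illustrative, not part of the codebase:

```python
from sqlalchemy import ARRAY, Text, cast as sql_cast
from sqlalchemy.orm import Session

from memory.common.db.models import SourceItem


def ids_with_any_tag(session: Session, tags: list[str]) -> list[int]:
    # `&&` is PostgreSQL's array-overlap operator: true when the two arrays
    # share at least one element. Casting the right-hand side to text[]
    # keeps the comparison well-typed.
    query = session.query(SourceItem.id).filter(
        SourceItem.tags.op("&&")(sql_cast(tags, ARRAY(Text)))
    )
    return [row.id for row in query.all()]
```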
diff --git a/src/memory/api/app.py b/src/memory/api/app.py
index 4f37c39..51a9d22 100644
--- a/src/memory/api/app.py
+++ b/src/memory/api/app.py
@@ -3,6 +3,7 @@ FastAPI application for the knowledge base.
"""
import contextlib
+import os
import pathlib
import logging
from typing import Annotated, Optional
@@ -105,12 +106,16 @@ def get_file_by_path(path: str):
return FileResponse(path=file_path, filename=file_path.name)
-def main():
+def main(reload: bool = False):
"""Run the FastAPI server in debug mode with auto-reloading."""
import uvicorn
uvicorn.run(
- "memory.api.app:app", host="0.0.0.0", port=8000, reload=True, log_level="debug"
+ "memory.api.app:app",
+ host="0.0.0.0",
+ port=8000,
+ reload=reload,
+ log_level="debug",
)
@@ -118,4 +123,4 @@ if __name__ == "__main__":
from memory.common.qdrant import setup_qdrant
setup_qdrant()
- main()
+ main(os.getenv("RELOAD", "false") == "true")
diff --git a/src/memory/api/search.py b/src/memory/api/search.py
deleted file mode 100644
index 6b4b501..0000000
--- a/src/memory/api/search.py
+++ /dev/null
@@ -1,382 +0,0 @@
-"""
-Search endpoints for the knowledge base API.
-"""
-
-import asyncio
-import base64
-from hashlib import sha256
-import io
-import logging
-from collections import defaultdict
-from typing import Any, Callable, Optional, TypedDict, NotRequired
-
-import bm25s
-import Stemmer
-import qdrant_client
-from PIL import Image
-from pydantic import BaseModel
-from qdrant_client.http import models as qdrant_models
-
-from memory.common import embedding, extract, qdrant, settings
-from memory.common.collections import (
- ALL_COLLECTIONS,
- MULTIMODAL_COLLECTIONS,
- TEXT_COLLECTIONS,
-)
-from memory.common.db.connection import make_session
-from memory.common.db.models import Chunk
-
-logger = logging.getLogger(__name__)
-
-
-class AnnotatedChunk(BaseModel):
- id: str
- score: float
- metadata: dict
- preview: Optional[str | None] = None
-
-
-class SourceData(BaseModel):
- """Holds source item data to avoid SQLAlchemy session issues"""
-
- id: int
- size: int | None
- mime_type: str | None
- filename: str | None
- content: str | dict | None
- content_length: int
-
-
-class SearchResponse(BaseModel):
- collection: str
- results: list[dict]
-
-
-class SearchResult(BaseModel):
- id: int
- size: int
- mime_type: str
- chunks: list[AnnotatedChunk]
- content: Optional[str | dict] = None
- filename: Optional[str] = None
-
-
-class SearchFilters(TypedDict):
- subject: NotRequired[str | None]
- confidence: NotRequired[float]
- tags: NotRequired[list[str] | None]
- observation_types: NotRequired[list[str] | None]
- source_ids: NotRequired[list[int] | None]
-
-
-async def with_timeout(
- call, timeout: int = 2
-) -> list[tuple[SourceData, AnnotatedChunk]]:
- """
- Run a function with a timeout.
-
- Args:
- call: The function to run
- timeout: The timeout in seconds
- """
- try:
- return await asyncio.wait_for(call, timeout=timeout)
- except TimeoutError:
- logger.warning(f"Search timed out after {timeout}s")
- return []
- except Exception as e:
- logger.error(f"Search failed: {e}")
- return []
-
-
-def annotated_chunk(
- chunk: Chunk, search_result: qdrant_models.ScoredPoint, previews: bool
-) -> tuple[SourceData, AnnotatedChunk]:
- def serialize_item(item: bytes | str | Image.Image) -> str | None:
- if not previews and not isinstance(item, str):
- return None
- if not previews and isinstance(item, str):
- return item[:100]
-
- if isinstance(item, Image.Image):
- buffer = io.BytesIO()
- format = item.format or "PNG"
- item.save(buffer, format=format)
- mime_type = f"image/{format.lower()}"
- return f"data:{mime_type};base64,{base64.b64encode(buffer.getvalue()).decode('utf-8')}"
- elif isinstance(item, bytes):
- return base64.b64encode(item).decode("utf-8")
- elif isinstance(item, str):
- return item
- else:
- raise ValueError(f"Unsupported item type: {type(item)}")
-
- metadata = search_result.payload or {}
- metadata = {
- k: v
- for k, v in metadata.items()
- if k not in ["content", "filename", "size", "content_type", "tags"]
- }
-
- # Prefetch all needed source data while in session
- source = chunk.source
- source_data = SourceData(
- id=source.id,
- size=source.size,
- mime_type=source.mime_type,
- filename=source.filename,
- content=source.display_contents,
- content_length=len(source.content) if source.content else 0,
- )
-
- return source_data, AnnotatedChunk(
- id=str(chunk.id),
- score=search_result.score,
- metadata=metadata,
- preview=serialize_item(chunk.data[0]) if chunk.data else None,
- )
-
-
-def group_chunks(chunks: list[tuple[SourceData, AnnotatedChunk]]) -> list[SearchResult]:
- items = defaultdict(list)
- source_lookup = {}
-
- for source, chunk in chunks:
- items[source.id].append(chunk)
- source_lookup[source.id] = source
-
- return [
- SearchResult(
- id=source.id,
- size=source.size or source.content_length,
- mime_type=source.mime_type or "text/plain",
- filename=source.filename
- and source.filename.replace(
- str(settings.FILE_STORAGE_DIR).lstrip("/"), "/files"
- ),
- content=source.content,
- chunks=sorted(chunks, key=lambda x: x.score, reverse=True),
- )
- for source_id, chunks in items.items()
- for source in [source_lookup[source_id]]
- ]
-
-
-def query_chunks(
- client: qdrant_client.QdrantClient,
- upload_data: list[extract.DataChunk],
- allowed_modalities: set[str],
- embedder: Callable,
- min_score: float = 0.0,
- limit: int = 10,
- filters: dict[str, Any] | None = None,
-) -> dict[str, list[qdrant_models.ScoredPoint]]:
- if not upload_data or not allowed_modalities:
- return {}
-
- chunks = [chunk for chunk in upload_data if chunk.data]
- if not chunks:
- logger.error(f"No chunks to embed for {allowed_modalities}")
- return {}
-
- logger.error(f"Embedding {len(chunks)} chunks for {allowed_modalities}")
- for c in chunks:
- logger.error(f"Chunk: {c.data}")
- vectors = embedder([c.data for c in chunks], input_type="query")
-
- return {
- collection: [
- r
- for vector in vectors
- for r in qdrant.search_vectors(
- client=client,
- collection_name=collection,
- query_vector=vector,
- limit=limit,
- filter_params=filters,
- )
- if r.score >= min_score
- ]
- for collection in allowed_modalities
- }
-
-
-async def search_bm25(
- query: str,
- modalities: list[str],
- limit: int = 10,
- filters: SearchFilters = SearchFilters(),
-) -> list[tuple[SourceData, AnnotatedChunk]]:
- with make_session() as db:
- items_query = db.query(Chunk.id, Chunk.content).filter(
- Chunk.collection_name.in_(modalities)
- )
- if source_ids := filters.get("source_ids"):
- items_query = items_query.filter(Chunk.source_id.in_(source_ids))
- items = items_query.all()
- item_ids = {
- sha256(item.content.lower().strip().encode("utf-8")).hexdigest(): item.id
- for item in items
- }
- corpus = [item.content.lower().strip() for item in items]
-
- stemmer = Stemmer.Stemmer("english")
- corpus_tokens = bm25s.tokenize(corpus, stopwords="en", stemmer=stemmer)
- retriever = bm25s.BM25()
- retriever.index(corpus_tokens)
-
- query_tokens = bm25s.tokenize(query, stemmer=stemmer)
- results, scores = retriever.retrieve(
- query_tokens, k=min(limit, len(corpus)), corpus=corpus
- )
-
- item_scores = {
- item_ids[sha256(doc.encode("utf-8")).hexdigest()]: score
- for doc, score in zip(results[0], scores[0])
- }
-
- with make_session() as db:
- chunks = db.query(Chunk).filter(Chunk.id.in_(item_scores.keys())).all()
- results = []
- for chunk in chunks:
- # Prefetch all needed source data while in session
- source = chunk.source
- source_data = SourceData(
- id=source.id,
- size=source.size,
- mime_type=source.mime_type,
- filename=source.filename,
- content=source.display_contents,
- content_length=len(source.content) if source.content else 0,
- )
-
- annotated = AnnotatedChunk(
- id=str(chunk.id),
- score=item_scores[chunk.id],
- metadata=source.as_payload(),
- preview=None,
- )
- results.append((source_data, annotated))
-
- return results
-
-
-async def search_embeddings(
- data: list[extract.DataChunk],
- previews: Optional[bool] = False,
- modalities: set[str] = set(),
- limit: int = 10,
- min_score: float = 0.3,
- filters: SearchFilters = SearchFilters(),
- multimodal: bool = False,
-) -> list[tuple[SourceData, AnnotatedChunk]]:
- """
- Search across knowledge base using text query and optional files.
-
- Parameters:
- - data: List of data to search in (e.g., text, images, files)
- - previews: Whether to include previews in the search results
- - modalities: List of modalities to search in (e.g., "text", "photo", "doc")
- - limit: Maximum number of results
- - min_score: Minimum score to include in the search results
- - filters: Filters to apply to the search results
- - multimodal: Whether to search in multimodal collections
- """
- query_filters = {
- "must": [
- {"key": "confidence", "range": {"gte": filters.get("confidence", 0.5)}},
- ],
- }
- if tags := filters.get("tags"):
- query_filters["must"] += [{"key": "tags", "match": {"any": tags}}]
- if observation_types := filters.get("observation_types"):
- query_filters["must"] += [
- {"key": "observation_type", "match": {"any": observation_types}}
- ]
-
- client = qdrant.get_qdrant_client()
- results = query_chunks(
- client,
- data,
- modalities,
- embedding.embed_text if not multimodal else embedding.embed_mixed,
- min_score=min_score,
- limit=limit,
- filters=query_filters,
- )
- search_results = {k: results.get(k, []) for k in modalities}
-
- found_chunks = {
- str(r.id): r for results in search_results.values() for r in results
- }
- with make_session() as db:
- chunks = db.query(Chunk).filter(Chunk.id.in_(found_chunks.keys())).all()
- return [
- annotated_chunk(chunk, found_chunks[str(chunk.id)], previews or False)
- for chunk in chunks
- ]
-
-
-async def search(
- data: list[extract.DataChunk],
- previews: Optional[bool] = False,
- modalities: list[str] = [],
- limit: int = 10,
- min_text_score: float = 0.3,
- min_multimodal_score: float = 0.3,
- filters: SearchFilters = {},
-) -> list[SearchResult]:
- """
- Search across knowledge base using text query and optional files.
-
- Parameters:
- - query: Optional text search query
- - modalities: List of modalities to search in (e.g., "text", "photo", "doc")
- - files: Optional files to include in the search context
- - limit: Maximum number of results per modality
-
- Returns:
- - List of search results sorted by score
- """
- allowed_modalities = set(modalities or ALL_COLLECTIONS.keys())
-
- text_embeddings_results = with_timeout(
- search_embeddings(
- data,
- previews,
- allowed_modalities & TEXT_COLLECTIONS,
- limit,
- min_text_score,
- filters,
- multimodal=False,
- )
- )
- multimodal_embeddings_results = with_timeout(
- search_embeddings(
- data,
- previews,
- allowed_modalities & MULTIMODAL_COLLECTIONS,
- limit,
- min_multimodal_score,
- filters,
- multimodal=True,
- )
- )
- bm25_results = with_timeout(
- search_bm25(
- " ".join([c for chunk in data for c in chunk.data if isinstance(c, str)]),
- modalities,
- limit=limit,
- filters=filters,
- )
- )
-
- results = await asyncio.gather(
- text_embeddings_results,
- multimodal_embeddings_results,
- bm25_results,
- return_exceptions=False,
- )
-
- results = group_chunks([c for r in results for c in r])
- return sorted(results, key=lambda x: max(c.score for c in x.chunks), reverse=True)
diff --git a/src/memory/api/search/__init__.py b/src/memory/api/search/__init__.py
new file mode 100644
index 0000000..f3b1cf1
--- /dev/null
+++ b/src/memory/api/search/__init__.py
@@ -0,0 +1,4 @@
+from .search import search
+from .utils import SearchResult, SearchFilters
+
+__all__ = ["search", "SearchResult", "SearchFilters"]
diff --git a/src/memory/api/search/bm25.py b/src/memory/api/search/bm25.py
new file mode 100644
index 0000000..9acb098
--- /dev/null
+++ b/src/memory/api/search/bm25.py
@@ -0,0 +1,68 @@
+"""
+Search endpoints for the knowledge base API.
+"""
+
+from hashlib import sha256
+import logging
+
+import bm25s
+import Stemmer
+from memory.api.search.utils import SourceData, AnnotatedChunk, SearchFilters
+
+from memory.common.db.connection import make_session
+from memory.common.db.models import Chunk
+
+logger = logging.getLogger(__name__)
+
+
+async def search_bm25(
+ query: str,
+ modalities: set[str],
+ limit: int = 10,
+ filters: SearchFilters = SearchFilters(),
+) -> list[tuple[SourceData, AnnotatedChunk]]:
+ with make_session() as db:
+ items_query = db.query(Chunk.id, Chunk.content).filter(
+ Chunk.collection_name.in_(modalities)
+ )
+        if (source_ids := filters.get("source_ids")) is not None:
+ items_query = items_query.filter(Chunk.source_id.in_(source_ids))
+ items = items_query.all()
+ item_ids = {
+ sha256(item.content.lower().strip().encode("utf-8")).hexdigest(): item.id
+ for item in items
+ }
+ corpus = [item.content.lower().strip() for item in items]
+
+ stemmer = Stemmer.Stemmer("english")
+ corpus_tokens = bm25s.tokenize(corpus, stopwords="en", stemmer=stemmer)
+ retriever = bm25s.BM25()
+ retriever.index(corpus_tokens)
+
+ query_tokens = bm25s.tokenize(query, stemmer=stemmer)
+ results, scores = retriever.retrieve(
+ query_tokens, k=min(limit, len(corpus)), corpus=corpus
+ )
+
+ item_scores = {
+ item_ids[sha256(doc.encode("utf-8")).hexdigest()]: score
+ for doc, score in zip(results[0], scores[0])
+ }
+
+ with make_session() as db:
+ chunks = db.query(Chunk).filter(Chunk.id.in_(item_scores.keys())).all()
+ results = []
+ for chunk in chunks:
+ # Prefetch all needed source data while in session
+ source_data = SourceData.from_chunk(chunk)
+
+ annotated = AnnotatedChunk(
+ id=str(chunk.id),
+ score=item_scores[chunk.id],
+ metadata=chunk.source.as_payload(),
+ preview=None,
+ search_method="bm25",
+ )
+ results.append((source_data, annotated))
+
+ return results
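The BM25 path builds an in-memory index over the matching chunks on every query. A toy end-to-end sketch of the same `bm25s` flow, runnable on its own (corpus and query are made up):

```python
import bm25s
import Stemmer

corpus = [
    "functional programming emphasizes immutability",
    "rust guarantees memory safety without a garbage collector",
    "python lowers the barrier to entry for programming",
]

stemmer = Stemmer.Stemmer("english")
corpus_tokens = bm25s.tokenize(corpus, stopwords="en", stemmer=stemmer)

retriever = bm25s.BM25()
retriever.index(corpus_tokens)

query_tokens = bm25s.tokenize("memory safe languages", stemmer=stemmer)
# retrieve() returns parallel (documents, scores) arrays of shape (n_queries, k)
docs, scores = retriever.retrieve(query_tokens, k=2, corpus=corpus)
print(docs[0], scores[0])
```

Rebuilding the index per request is cheap at small corpus sizes, but is worth caching if chunk counts grow.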
diff --git a/src/memory/api/search/embeddings.py b/src/memory/api/search/embeddings.py
new file mode 100644
index 0000000..132b6bb
--- /dev/null
+++ b/src/memory/api/search/embeddings.py
@@ -0,0 +1,144 @@
+import base64
+import io
+import logging
+from typing import Any, Callable, Optional
+
+import qdrant_client
+from PIL import Image
+from qdrant_client.http import models as qdrant_models
+
+from memory.common import embedding, extract, qdrant
+from memory.common.db.connection import make_session
+from memory.common.db.models import Chunk
+from memory.api.search.utils import SourceData, AnnotatedChunk, SearchFilters
+
+logger = logging.getLogger(__name__)
+
+
+def annotated_chunk(
+ chunk: Chunk, search_result: qdrant_models.ScoredPoint, previews: bool
+) -> tuple[SourceData, AnnotatedChunk]:
+ def serialize_item(item: bytes | str | Image.Image) -> str | None:
+ if not previews and not isinstance(item, str):
+ return None
+ if not previews and isinstance(item, str):
+ return item[:100]
+
+ if isinstance(item, Image.Image):
+ buffer = io.BytesIO()
+ format = item.format or "PNG"
+ item.save(buffer, format=format)
+ mime_type = f"image/{format.lower()}"
+ return f"data:{mime_type};base64,{base64.b64encode(buffer.getvalue()).decode('utf-8')}"
+ elif isinstance(item, bytes):
+ return base64.b64encode(item).decode("utf-8")
+ elif isinstance(item, str):
+ return item
+ else:
+ raise ValueError(f"Unsupported item type: {type(item)}")
+
+ metadata = search_result.payload or {}
+ metadata = {
+ k: v
+ for k, v in metadata.items()
+ if k not in ["content", "filename", "size", "content_type", "tags"]
+ }
+
+ # Prefetch all needed source data while in session
+ return SourceData.from_chunk(chunk), AnnotatedChunk(
+ id=str(chunk.id),
+ score=search_result.score,
+ metadata=metadata,
+ preview=serialize_item(chunk.data[0]) if chunk.data else None,
+ search_method="embeddings",
+ )
+
+
+def query_chunks(
+ client: qdrant_client.QdrantClient,
+ upload_data: list[extract.DataChunk],
+ allowed_modalities: set[str],
+ embedder: Callable,
+ min_score: float = 0.3,
+ limit: int = 10,
+ filters: dict[str, Any] | None = None,
+) -> dict[str, list[qdrant_models.ScoredPoint]]:
+ if not upload_data or not allowed_modalities:
+ return {}
+
+ chunks = [chunk for chunk in upload_data if chunk.data]
+ if not chunks:
+        logger.warning(f"No chunks to embed for {allowed_modalities}")
+ return {}
+
+ vectors = embedder(chunks, input_type="query")
+
+ return {
+ collection: [
+ r
+ for vector in vectors
+ for r in qdrant.search_vectors(
+ client=client,
+ collection_name=collection,
+ query_vector=vector,
+ limit=limit,
+ filter_params=filters,
+ )
+ if r.score >= min_score
+ ]
+ for collection in allowed_modalities
+ }
+
+
+async def search_embeddings(
+ data: list[extract.DataChunk],
+ previews: Optional[bool] = False,
+ modalities: set[str] = set(),
+ limit: int = 10,
+ min_score: float = 0.3,
+ filters: SearchFilters = SearchFilters(),
+ multimodal: bool = False,
+) -> list[tuple[SourceData, AnnotatedChunk]]:
+ """
+ Search across knowledge base using text query and optional files.
+
+ Parameters:
+ - data: List of data to search in (e.g., text, images, files)
+ - previews: Whether to include previews in the search results
+ - modalities: List of modalities to search in (e.g., "text", "photo", "doc")
+ - limit: Maximum number of results
+ - min_score: Minimum score to include in the search results
+ - filters: Filters to apply to the search results
+ - multimodal: Whether to search in multimodal collections
+ """
+    query_filters: dict[str, list[dict]] = {}
+    if confidence := filters.get("confidence"):
+        query_filters.setdefault("must", []).append(
+            {"key": "confidence", "range": {"gte": confidence}}
+        )
+    if tags := filters.get("tags"):
+        query_filters.setdefault("must", []).append(
+            {"key": "tags", "match": {"any": tags}}
+        )
+    if observation_types := filters.get("observation_types"):
+        query_filters.setdefault("must", []).append(
+            {"key": "observation_type", "match": {"any": observation_types}}
+        )
+
+ client = qdrant.get_qdrant_client()
+ results = query_chunks(
+ client,
+ data,
+ modalities,
+ embedding.embed_text if not multimodal else embedding.embed_mixed,
+ min_score=min_score,
+ limit=limit,
+ filters=query_filters,
+ )
+ search_results = {k: results.get(k, []) for k in modalities}
+
+ found_chunks = {
+ str(r.id): r for results in search_results.values() for r in results
+ }
+ with make_session() as db:
+ chunks = db.query(Chunk).filter(Chunk.id.in_(found_chunks.keys())).all()
+ return [
+ annotated_chunk(chunk, found_chunks[str(chunk.id)], previews or False)
+ for chunk in chunks
+ ]
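`query_chunks` hands the dict-style filter straight to `qdrant.search_vectors` as `filter_params`. For reference, the same conditions expressed with explicit `qdrant-client` models look roughly like this (the payload keys match the ones used above; the helper name is illustrative):

```python
from qdrant_client.http import models as qdrant_models


def to_qdrant_filter(
    confidence: float | None = None, tags: list[str] | None = None
) -> qdrant_models.Filter:
    must: list[qdrant_models.Condition] = []
    if confidence is not None:
        # Matches payloads whose "confidence" field is >= the threshold
        must.append(
            qdrant_models.FieldCondition(
                key="confidence", range=qdrant_models.Range(gte=confidence)
            )
        )
    if tags:
        # Matches payloads whose "tags" field contains any of the given values
        must.append(
            qdrant_models.FieldCondition(
                key="tags", match=qdrant_models.MatchAny(any=tags)
            )
        )
    return qdrant_models.Filter(must=must)
```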
diff --git a/src/memory/api/search/search.py b/src/memory/api/search/search.py
new file mode 100644
index 0000000..2e07d5d
--- /dev/null
+++ b/src/memory/api/search/search.py
@@ -0,0 +1,94 @@
+"""
+Search endpoints for the knowledge base API.
+"""
+
+import asyncio
+import logging
+from typing import Optional
+
+from memory.api.search.embeddings import search_embeddings
+from memory.api.search.bm25 import search_bm25
+from memory.api.search.utils import (
+    SearchFilters,
+    SearchResult,
+    group_chunks,
+    with_timeout,
+)
+from memory.common import extract
+from memory.common.collections import (
+ ALL_COLLECTIONS,
+ MULTIMODAL_COLLECTIONS,
+ TEXT_COLLECTIONS,
+)
+
+logger = logging.getLogger(__name__)
+
+
+async def search(
+ data: list[extract.DataChunk],
+ previews: Optional[bool] = False,
+ modalities: set[str] = set(),
+ limit: int = 10,
+ min_text_score: float = 0.4,
+ min_multimodal_score: float = 0.25,
+ filters: SearchFilters = {},
+ timeout: int = 2,
+) -> list[SearchResult]:
+ """
+ Search across knowledge base using text query and optional files.
+
+ Parameters:
+    - data: List of data chunks to search with (text, images, files, etc.)
+    - previews: Whether to include content previews in the results
+    - modalities: Set of modalities to search in (e.g., "text", "photo", "doc")
+    - limit: Maximum number of results per modality
+    - timeout: Per-backend timeout in seconds
+
+ Returns:
+ - List of search results sorted by score
+ """
+ allowed_modalities = modalities & ALL_COLLECTIONS.keys()
+
+ text_embeddings_results = with_timeout(
+ search_embeddings(
+ data,
+ previews,
+ allowed_modalities & TEXT_COLLECTIONS,
+ limit,
+ min_text_score,
+ filters,
+ multimodal=False,
+ ),
+ timeout,
+ )
+ multimodal_embeddings_results = with_timeout(
+ search_embeddings(
+ data,
+ previews,
+ allowed_modalities & MULTIMODAL_COLLECTIONS,
+ limit,
+ min_multimodal_score,
+ filters,
+ multimodal=True,
+ ),
+ timeout,
+ )
+ bm25_results = with_timeout(
+ search_bm25(
+ " ".join([c for chunk in data for c in chunk.data if isinstance(c, str)]),
+            allowed_modalities,
+ limit=limit,
+ filters=filters,
+ ),
+ timeout,
+ )
+
+ results = await asyncio.gather(
+ text_embeddings_results,
+ multimodal_embeddings_results,
+ bm25_results,
+ return_exceptions=False,
+ )
+ text_results, multi_results, bm25_results = results
+ all_results = text_results + multi_results
+ if len(all_results) < limit:
+ all_results += bm25_results
+
+ results = group_chunks(all_results, previews or False)
+ return sorted(results, key=lambda x: max(c.score for c in x.chunks), reverse=True)
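The orchestration above fans out to three backends and relies on `with_timeout` so that one slow searcher degrades to an empty result set rather than stalling the whole request. A self-contained sketch of that pattern (the backend names are made up):

```python
import asyncio


async def slow_backend() -> list[str]:
    await asyncio.sleep(5)  # never finishes within the timeout
    return ["too late"]


async def fast_backend() -> list[str]:
    return ["hit"]


async def guarded(coro, timeout: float = 2.0) -> list[str]:
    # Mirrors with_timeout: swallow the timeout and return no results
    try:
        return await asyncio.wait_for(coro, timeout=timeout)
    except asyncio.TimeoutError:
        return []


async def main() -> None:
    results = await asyncio.gather(guarded(slow_backend()), guarded(fast_backend()))
    print(results)  # [[], ['hit']]


asyncio.run(main())
```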
diff --git a/src/memory/api/search/utils.py b/src/memory/api/search/utils.py
new file mode 100644
index 0000000..cf8fe20
--- /dev/null
+++ b/src/memory/api/search/utils.py
@@ -0,0 +1,134 @@
+import asyncio
+from datetime import datetime
+import logging
+from collections import defaultdict
+from typing import Optional, TypedDict, NotRequired
+
+from pydantic import BaseModel
+
+from memory.common import settings
+from memory.common.db.models import Chunk
+
+logger = logging.getLogger(__name__)
+
+
+class AnnotatedChunk(BaseModel):
+ id: str
+ score: float
+ metadata: dict
+    preview: str | None = None
+ search_method: str | None = None
+
+
+class SourceData(BaseModel):
+ """Holds source item data to avoid SQLAlchemy session issues"""
+
+ id: int
+ size: int | None
+ mime_type: str | None
+ filename: str | None
+ content_length: int
+ contents: dict | None
+ created_at: datetime | None
+
+ @staticmethod
+ def from_chunk(chunk: Chunk) -> "SourceData":
+ source = chunk.source
+ display_contents = source.display_contents or {}
+ return SourceData(
+ id=source.id,
+ size=source.size,
+ mime_type=source.mime_type,
+ filename=source.filename,
+ content_length=len(source.content) if source.content else 0,
+ contents=display_contents,
+ created_at=source.inserted_at,
+ )
+
+
+class SearchResponse(BaseModel):
+ collection: str
+ results: list[dict]
+
+
+class SearchResult(BaseModel):
+ id: int
+ size: int
+ mime_type: str
+ chunks: list[AnnotatedChunk]
+ content: Optional[str | dict] = None
+ filename: Optional[str] = None
+ tags: list[str] | None = None
+ metadata: dict | None = None
+ created_at: datetime | None = None
+
+
+class SearchFilters(TypedDict):
+ subject: NotRequired[str | None]
+ confidence: NotRequired[float]
+ tags: NotRequired[list[str] | None]
+ observation_types: NotRequired[list[str] | None]
+ source_ids: NotRequired[list[int] | None]
+
+
+async def with_timeout(
+ call, timeout: int = 2
+) -> list[tuple[SourceData, AnnotatedChunk]]:
+ """
+ Run a function with a timeout.
+
+ Args:
+ call: The function to run
+ timeout: The timeout in seconds
+ """
+ try:
+ return await asyncio.wait_for(call, timeout=timeout)
+ except TimeoutError:
+ logger.warning(f"Search timed out after {timeout}s")
+ return []
+ except Exception as e:
+ logger.error(f"Search failed: {e}")
+ return []
+
+
+def group_chunks(
+ chunks: list[tuple[SourceData, AnnotatedChunk]], preview: bool = False
+) -> list[SearchResult]:
+ items = defaultdict(list)
+ source_lookup = {}
+
+ for source, chunk in chunks:
+ items[source.id].append(chunk)
+ source_lookup[source.id] = source
+
+ def get_content(text: str | dict | None) -> str | dict | None:
+ if preview or not text or not isinstance(text, str) or len(text) < 250:
+ return text
+
+ return text[:250] + "..."
+
+ def make_result(source: SourceData, chunks: list[AnnotatedChunk]) -> SearchResult:
+ contents = source.contents or {}
+ tags = contents.pop("tags", [])
+ content = contents.pop("content", None)
+
+ return SearchResult(
+ id=source.id,
+ size=source.size or source.content_length,
+ mime_type=source.mime_type or "text/plain",
+ filename=source.filename
+ and source.filename.replace(
+ str(settings.FILE_STORAGE_DIR).lstrip("/"), "/files"
+ ),
+ content=get_content(content),
+ tags=tags,
+ metadata=contents,
+ chunks=sorted(chunks, key=lambda x: x.score, reverse=True),
+ created_at=source.created_at,
+ )
+
+ return [
+ make_result(source, chunks)
+ for source_id, chunks in items.items()
+ for source in [source_lookup[source_id]]
+ ]
diff --git a/src/memory/common/collections.py b/src/memory/common/collections.py
index f2aee53..98fc2cc 100644
--- a/src/memory/common/collections.py
+++ b/src/memory/common/collections.py
@@ -102,6 +102,7 @@ TEXT_COLLECTIONS = {
MULTIMODAL_COLLECTIONS = {
coll for coll, params in ALL_COLLECTIONS.items() if params.get("multimodal")
}
+OBSERVATION_COLLECTIONS = {"semantic", "temporal"}
TYPES = {
"doc": ["application/pdf", "application/docx", "application/msword"],
diff --git a/src/memory/common/db/models/source_item.py b/src/memory/common/db/models/source_item.py
index 56e9a44..798954e 100644
--- a/src/memory/common/db/models/source_item.py
+++ b/src/memory/common/db/models/source_item.py
@@ -84,7 +84,7 @@ def clean_filename(filename: str) -> str:
def image_filenames(chunk_id: str, images: list[Image.Image]) -> list[str]:
for i, image in enumerate(images):
- if not image.filename: # type: ignore
+ if not getattr(image, "filename", None): # type: ignore
filename = settings.CHUNK_STORAGE_DIR / f"{chunk_id}_{i}.{image.format}" # type: ignore
image.save(filename)
image.filename = str(filename) # type: ignore
@@ -100,16 +100,6 @@ def add_pics(chunk: str, images: list[Image.Image]) -> list[extract.MulitmodalCh
]
-def merge_metadata(*metadata: dict[str, Any]) -> dict[str, Any]:
- final = {}
- for m in metadata:
- data = m.copy()
- if tags := set(data.pop("tags", [])):
- final["tags"] = tags | final.get("tags", set())
- final |= data
- return final
-
-
def chunk_mixed(content: str, image_paths: Sequence[str]) -> list[extract.DataChunk]:
if not content.strip():
return []
@@ -241,14 +231,11 @@ class SourceItem(Base):
return [chunk.id for chunk in self.chunks]
def _chunk_contents(self) -> Sequence[extract.DataChunk]:
- chunks: list[extract.DataChunk] = []
content = cast(str | None, self.content)
if content:
- chunks = [extract.DataChunk(data=[c]) for c in chunker.chunk_text(content)]
-
- if content and len(content) > chunker.DEFAULT_CHUNK_TOKENS * 2:
- summary, tags = summarizer.summarize(content)
- chunks.append(extract.DataChunk(data=[summary], metadata={"tags": tags}))
+ chunks = extract.extract_text(content)
+ else:
+ chunks = []
mime_type = cast(str | None, self.mime_type)
if mime_type and mime_type.startswith("image/"):
@@ -272,12 +259,14 @@ class SourceItem(Base):
file_paths=image_names,
collection_name=modality,
embedding_model=collections.collection_model(modality, text, images),
- item_metadata=merge_metadata(self.as_payload(), data.metadata, metadata),
+ item_metadata=extract.merge_metadata(
+ self.as_payload(), data.metadata, metadata
+ ),
)
return chunk
def data_chunks(self, metadata: dict[str, Any] = {}) -> Sequence[Chunk]:
- return [self._make_chunk(data) for data in self._chunk_contents()]
+ return [self._make_chunk(data, metadata) for data in self._chunk_contents()]
def as_payload(self) -> dict:
return {
@@ -291,4 +280,7 @@ class SourceItem(Base):
return {
"tags": self.tags,
"size": self.size,
+ "content": self.content,
+ "filename": self.filename,
+ "mime_type": self.mime_type,
}
diff --git a/src/memory/common/db/models/source_items.py b/src/memory/common/db/models/source_items.py
index 985a2a6..1b1dac5 100644
--- a/src/memory/common/db/models/source_items.py
+++ b/src/memory/common/db/models/source_items.py
@@ -33,7 +33,6 @@ from memory.common.db.models.source_item import (
SourceItem,
Chunk,
clean_filename,
- merge_metadata,
chunk_mixed,
)
@@ -326,27 +325,24 @@ class BookSection(SourceItem):
}
return {k: v for k, v in vals.items() if v}
- def data_chunks(self, metadata: dict[str, Any] = {}) -> Sequence[Chunk]:
+ def _chunk_contents(self) -> Sequence[extract.DataChunk]:
content = cast(str, self.content.strip())
if not content:
return []
if len([p for p in self.pages if p.strip()]) == 1:
- return [
- self._make_chunk(
- extract.DataChunk(data=[content]), metadata | {"type": "page"}
- )
- ]
+ chunks = extract.extract_text(content, metadata={"type": "page"})
+ if len(chunks) > 1:
+ chunks[-1].metadata["type"] = "summary"
+ return chunks
summary, tags = summarizer.summarize(content)
return [
- self._make_chunk(
- extract.DataChunk(data=[content]),
- merge_metadata(metadata, {"type": "section", "tags": tags}),
+ extract.DataChunk(
+ data=[content], metadata={"type": "section", "tags": tags}
),
- self._make_chunk(
- extract.DataChunk(data=[summary]),
- merge_metadata(metadata, {"type": "summary", "tags": tags}),
+ extract.DataChunk(
+ data=[summary], metadata={"type": "summary", "tags": tags}
),
]
@@ -596,7 +592,7 @@ class AgentObservation(SourceItem):
)
semantic_chunk = extract.DataChunk(
data=[semantic_text],
- metadata=merge_metadata(metadata, {"embedding_type": "semantic"}),
+ metadata=extract.merge_metadata(metadata, {"embedding_type": "semantic"}),
modality="semantic",
)
@@ -609,7 +605,7 @@ class AgentObservation(SourceItem):
)
temporal_chunk = extract.DataChunk(
data=[temporal_text],
- metadata=merge_metadata(metadata, {"embedding_type": "temporal"}),
+ metadata=extract.merge_metadata(metadata, {"embedding_type": "temporal"}),
modality="temporal",
)
@@ -617,14 +613,14 @@ class AgentObservation(SourceItem):
self._make_chunk(
extract.DataChunk(
data=[i],
- metadata=merge_metadata(metadata, {"embedding_type": "semantic"}),
+ metadata=extract.merge_metadata(
+ metadata, {"embedding_type": "semantic"}
+ ),
modality="semantic",
)
)
for i in [
self.content,
- self.subject,
- self.observation_type,
self.evidence.get("quote", ""),
]
if i
diff --git a/src/memory/common/embedding.py b/src/memory/common/embedding.py
index 44ae4ea..694d04d 100644
--- a/src/memory/common/embedding.py
+++ b/src/memory/common/embedding.py
@@ -1,5 +1,5 @@
import logging
-from typing import Iterable, Literal, cast
+from typing import Literal, cast
import voyageai
@@ -15,12 +15,22 @@ from memory.common.db.models import Chunk, SourceItem
logger = logging.getLogger(__name__)
+def as_string(
+ chunk: extract.MulitmodalChunk | list[extract.MulitmodalChunk],
+) -> str:
+ if isinstance(chunk, str):
+ return chunk.strip()
+ if isinstance(chunk, list):
+ return "\n".join(as_string(i) for i in chunk).strip()
+ return ""
+
+
def embed_chunks(
chunks: list[list[extract.MulitmodalChunk]],
model: str = settings.TEXT_EMBEDDING_MODEL,
input_type: Literal["document", "query"] = "document",
) -> list[Vector]:
- logger.debug(f"Embedding chunks: {model} - {str(chunks)[:100]} {len(chunks)}")
+ logger.debug(f"Embedding chunks: {model} - {str(chunks)} {len(chunks)}")
vo = voyageai.Client() # type: ignore
if model == settings.MIXED_EMBEDDING_MODEL:
return vo.multimodal_embed(
@@ -29,17 +39,18 @@ def embed_chunks(
input_type=input_type,
).embeddings
- texts = ["\n".join(i for i in c if isinstance(i, str)) for c in chunks]
+ texts = [as_string(c) for c in chunks]
+ logger.debug(f"Embedding texts: {texts}")
return cast(
list[Vector], vo.embed(texts, model=model, input_type=input_type).embeddings
)
def break_chunk(
- chunk: list[extract.MulitmodalChunk], chunk_size: int = DEFAULT_CHUNK_TOKENS
+ chunk: extract.DataChunk, chunk_size: int = DEFAULT_CHUNK_TOKENS
) -> list[extract.MulitmodalChunk]:
result = []
- for c in chunk:
+ for c in chunk.data:
if isinstance(c, str):
result += chunk_text(c, chunk_size, OVERLAP_TOKENS)
else:
@@ -48,12 +59,12 @@ def break_chunk(
def embed_text(
- chunks: list[list[extract.MulitmodalChunk]],
+ chunks: list[extract.DataChunk],
model: str = settings.TEXT_EMBEDDING_MODEL,
input_type: Literal["document", "query"] = "document",
chunk_size: int = DEFAULT_CHUNK_TOKENS,
) -> list[Vector]:
- chunked_chunks = [break_chunk(chunk, chunk_size) for chunk in chunks]
+ chunked_chunks = [break_chunk(chunk, chunk_size) for chunk in chunks if chunk.data]
if not any(chunked_chunks):
return []
@@ -61,12 +72,12 @@ def embed_text(
def embed_mixed(
- items: list[list[extract.MulitmodalChunk]],
+ items: list[extract.DataChunk],
model: str = settings.MIXED_EMBEDDING_MODEL,
input_type: Literal["document", "query"] = "document",
chunk_size: int = DEFAULT_CHUNK_TOKENS,
) -> list[Vector]:
- chunked_chunks = [break_chunk(item, chunk_size) for item in items]
+ chunked_chunks = [break_chunk(item, chunk_size) for item in items if item.data]
return embed_chunks(chunked_chunks, model, input_type)
diff --git a/src/memory/common/extract.py b/src/memory/common/extract.py
index 18880eb..3df07df 100644
--- a/src/memory/common/extract.py
+++ b/src/memory/common/extract.py
@@ -6,7 +6,7 @@ import tempfile
from contextlib import contextmanager
from typing import Any, Generator, Sequence, cast
-from memory.common import chunker
+from memory.common import chunker, summarizer
import pymupdf # PyMuPDF
import pypandoc
from PIL import Image
@@ -16,6 +16,16 @@ logger = logging.getLogger(__name__)
MulitmodalChunk = Image.Image | str
+def merge_metadata(*metadata: dict[str, Any]) -> dict[str, Any]:
+ final = {}
+ for m in metadata:
+ data = m.copy()
+ if tags := set(data.pop("tags", []) or []):
+ final["tags"] = tags | final.get("tags", set())
+ final |= data
+ return final
+
+
@dataclass
class DataChunk:
data: Sequence[MulitmodalChunk]
@@ -109,7 +119,9 @@ def extract_image(content: bytes | str | pathlib.Path) -> list[DataChunk]:
def extract_text(
- content: bytes | str | pathlib.Path, chunk_size: int | None = None
+ content: bytes | str | pathlib.Path,
+ chunk_size: int | None = None,
+ metadata: dict[str, Any] = {},
) -> list[DataChunk]:
if isinstance(content, pathlib.Path):
content = content.read_text()
@@ -117,8 +129,20 @@ def extract_text(
content = content.decode("utf-8")
content = cast(str, content)
- chunks = chunker.chunk_text(content, chunk_size or chunker.DEFAULT_CHUNK_TOKENS)
- return [DataChunk(data=[c], mime_type="text/plain") for c in chunks if c.strip()]
+    chunks = [
+        # Copy the metadata per chunk so callers can safely mutate one chunk's
+        # metadata (e.g. marking a summary) without affecting its siblings.
+        DataChunk(data=[c], modality="text", metadata=dict(metadata))
+        for c in chunker.chunk_text(content, chunk_size or chunker.DEFAULT_CHUNK_TOKENS)
+    ]
+ if content and len(content) > chunker.DEFAULT_CHUNK_TOKENS * 2:
+ summary, tags = summarizer.summarize(content)
+ chunks.append(
+ DataChunk(
+ data=[summary],
+ metadata=merge_metadata(metadata, {"tags": tags}),
+ modality="text",
+ )
+ )
+ return chunks
def extract_data_chunks(
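`merge_metadata` (now living in `extract`) unions `tags` across all inputs and lets later dicts overwrite every other key. A small behavior sketch:

```python
from memory.common.extract import merge_metadata

a = {"tags": ["python"], "type": "page", "source": "book"}
b = {"tags": ["rust"], "type": "summary"}

merged = merge_metadata(a, b)
# "tags" are unioned into a set; "type" is overwritten by the later dict;
# "source" survives untouched.
assert merged == {"tags": {"python", "rust"}, "type": "summary", "source": "book"}
```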
diff --git a/src/memory/common/qdrant.py b/src/memory/common/qdrant.py
index 106ee22..5e0121e 100644
--- a/src/memory/common/qdrant.py
+++ b/src/memory/common/qdrant.py
@@ -3,7 +3,7 @@ from typing import Any, cast, Generator, Sequence
import qdrant_client
from qdrant_client.http import models as qdrant_models
-from qdrant_client.http.exceptions import UnexpectedResponse
+from qdrant_client.http.exceptions import UnexpectedResponse, ApiException
from memory.common import settings
from memory.common.collections import ALL_COLLECTIONS, Collection, DistanceType, Vector
@@ -193,14 +193,18 @@ def delete_points(
collection_name: Name of the collection
ids: List of vector IDs to delete
"""
- client.delete(
- collection_name=collection_name,
- points_selector=qdrant_models.PointIdsList(
- points=ids, # type: ignore
- ),
- )
+ try:
+ client.delete(
+ collection_name=collection_name,
+ points_selector=qdrant_models.PointIdsList(
+ points=ids, # type: ignore
+ ),
+ )
- logger.debug(f"Deleted {len(ids)} vectors from {collection_name}")
+ logger.debug(f"Deleted {len(ids)} vectors from {collection_name}")
+ except (ApiException, UnexpectedResponse) as e:
+ logger.error(f"Error deleting points from {collection_name}: {e}")
+        raise IOError(f"Error deleting points from {collection_name}: {e}") from e
def get_collection_info(
diff --git a/src/memory/common/summarizer.py b/src/memory/common/summarizer.py
index 8d6fff6..981ced6 100644
--- a/src/memory/common/summarizer.py
+++ b/src/memory/common/summarizer.py
@@ -1,5 +1,6 @@
import json
import logging
+import traceback
from typing import Any
from memory.common import settings, chunker
@@ -131,6 +132,7 @@ def summarize(content: str, target_tokens: int | None = None) -> tuple[str, list
summary = result.get("summary", "")
tags = result.get("tags", [])
except Exception as e:
+ traceback.print_exc()
logger.error(f"Summarization failed: {e}")
tokens = chunker.approx_token_count(summary)
diff --git a/src/memory/workers/tasks/ebook.py b/src/memory/workers/tasks/ebook.py
index 189f243..16cafdd 100644
--- a/src/memory/workers/tasks/ebook.py
+++ b/src/memory/workers/tasks/ebook.py
@@ -60,6 +60,7 @@ def section_processor(
end_page=section.end_page,
parent_section_id=None, # Will be set after flush
content=content,
+ filename=book.file_path,
size=len(content),
mime_type="text/plain",
sha256=create_content_hash(
diff --git a/src/memory/workers/tasks/maintenance.py b/src/memory/workers/tasks/maintenance.py
index 998de9b..c831b11 100644
--- a/src/memory/workers/tasks/maintenance.py
+++ b/src/memory/workers/tasks/maintenance.py
@@ -21,6 +21,7 @@ REINGEST_MISSING_CHUNKS = f"{MAINTENANCE_ROOT}.reingest_missing_chunks"
REINGEST_CHUNK = f"{MAINTENANCE_ROOT}.reingest_chunk"
REINGEST_ITEM = f"{MAINTENANCE_ROOT}.reingest_item"
REINGEST_EMPTY_SOURCE_ITEMS = f"{MAINTENANCE_ROOT}.reingest_empty_source_items"
+REINGEST_ALL_EMPTY_SOURCE_ITEMS = f"{MAINTENANCE_ROOT}.reingest_all_empty_source_items"
UPDATE_METADATA_FOR_SOURCE_ITEMS = (
f"{MAINTENANCE_ROOT}.update_metadata_for_source_items"
)
@@ -76,9 +77,9 @@ def reingest_chunk(chunk_id: str, collection: str):
data = chunk.data
if collection in collections.MULTIMODAL_COLLECTIONS:
- vector = embedding.embed_mixed(data)[0]
- elif len(data) == 1 and isinstance(data[0], str):
- vector = embedding.embed_text([data[0]])[0]
+ vector = embedding.embed_mixed([data])[0]
+ elif collection in collections.TEXT_COLLECTIONS:
+ vector = embedding.embed_text([data])[0]
else:
raise ValueError(f"Unsupported data type for collection {collection}")
@@ -123,7 +124,10 @@ def reingest_item(item_id: str, item_type: str):
chunk_ids = [str(c.id) for c in item.chunks if c.id]
if chunk_ids:
client = qdrant.get_qdrant_client()
- qdrant.delete_points(client, item.modality, chunk_ids)
+ try:
+ qdrant.delete_points(client, item.modality, chunk_ids)
+ except IOError as e:
+ logger.error(f"Error deleting chunks for {item_id}: {e}")
for chunk in item.chunks:
session.delete(chunk)
@@ -151,6 +155,13 @@ def reingest_empty_source_items(item_type: str):
return {"status": "success", "items": len(item_ids)}
+@app.task(name=REINGEST_ALL_EMPTY_SOURCE_ITEMS)
+def reingest_all_empty_source_items():
+ logger.info("Reingesting all empty source items")
+    for item_type in SourceItem.registry._class_registry.keys():
+        # Skip SQLAlchemy-internal registry entries (e.g. _sa_module_registry)
+        if item_type.startswith("_"):
+            continue
+        reingest_empty_source_items.delay(item_type)  # type: ignore
+
+
def check_batch(batch: Sequence[Chunk]) -> dict:
client = qdrant.get_qdrant_client()
by_collection = defaultdict(list)
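The new fan-out task simply enqueues one `reingest_empty_source_items` per registered `SourceItem` subclass. A hedged usage sketch for triggering it programmatically (assumes a configured Celery broker; the CLI wrapper added in `run_celery_task.py` does the same thing):

```python
from memory.workers.tasks.maintenance import reingest_all_empty_source_items

# Fire-and-forget: the task itself fans out one reingest_empty_source_items
# per registered SourceItem subclass.
result = reingest_all_empty_source_items.delay()
print(result.id)  # Celery task id, useful for tracking
```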
diff --git a/tests/conftest.py b/tests/conftest.py
index aae9b51..e90eec2 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -234,8 +234,10 @@ def mock_voyage_client():
def embeder(chunks, *args, **kwargs):
return Mock(embeddings=[[0.1] * 1024] * len(chunks))
+ real_client = voyageai.Client
with patch.object(voyageai, "Client", autospec=True) as mock_client:
client = mock_client()
+ client.real_client = real_client
client.embed = embeder
client.multimodal_embed = embeder
yield client
@@ -251,7 +253,7 @@ def mock_openai_client():
choices=[
Mock(
message=Mock(
- content='{"summary": "test", "tags": ["tag1", "tag2"]}'
+ content='{"summary": "test summary", "tags": ["tag1", "tag2"]}'
)
)
]
@@ -267,7 +269,9 @@ def mock_anthropic_client():
client.messages = Mock()
client.messages.create = Mock(
return_value=Mock(
- content=[Mock(text='{"summary": "test", "tags": ["tag1", "tag2"]}')]
+ content=[
+ Mock(text='{"summary": "test summary", "tags": ["tag1", "tag2"]}')
+ ]
)
)
yield client
diff --git a/tests/data/code_complexity.jpg b/tests/data/code_complexity.jpg
new file mode 100644
index 0000000..0d0cb0e
Binary files /dev/null and b/tests/data/code_complexity.jpg differ
diff --git a/tests/data/contents.py b/tests/data/contents.py
new file mode 100644
index 0000000..44ce152
--- /dev/null
+++ b/tests/data/contents.py
@@ -0,0 +1,249 @@
+import hashlib
+import pathlib
+from bs4 import BeautifulSoup
+from markdownify import markdownify
+from PIL import Image
+
+DATA_DIR = pathlib.Path(__file__).parent
+
+SAMPLE_HTML = f"""
+
+
+ The Evolution of Programming Languages
+
+ Programming languages have undergone tremendous evolution since the early days of computing.
+ From the machine code and assembly languages of the 1940s to the high-level, expressive languages
+ we use today, each generation has built upon the lessons learned from its predecessors. Languages
+ like FORTRAN and COBOL pioneered the concept of human-readable code, while later innovations like
+ object-oriented programming in languages such as Smalltalk and C++ revolutionized how we structure
+ and organize our programs.
+
+
+
+ The rise of functional programming paradigms has brought mathematical rigor and immutability
+ to the forefront of software development. Languages like Haskell, Lisp, and more recently Rust
+ and Elm have demonstrated the power of pure functions and type systems in creating more reliable
+ and maintainable code. These paradigms emphasize the elimination of side effects and the treatment
+ of computation as the evaluation of mathematical functions.
+ Modern development has also seen the emergence of domain-specific languages and the resurgence
+ of interest in memory safety. The advent of languages like Python and JavaScript has democratized
+ programming by lowering the barrier to entry, while systems languages like Rust have proven that
+ performance and safety need not be mutually exclusive. The ongoing development of WebAssembly
+ promises to bring high-performance computing to web browsers in ways previously unimaginable.
+
+
+
+ Looking toward the future, we see emerging trends in quantum programming languages, AI-assisted
+ code generation, and the continued evolution toward more expressive type systems. The challenge
+ for tomorrow's language designers will be balancing expressiveness with simplicity, performance
+ with safety, and innovation with backward compatibility. As computing continues to permeate every
+ aspect of human life, the languages we use to command these machines will undoubtedly continue
+ to evolve and shape the digital landscape.
+
+ The emergence of cloud computing and distributed systems has also driven new paradigms in
+ language design. Languages like Go and Elixir have been specifically crafted to excel in
+ concurrent and distributed environments, while the rise of microservices has renewed interest
+ in polyglot programming approaches. These developments reflect a broader shift toward languages
+ that are not just powerful tools for individual developers, but robust foundations for building
+ scalable, resilient systems that can handle the demands of modern internet-scale applications.
+
+ Perhaps most intriguingly, the intersection of programming languages with artificial intelligence
+ is opening entirely new frontiers. Differentiable programming languages are enabling new forms of
+ machine learning research, while large language models are beginning to reshape how we think about
+ code generation and developer tooling. As we stand on the brink of an era where AI systems may
+ become active participants in the programming process itself, the very nature of what constitutes
+ a programming language—and who or what programs in it—may be fundamentally transformed.
+
+
+"""
+SECOND_PAGE = """
+
+
The Impact of Open Source on Language Development
+
+
The open source movement has fundamentally transformed how programming languages are developed,
+ distributed, and evolved. Unlike the proprietary languages of earlier decades, modern language
+ development often occurs in public repositories where thousands of contributors can participate
+ in the design process. Languages like Python, JavaScript, and Rust have benefited enormously
+ from this collaborative approach, with their ecosystems growing rapidly through community-driven
+ package managers and extensive third-party libraries.
+
+
This democratization of language development has led to faster innovation cycles and more
+ responsive adaptation to developer needs. When a language feature proves problematic or a new
+ paradigm emerges, open source languages can quickly incorporate changes through their community
+ governance processes. The result has been an unprecedented period of language experimentation
+ and refinement, where ideas can be tested, refined, and adopted across multiple language
+ communities simultaneously.
+
+
Furthermore, the open source model has enabled the rise of domain-specific languages that
+ might never have been commercially viable under traditional development models. From specialized
+ query languages for databases to configuration management tools, the low barrier to entry for
+ language creation has fostered an explosion of linguistic diversity in computing, each tool
+ optimized for specific problem domains and user communities.
+
+
The collaborative nature of open source development has also revolutionized language tooling
+ and developer experience. Modern languages benefit from rich ecosystems of editors, debuggers,
+ profilers, and static analysis tools, all developed by passionate communities who understand
+ the daily challenges faced by practitioners. This has created a virtuous cycle where better
+ tooling attracts more developers, who in turn contribute improvements that make the language
+ even more accessible and powerful.
+
+
Version control systems like Git have enabled unprecedented transparency in language evolution,
+ allowing developers to trace the reasoning behind every design decision through detailed commit
+ histories and issue discussions. This historical record serves not only as documentation but as
+ a learning resource for future language designers, helping them understand the trade-offs and
+ considerations that shaped successful language features.
+
+
The economic implications of open source language development cannot be overstated. By removing
+ licensing barriers and vendor lock-in, open source languages have democratized access to powerful
+ programming tools across the globe. This has enabled innovation in regions and sectors that might
+ otherwise have been excluded from the software revolution, fostering a truly global community of
+ software creators and problem solvers.
+
+"""
+
+CHUNKS: list[str] = [
+ """The Evolution of Programming Languages
+======================================
+Programming languages have undergone tremendous evolution since the early days of computing.
+ From the machine code and assembly languages of the 1940s to the high\\-level, expressive languages
+ we use today, each generation has built upon the lessons learned from its predecessors. Languages
+ like FORTRAN and COBOL pioneered the concept of human\\-readable code, while later innovations like
+ object\\-oriented programming in languages such as Smalltalk and C\\+\\+ revolutionized how we structure
+ and organize our programs.
+
+The rise of functional programming paradigms has brought mathematical rigor and immutability
+ to the forefront of software development. Languages like Haskell, Lisp, and more recently Rust
+ and Elm have demonstrated the power of pure functions and type systems in creating more reliable
+ and maintainable code. These paradigms emphasize the elimination of side effects and the treatment
+ of computation as the evaluation of mathematical functions.
+Modern development has also seen the emergence of domain\\-specific languages and the resurgence
+ of interest in memory safety. The advent of languages like Python and JavaScript has democratized
+ programming by lowering the barrier to entry, while systems languages like Rust have proven that
+ performance and safety need not be mutually exclusive. The ongoing development of WebAssembly
+ promises to bring high\\-performance computing to web browsers in ways previously unimaginable.
+
+Looking toward the future, we see emerging trends in quantum programming languages, AI\\-assisted
+ code generation, and the continued evolution toward more expressive type systems. The challenge
+ for tomorrow's language designers will be balancing expressiveness with simplicity, performance
+ with safety, and innovation with backward compatibility. As computing continues to permeate every
+ aspect of human life, the languages we use to command these machines will undoubtedly continue
+ to evolve and shape the digital landscape.""",
+ """
+As computing continues to permeate every
+ aspect of human life, the languages we use to command these machines will undoubtedly continue
+ to evolve and shape the digital landscape.
+The emergence of cloud computing and distributed systems has also driven new paradigms in
+ language design. Languages like Go and Elixir have been specifically crafted to excel in
+ concurrent and distributed environments, while the rise of microservices has renewed interest
+ in polyglot programming approaches. These developments reflect a broader shift toward languages
+ that are not just powerful tools for individual developers, but robust foundations for building
+ scalable, resilient systems that can handle the demands of modern internet\\-scale applications.
+Perhaps most intriguingly, the intersection of programming languages with artificial intelligence
+ is opening entirely new frontiers. Differentiable programming languages are enabling new forms of
+ machine learning research, while large language models are beginning to reshape how we think about
+ code generation and developer tooling. As we stand on the brink of an era where AI systems may
+ become active participants in the programming process itself, the very nature of what constitutes
+ a programming language—and who or what programs in it—may be fundamentally transformed.""",
+]
+TWO_PAGE_CHUNKS: list[str] = [
+ """
+The Evolution of Programming Languages
+======================================
+Programming languages have undergone tremendous evolution since the early days of computing.
+ From the machine code and assembly languages of the 1940s to the high\\-level, expressive languages
+ we use today, each generation has built upon the lessons learned from its predecessors. Languages
+ like FORTRAN and COBOL pioneered the concept of human\\-readable code, while later innovations like
+ object\\-oriented programming in languages such as Smalltalk and C\\+\\+ revolutionized how we structure
+ and organize our programs.
+
+The rise of functional programming paradigms has brought mathematical rigor and immutability
+ to the forefront of software development. Languages like Haskell, Lisp, and more recently Rust
+ and Elm have demonstrated the power of pure functions and type systems in creating more reliable
+ and maintainable code. These paradigms emphasize the elimination of side effects and the treatment
+ of computation as the evaluation of mathematical functions.
+Modern development has also seen the emergence of domain\\-specific languages and the resurgence
+ of interest in memory safety. The advent of languages like Python and JavaScript has democratized
+ programming by lowering the barrier to entry, while systems languages like Rust have proven that
+ performance and safety need not be mutually exclusive. The ongoing development of WebAssembly
+ promises to bring high\\-performance computing to web browsers in ways previously unimaginable.
+
+Looking toward the future, we see emerging trends in quantum programming languages, AI\\-assisted
+ code generation, and the continued evolution toward more expressive type systems. The challenge
+ for tomorrow's language designers will be balancing expressiveness with simplicity, performance
+ with safety, and innovation with backward compatibility. As computing continues to permeate every
+ aspect of human life, the languages we use to command these machines will undoubtedly continue
+ to evolve and shape the digital landscape.
+""",
+ """
+As computing continues to permeate every
+ aspect of human life, the languages we use to command these machines will undoubtedly continue
+ to evolve and shape the digital landscape.
+The emergence of cloud computing and distributed systems has also driven new paradigms in
+ language design. Languages like Go and Elixir have been specifically crafted to excel in
+ concurrent and distributed environments, while the rise of microservices has renewed interest
+ in polyglot programming approaches. These developments reflect a broader shift toward languages
+ that are not just powerful tools for individual developers, but robust foundations for building
+ scalable, resilient systems that can handle the demands of modern internet\\-scale applications.
+Perhaps most intriguingly, the intersection of programming languages with artificial intelligence
+ is opening entirely new frontiers. Differentiable programming languages are enabling new forms of
+ machine learning research, while large language models are beginning to reshape how we think about
+ code generation and developer tooling. As we stand on the brink of an era where AI systems may
+ become active participants in the programming process itself, the very nature of what constitutes
+ a programming language—and who or what programs in it—may be fundamentally transformed.
+The Impact of Open Source on Language Development
+-------------------------------------------------
+The open source movement has fundamentally transformed how programming languages are developed,
+ distributed, and evolved. Unlike the proprietary languages of earlier decades, modern language
+ development often occurs in public repositories where thousands of contributors can participate
+ in the design process. Languages like Python, JavaScript, and Rust have benefited enormously
+ from this collaborative approach, with their ecosystems growing rapidly through community\\-driven
+ package managers and extensive third\\-party libraries.
+This democratization of language development has led to faster innovation cycles and more
+ responsive adaptation to developer needs. When a language feature proves problematic or a new
+ paradigm emerges, open source languages can quickly incorporate changes through their community
+ governance processes. The result has been an unprecedented period of language experimentation
+ and refinement, where ideas can be tested, refined, and adopted across multiple language
+ communities simultaneously.""",
+ """
+The result has been an unprecedented period of language experimentation
+ and refinement, where ideas can be tested, refined, and adopted across multiple language
+ communities simultaneously.
+Furthermore, the open source model has enabled the rise of domain\\-specific languages that
+ might never have been commercially viable under traditional development models. From specialized
+ query languages for databases to configuration management tools, the low barrier to entry for
+ language creation has fostered an explosion of linguistic diversity in computing, each tool
+ optimized for specific problem domains and user communities.
+The collaborative nature of open source development has also revolutionized language tooling
+ and developer experience. Modern languages benefit from rich ecosystems of editors, debuggers,
+ profilers, and static analysis tools, all developed by passionate communities who understand
+ the daily challenges faced by practitioners. This has created a virtuous cycle where better
+ tooling attracts more developers, who in turn contribute improvements that make the language
+ even more accessible and powerful.
+Version control systems like Git have enabled unprecedented transparency in language evolution,
+ allowing developers to trace the reasoning behind every design decision through detailed commit
+ histories and issue discussions. This historical record serves not only as documentation but as
+ a learning resource for future language designers, helping them understand the trade\\-offs and
+ considerations that shaped successful language features.
+The economic implications of open source language development cannot be overstated. By removing
+ licensing barriers and vendor lock\\-in, open source languages have democratized access to powerful
+ programming tools across the globe. This has enabled innovation in regions and sectors that might
+ otherwise have been excluded from the software revolution, fostering a truly global community of
+ software creators and problem solvers.
+""",
+]
+
+SAMPLE_MARKDOWN = markdownify(SAMPLE_HTML)
+SAMPLE_TEXT = BeautifulSoup(SAMPLE_HTML, "html.parser").get_text()
+SECOND_PAGE_MARKDOWN = markdownify(SECOND_PAGE)
+SECOND_PAGE_TEXT = BeautifulSoup(SECOND_PAGE, "html.parser").get_text()
+
+
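+# SHA-256 of the raw pixel data, used to compare images by content in tests.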
+def image_hash(image: Image.Image) -> str:
+ return hashlib.sha256(image.tobytes()).hexdigest()
+
+
+LANG_TIMELINE = Image.open(DATA_DIR / "lang_timeline.png")
+CODE_COMPLEXITY = Image.open(DATA_DIR / "code_complexity.jpg")
+LANG_TIMELINE_HASH = image_hash(LANG_TIMELINE)
+CODE_COMPLEXITY_HASH = image_hash(CODE_COMPLEXITY)
diff --git a/tests/data/lang_timeline.png b/tests/data/lang_timeline.png
new file mode 100644
index 0000000..5201eb9
Binary files /dev/null and b/tests/data/lang_timeline.png differ
diff --git a/tests/integration/test_real_queries.py b/tests/integration/test_real_queries.py
new file mode 100644
index 0000000..36f0415
--- /dev/null
+++ b/tests/integration/test_real_queries.py
@@ -0,0 +1,1118 @@
+import hashlib
+import itertools
+from datetime import datetime
+from unittest.mock import patch
+
+import pytest
+import voyageai
+
+import memory.common.qdrant as qdrant_tools
+from memory.common import extract
+from memory.common.db.models.source_item import SourceItem
+from memory.common.db.models.source_items import (
+ AgentObservation,
+)
+from memory.common.embedding import embed_source_item, embed_text
+from memory.workers.tasks.content_processing import push_to_qdrant
+from tests.data.contents import SAMPLE_MARKDOWN
+
+
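+# NOTE: these tests call the real Voyage embedding API (the real_voyage_client
+# fixture below swaps the mocked client back for the real one), so they need
+# network access and an API key; the hard-coded scores act as regression
+# snapshots.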
+@pytest.fixture
+def real_voyage_client(mock_voyage_client):
+ real_client = mock_voyage_client.real_client
+ with patch.object(voyageai, "Client", real_client):
+ yield real_client
+
+
+def test_real_source_item_embeddings(real_voyage_client, qdrant):
+ item = SourceItem(
+ id=1,
+ content=SAMPLE_MARKDOWN,
+ mime_type="text/html",
+ modality="text",
+ sha256=hashlib.sha256(SAMPLE_MARKDOWN.encode("utf-8")).hexdigest(),
+ size=len(SAMPLE_MARKDOWN),
+ tags=["bla"],
+ embed_status="QUEUED",
+ )
+
+ part1, part2, summary = embed_source_item(item)
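+ # Pin chunk ids to fixed UUIDs so the Qdrant hits can be matched back to each chunk.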
+ part1.id = "00000000-0000-0000-0000-000000000000" # type: ignore
+ part2.id = "00000000-0000-0000-0000-000000000001" # type: ignore
+ summary.id = "00000000-0000-0000-0000-000000000002" # type: ignore
+ push_to_qdrant([item])
+
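+ # query -> expected similarity scores against (part 1, part 2, summary),
+ # asserted below with an abs=0.1 tolerance.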
+ queries = {
+ "how have programming languages changed?": [0.6756747, 0.6319432, 0.26348075],
+ "evolution of programming languages since 1940s": [0.690, 0.594, 0.330],
+ "functional programming paradigms and immutability": [0.507, 0.412, 0.276],
+ "memory safety in systems programming languages": [0.487, 0.458, 0.348],
+ "FORTRAN and COBOL pioneering human-readable code": [0.535, 0.458, 0.296],
+ "Rust and type systems for reliable code": [0.585, 0.506, 0.456],
+ "WebAssembly high-performance web computing": [0.469, 0.426, 0.296],
+ "object-oriented programming innovations": [0.510, 0.492, 0.333],
+ "cloud computing and distributed systems": [0.40005407, 0.56048, 0.37348732],
+ "AI-assisted code generation trends": [0.51078045, 0.5828345, 0.31309962],
+ "microservices and polyglot programming": [0.5072756, 0.63991153, 0.38507754],
+ "Python JavaScript democratizing programming": [0.524, 0.517, 0.320],
+ "software development methodologies": [0.454, 0.440, 0.356],
+ "computer science history": [0.517, 0.454, 0.299],
+ "programming paradigms comparison": [0.589, 0.525, 0.352],
+ "developer tools and ecosystems": [0.42297083, 0.52246743, 0.39521465],
+ "modern computing trends": [0.5172996, 0.5883902, 0.30886292],
+ "database query languages": [0.47420773, 0.48987937, 0.41980737],
+ "network programming protocols": [0.3547029, 0.42228842, 0.39325726],
+ "machine learning algorithms": [0.39660394, 0.47512275, 0.45423454],
+ "web browser technologies": [0.467, 0.449, 0.439],
+ "software architecture patterns": [0.4430701, 0.4969077, 0.3775082],
+ "mobile app user interface design": [0.2754, 0.332, 0.3863],
+ "cybersecurity threat detection": [0.3436677, 0.38349956, 0.36111486],
+ "project management methodologies": [0.3377, 0.34, 0.3573],
+ "cooking Italian pasta recipes": [0.2627, 0.2388, 0.3065],
+ "professional basketball statistics": [0.2811, 0.2454, 0.3411],
+ "gardening tips for beginners": [0.2953, 0.2848, 0.3309],
+ "travel destinations in Europe": [0.2595, 0.2514, 0.3039],
+ "classical music composers": [0.3066, 0.2838, 0.3173],
+ }
+ for query, (p1, p2, s) in queries.items():
+ search_vector = embed_text(
+ [extract.DataChunk(data=[query])], input_type="query"
+ )[0]
+ results = qdrant_tools.search_vectors(qdrant, "text", search_vector)
+ expected = sorted(
+ [
+ (part1.id, p1),
+ (part2.id, p2),
+ (summary.id, s),
+ ],
+ key=lambda x: x[1],
+ reverse=True,
+ )
+ assert [(r.id, pytest.approx(r.score, abs=0.1)) for r in results] == expected
+
+
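+# Regression snapshot: query -> expected top-4 (score, chunk content) hits
+# from the "semantic" and "temporal" observation collections.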
+EXPECTED_OBSERVATION_RESULTS = {
+ "What does the user think about functional programming?": {
+ "semantic": [
+ (
+ 0.7104,
+ "The user believes functional programming leads to better code quality",
+ ),
+ (0.6792, "I prefer functional programming over OOP"),
+ (
+ 0.6772,
+ "Subject: programming_philosophy | Type: belief | Observation: The user believes functional programming leads to better code quality | Quote: Functional programming produces more maintainable code",
+ ),
+ (
+ 0.6677,
+ "Subject: programming_paradigms | Type: preference | Observation: The user prefers functional programming over OOP | Quote: I prefer functional programming over OOP",
+ ),
+ ],
+ "temporal": [
+ (
+ 0.5816,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: programming_philosophy | Observation: The user believes functional programming leads to better code quality | Confidence: 0.8",
+ ),
+ (
+ 0.5246,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: programming_paradigms | Observation: The user prefers functional programming over OOP | Confidence: 0.8",
+ ),
+ (
+ 0.5214,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: pure_functions | Observation: The user said pure functions are yucky | Confidence: 0.8",
+ ),
+ (
+ 0.4645,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: refactoring | Observation: The user always refactors to pure functions | Confidence: 0.8",
+ ),
+ ],
+ },
+ "Does the user prefer functional or object-oriented programming?": {
+ "semantic": [
+ (0.7718, "The user prefers functional programming over OOP"),
+ (
+ 0.754,
+ "Subject: programming_paradigms | Type: preference | Observation: The user prefers functional programming over OOP | Quote: I prefer functional programming over OOP",
+ ),
+ (0.7454, "I prefer functional programming over OOP"),
+ (
+ 0.6541,
+ "Subject: programming_philosophy | Type: belief | Observation: The user believes functional programming leads to better code quality | Quote: Functional programming produces more maintainable code",
+ ),
+ ],
+ "temporal": [
+ (
+ 0.6188,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: programming_paradigms | Observation: The user prefers functional programming over OOP | Confidence: 0.8",
+ ),
+ (
+ 0.5902,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: programming_philosophy | Observation: The user believes functional programming leads to better code quality | Confidence: 0.8",
+ ),
+ (
+ 0.5144,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: pure_functions | Observation: The user said pure functions are yucky | Confidence: 0.8",
+ ),
+ (
+ 0.4989,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: refactoring | Observation: The user always refactors to pure functions | Confidence: 0.8",
+ ),
+ ],
+ },
+ "What are the user's beliefs about code quality?": {
+ "semantic": [
+ (0.6925, "The user believes code reviews are essential for quality"),
+ (
+ 0.68,
+ "The user believes functional programming leads to better code quality",
+ ),
+ (
+ 0.6524,
+ "Subject: code_quality | Type: belief | Observation: The user believes code reviews are essential for quality | Quote: Code reviews catch bugs that automated testing misses",
+ ),
+ (
+ 0.6466,
+ "Subject: programming_philosophy | Type: belief | Observation: The user believes functional programming leads to better code quality | Quote: Functional programming produces more maintainable code",
+ ),
+ ],
+ "temporal": [
+ (
+ 0.5544,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: code_quality | Observation: The user believes code reviews are essential for quality | Confidence: 0.8",
+ ),
+ (
+ 0.5397,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: programming_philosophy | Observation: The user believes functional programming leads to better code quality | Confidence: 0.8",
+ ),
+ (
+ 0.4931,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: testing_philosophy | Observation: The user believes unit tests are a waste of time for prototypes | Confidence: 0.8",
+ ),
+ (
+ 0.4674,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: pure_functions | Observation: The user said pure functions are yucky | Confidence: 0.8",
+ ),
+ ],
+ },
+ "How does the user approach debugging code?": {
+ "semantic": [
+ (
+ 0.7011,
+ "Subject: debugging_approach | Type: behavior | Observation: The user debugs by adding print statements rather than using a debugger | Quote: When debugging, I just add console.log everywhere",
+ ),
+ (
+ 0.6962,
+ "The user debugs by adding print statements rather than using a debugger",
+ ),
+ (0.6788, "When debugging, I just add console.log everywhere"),
+ (
+ 0.5357,
+ "Subject: code_quality | Type: belief | Observation: The user believes code reviews are essential for quality | Quote: Code reviews catch bugs that automated testing misses",
+ ),
+ ],
+ "temporal": [
+ (
+ 0.6252,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: debugging_approach | Observation: The user debugs by adding print statements rather than using a debugger | Confidence: 0.8",
+ ),
+ (
+ 0.476,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: indentation_preference | Observation: The user claims to prefer tabs but their code uses spaces | Confidence: 0.8",
+ ),
+ (
+ 0.4424,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: version_control_style | Observation: The user prefers small, focused commits over large feature branches | Confidence: 0.8",
+ ),
+ (
+ 0.4402,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: testing_philosophy | Observation: The user believes unit tests are a waste of time for prototypes | Confidence: 0.8",
+ ),
+ ],
+ },
+ "What are the user's git and version control habits?": {
+ "semantic": [
+ (
+ 0.6474,
+ "Subject: version_control_style | Type: preference | Observation: The user prefers small, focused commits over large feature branches | Quote: I like to commit small, logical changes frequently",
+ ),
+ (0.6424, "I like to commit small, logical changes frequently"),
+ (
+ 0.5961,
+ "The user prefers small, focused commits over large feature branches",
+ ),
+ (
+ 0.5806,
+ "Subject: git_habits | Type: behavior | Observation: The user writes commit messages in present tense | Quote: Fix bug in parser instead of Fixed bug in parser",
+ ),
+ ],
+ "temporal": [
+ (
+ 0.6174,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: version_control_style | Observation: The user prefers small, focused commits over large feature branches | Confidence: 0.8",
+ ),
+ (
+ 0.5733,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: git_habits | Observation: The user writes commit messages in present tense | Confidence: 0.8",
+ ),
+ (
+ 0.4848,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: editor_preference | Observation: The user prefers Vim over VS Code for editing | Confidence: 0.8",
+ ),
+ (
+ 0.4604,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: indentation_preference | Observation: The user claims to prefer tabs but their code uses spaces | Confidence: 0.8",
+ ),
+ ],
+ },
+ "When does the user prefer to work?": {
+ "semantic": [
+ (0.6806, "The user prefers working late at night"),
+ (
+ 0.6792,
+ "Subject: work_schedule | Type: behavior | Observation: The user prefers working late at night | Quote: I do my best coding between 10pm and 2am",
+ ),
+ (0.6439, "I do my best coding between 10pm and 2am"),
+ (0.5528, "I use 25-minute work intervals with 5-minute breaks"),
+ ],
+ "temporal": [
+ (
+ 0.7023,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: work_schedule | Observation: The user prefers working late at night | Confidence: 0.8",
+ ),
+ (
+ 0.6395,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: domain_preference | Observation: The user prefers working on backend systems over frontend UI | Confidence: 0.8",
+ ),
+ (
+ 0.6375,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: work_environment | Observation: The user thinks remote work is more productive than office work | Confidence: 0.8",
+ ),
+ (
+ 0.6254,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: collaboration_preference | Observation: The user prefers pair programming for complex problems | Confidence: 0.8",
+ ),
+ ],
+ },
+ "How does the user handle productivity and time management?": {
+ "semantic": [
+ (
+ 0.579,
+ "Subject: productivity_methods | Type: behavior | Observation: The user takes breaks every 25 minutes using the Pomodoro technique | Quote: I use 25-minute work intervals with 5-minute breaks",
+ ),
+ (0.5731, "I use 25-minute work intervals with 5-minute breaks"),
+ (
+ 0.5284,
+ "The user takes breaks every 25 minutes using the Pomodoro technique",
+ ),
+ (0.5153, "I do my best coding between 10pm and 2am"),
+ ],
+ "temporal": [
+ (
+ 0.5705,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: productivity_methods | Observation: The user takes breaks every 25 minutes using the Pomodoro technique | Confidence: 0.8",
+ ),
+ (
+ 0.5023,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: work_environment | Observation: The user thinks remote work is more productive than office work | Confidence: 0.8",
+ ),
+ (
+ 0.4631,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: work_schedule | Observation: The user prefers working late at night | Confidence: 0.8",
+ ),
+ (
+ 0.4626,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: documentation_habits | Observation: The user always writes documentation before implementing features | Confidence: 0.8",
+ ),
+ ],
+ },
+ "What editor does the user prefer?": {
+ "semantic": [
+ (
+ 0.6394,
+ "Subject: editor_preference | Type: preference | Observation: The user prefers Vim over VS Code for editing | Quote: Vim makes me more productive than any modern editor",
+ ),
+ (0.6241, "The user prefers Vim over VS Code for editing"),
+ (0.5528, "Vim makes me more productive than any modern editor"),
+ (0.4887, "The user claims to prefer tabs but their code uses spaces"),
+ ],
+ "temporal": [
+ (
+ 0.5701,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: editor_preference | Observation: The user prefers Vim over VS Code for editing | Confidence: 0.8",
+ ),
+ (
+ 0.4557,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: indentation_preference | Observation: The user claims to prefer tabs but their code uses spaces | Confidence: 0.8",
+ ),
+ (
+ 0.4322,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: domain_preference | Observation: The user prefers working on backend systems over frontend UI | Confidence: 0.8",
+ ),
+ (
+ 0.4283,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: database_preference | Observation: The user prefers PostgreSQL over MongoDB for most applications | Confidence: 0.8",
+ ),
+ ],
+ },
+ "What databases does the user like to use?": {
+ "semantic": [
+ (
+ 0.6328,
+ "Subject: database_preference | Type: preference | Observation: The user prefers PostgreSQL over MongoDB for most applications | Quote: Relational databases handle complex queries better than document stores",
+ ),
+ (0.5992, "The user prefers PostgreSQL over MongoDB for most applications"),
+ (
+ 0.5352,
+ "Subject: domain_preference | Type: preference | Observation: The user prefers working on backend systems over frontend UI | Quote: I find backend logic more interesting than UI work",
+ ),
+ (0.5186, "The user prefers working on backend systems over frontend UI"),
+ ],
+ "temporal": [
+ (
+ 0.5599,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: database_preference | Observation: The user prefers PostgreSQL over MongoDB for most applications | Confidence: 0.8",
+ ),
+ (
+ 0.4617,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: domain_preference | Observation: The user prefers working on backend systems over frontend UI | Confidence: 0.8",
+ ),
+ (
+ 0.4445,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: primary_languages | Observation: The user primarily works with Python and JavaScript | Confidence: 0.8",
+ ),
+ (
+ 0.4365,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: editor_preference | Observation: The user prefers Vim over VS Code for editing | Confidence: 0.8",
+ ),
+ ],
+ },
+ "What programming languages does the user work with?": {
+ "semantic": [
+ (0.7255, "The user primarily works with Python and JavaScript"),
+ (0.6954, "Most of my work is in Python backend and React frontend"),
+ (
+ 0.6874,
+ "Subject: primary_languages | Type: general | Observation: The user primarily works with Python and JavaScript | Quote: Most of my work is in Python backend and React frontend",
+ ),
+ (0.6098, "I'm picking up Rust on weekends"),
+ ],
+ "temporal": [
+ (
+ 0.5939,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: primary_languages | Observation: The user primarily works with Python and JavaScript | Confidence: 0.8",
+ ),
+ (
+ 0.4679,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: experience_level | Observation: The user has 8 years of professional programming experience | Confidence: 0.8",
+ ),
+ (
+ 0.4623,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: learning_activities | Observation: The user is currently learning Rust in their spare time | Confidence: 0.8",
+ ),
+ (
+ 0.4514,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: programming_philosophy | Observation: The user believes functional programming leads to better code quality | Confidence: 0.8",
+ ),
+ ],
+ },
+ "What is the user's programming experience level?": {
+ "semantic": [
+ (0.6664, "The user has 8 years of professional programming experience"),
+ (
+ 0.6565,
+ "Subject: experience_level | Type: general | Observation: The user has 8 years of professional programming experience | Quote: I've been coding professionally for 8 years",
+ ),
+ (0.5949, "I've been coding professionally for 8 years"),
+ (0.5641, "The user is currently learning Rust in their spare time"),
+ ],
+ "temporal": [
+ (
+ 0.5991,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: experience_level | Observation: The user has 8 years of professional programming experience | Confidence: 0.8",
+ ),
+ (
+ 0.5041,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: primary_languages | Observation: The user primarily works with Python and JavaScript | Confidence: 0.8",
+ ),
+ (
+ 0.4917,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: programming_philosophy | Observation: The user believes functional programming leads to better code quality | Confidence: 0.8",
+ ),
+ (
+ 0.4817,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: programming_paradigms | Observation: The user prefers functional programming over OOP | Confidence: 0.8",
+ ),
+ ],
+ },
+ "Where did the user study computer science?": {
+ "semantic": [
+ (0.6863, "I studied CS at Stanford"),
+ (0.649, "The user graduated with a Computer Science degree from Stanford"),
+ (
+ 0.6344,
+ "Subject: education_background | Type: general | Observation: The user graduated with a Computer Science degree from Stanford | Quote: I studied CS at Stanford",
+ ),
+ (0.4592, "The user is currently learning Rust in their spare time"),
+ ],
+ "temporal": [
+ (
+ 0.5455,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: education_background | Observation: The user graduated with a Computer Science degree from Stanford | Confidence: 0.8",
+ ),
+ (
+ 0.3842,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: experience_level | Observation: The user has 8 years of professional programming experience | Confidence: 0.8",
+ ),
+ (
+ 0.3792,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: primary_languages | Observation: The user primarily works with Python and JavaScript | Confidence: 0.8",
+ ),
+ (
+ 0.3781,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: learning_activities | Observation: The user is currently learning Rust in their spare time | Confidence: 0.8",
+ ),
+ ],
+ },
+ "What kind of company does the user work at?": {
+ "semantic": [
+ (0.6308, "The user works at a mid-size startup with 50 employees"),
+ (
+ 0.5371,
+ "Subject: company_size | Type: general | Observation: The user works at a mid-size startup with 50 employees | Quote: Our company has about 50 people",
+ ),
+ (0.5253, "Most of my work is in Python backend and React frontend"),
+ (0.4902, "I've been coding professionally for 8 years"),
+ ],
+ "temporal": [
+ (
+ 0.5309,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: company_size | Observation: The user works at a mid-size startup with 50 employees | Confidence: 0.8",
+ ),
+ (
+ 0.4329,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: work_environment | Observation: The user thinks remote work is more productive than office work | Confidence: 0.8",
+ ),
+ (
+ 0.4323,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: education_background | Observation: The user graduated with a Computer Science degree from Stanford | Confidence: 0.8",
+ ),
+ (
+ 0.419,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: work_schedule | Observation: The user prefers working late at night | Confidence: 0.8",
+ ),
+ ],
+ },
+ "What does the user think about AI replacing programmers?": {
+ "semantic": [
+ (
+ 0.5965,
+ "Subject: ai_future | Type: belief | Observation: The user thinks AI will replace most software developers within 10 years | Quote: AI will make most programmers obsolete by 2035",
+ ),
+ (
+ 0.572,
+ "The user thinks AI will replace most software developers within 10 years",
+ ),
+ (0.5715, "AI will make most programmers obsolete by 2035"),
+ (
+ 0.4344,
+ "The user believes functional programming leads to better code quality",
+ ),
+ ],
+ "temporal": [
+ (
+ 0.4629,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: ai_future | Observation: The user thinks AI will replace most software developers within 10 years | Confidence: 0.8",
+ ),
+ (
+ 0.362,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: programming_philosophy | Observation: The user believes functional programming leads to better code quality | Confidence: 0.8",
+ ),
+ (
+ 0.3308,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: testing_philosophy | Observation: The user believes unit tests are a waste of time for prototypes | Confidence: 0.8",
+ ),
+ (
+ 0.328,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: typescript_opinion | Observation: The user now says they love TypeScript but previously called it verbose | Confidence: 0.8",
+ ),
+ ],
+ },
+ "What are the user's views on artificial intelligence?": {
+ "semantic": [
+ (
+ 0.5885,
+ "Subject: ai_future | Type: belief | Observation: The user thinks AI will replace most software developers within 10 years | Quote: AI will make most programmers obsolete by 2035",
+ ),
+ (
+ 0.5661,
+ "The user thinks AI will replace most software developers within 10 years",
+ ),
+ (0.5133, "AI will make most programmers obsolete by 2035"),
+ (0.4927, "I find backend logic more interesting than UI work"),
+ ],
+ "temporal": [
+ (
+ 0.5399,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: ai_future | Observation: The user thinks AI will replace most software developers within 10 years | Confidence: 0.8",
+ ),
+ (
+ 0.4353,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: programming_philosophy | Observation: The user believes functional programming leads to better code quality | Confidence: 0.8",
+ ),
+ (
+ 0.4223,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: humans | Observation: The user thinks that all men must die. | Confidence: 0.8",
+ ),
+ (
+ 0.4219,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: pure_functions | Observation: The user said pure functions are yucky | Confidence: 0.8",
+ ),
+ ],
+ },
+ "Has the user changed their mind about TypeScript?": {
+ "semantic": [
+ (
+ 0.6174,
+ "The user now says they love TypeScript but previously called it verbose",
+ ),
+ (
+ 0.5757,
+ "Subject: typescript_opinion | Type: contradiction | Observation: The user now says they love TypeScript but previously called it verbose | Quote: TypeScript has too much boilerplate vs TypeScript makes my code so much cleaner",
+ ),
+ (
+ 0.4924,
+ "TypeScript has too much boilerplate vs TypeScript makes my code so much cleaner",
+ ),
+ (0.4157, "The user always refactors to pure functions"),
+ ],
+ "temporal": [
+ (
+ 0.5631,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: typescript_opinion | Observation: The user now says they love TypeScript but previously called it verbose | Confidence: 0.8",
+ ),
+ (
+ 0.4016,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: indentation_preference | Observation: The user claims to prefer tabs but their code uses spaces | Confidence: 0.8",
+ ),
+ (
+ 0.3827,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: primary_languages | Observation: The user primarily works with Python and JavaScript | Confidence: 0.8",
+ ),
+ (
+ 0.3825,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: editor_preference | Observation: The user prefers Vim over VS Code for editing | Confidence: 0.8",
+ ),
+ ],
+ },
+ "Are there any contradictions in the user's preferences?": {
+ "semantic": [
+ (0.536, "The user claims to prefer tabs but their code uses spaces"),
+ (
+ 0.5353,
+ "Subject: indentation_preference | Type: contradiction | Observation: The user claims to prefer tabs but their code uses spaces | Quote: Tabs are better than spaces vs code consistently uses 2-space indentation",
+ ),
+ (
+ 0.5321,
+ "Subject: pure_functions | Type: contradiction | Observation: The user said pure functions are yucky | Quote: Pure functions are yucky",
+ ),
+ (
+ 0.5058,
+ "Subject: typescript_opinion | Type: contradiction | Observation: The user now says they love TypeScript but previously called it verbose | Quote: TypeScript has too much boilerplate vs TypeScript makes my code so much cleaner",
+ ),
+ ],
+ "temporal": [
+ (
+ 0.4763,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: indentation_preference | Observation: The user claims to prefer tabs but their code uses spaces | Confidence: 0.8",
+ ),
+ (
+ 0.4693,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: domain_preference | Observation: The user prefers working on backend systems over frontend UI | Confidence: 0.8",
+ ),
+ (
+ 0.4681,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: pure_functions | Observation: The user said pure functions are yucky | Confidence: 0.8",
+ ),
+ (
+ 0.4586,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: typescript_opinion | Observation: The user now says they love TypeScript but previously called it verbose | Confidence: 0.8",
+ ),
+ ],
+ },
+ "What does the user think about software testing?": {
+ "semantic": [
+ (
+ 0.6386,
+ "Subject: testing_philosophy | Type: belief | Observation: The user believes unit tests are a waste of time for prototypes | Quote: Writing tests for throwaway code slows development",
+ ),
+ (0.6222, "The user believes unit tests are a waste of time for prototypes"),
+ (
+ 0.6152,
+ "Subject: code_quality | Type: belief | Observation: The user believes code reviews are essential for quality | Quote: Code reviews catch bugs that automated testing misses",
+ ),
+ (0.6036, "The user believes code reviews are essential for quality"),
+ ],
+ "temporal": [
+ (
+ 0.5881,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: testing_philosophy | Observation: The user believes unit tests are a waste of time for prototypes | Confidence: 0.8",
+ ),
+ (
+ 0.5074,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: code_quality | Observation: The user believes code reviews are essential for quality | Confidence: 0.8",
+ ),
+ (
+ 0.4863,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: programming_philosophy | Observation: The user believes functional programming leads to better code quality | Confidence: 0.8",
+ ),
+ (
+ 0.4748,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: debugging_approach | Observation: The user debugs by adding print statements rather than using a debugger | Confidence: 0.8",
+ ),
+ ],
+ },
+ "How does the user approach documentation?": {
+ "semantic": [
+ (
+ 0.5966,
+ "Subject: documentation_habits | Type: behavior | Observation: The user always writes documentation before implementing features | Quote: I document the API design before writing any code",
+ ),
+ (
+ 0.5473,
+ "The user always writes documentation before implementing features",
+ ),
+ (0.5207, "I document the API design before writing any code"),
+ (
+ 0.4954,
+ "Subject: debugging_approach | Type: behavior | Observation: The user debugs by adding print statements rather than using a debugger | Quote: When debugging, I just add console.log everywhere",
+ ),
+ ],
+ "temporal": [
+ (
+ 0.4988,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: documentation_habits | Observation: The user always writes documentation before implementing features | Confidence: 0.8",
+ ),
+ (
+ 0.4335,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: indentation_preference | Observation: The user claims to prefer tabs but their code uses spaces | Confidence: 0.8",
+ ),
+ (
+ 0.4316,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: debugging_approach | Observation: The user debugs by adding print statements rather than using a debugger | Confidence: 0.8",
+ ),
+ (
+ 0.4307,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: domain_preference | Observation: The user prefers working on backend systems over frontend UI | Confidence: 0.8",
+ ),
+ ],
+ },
+ "What are the user's collaboration preferences?": {
+ "semantic": [
+ (
+ 0.651,
+ "Subject: collaboration_preference | Type: preference | Observation: The user prefers pair programming for complex problems | Quote: Two heads are better than one when solving hard problems",
+ ),
+ (0.5848, "The user prefers pair programming for complex problems"),
+ (
+ 0.5355,
+ "Subject: version_control_style | Type: preference | Observation: The user prefers small, focused commits over large feature branches | Quote: I like to commit small, logical changes frequently",
+ ),
+ (
+ 0.5216,
+ "Subject: domain_preference | Type: preference | Observation: The user prefers working on backend systems over frontend UI | Quote: I find backend logic more interesting than UI work",
+ ),
+ ],
+ "temporal": [
+ (
+ 0.6027,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: collaboration_preference | Observation: The user prefers pair programming for complex problems | Confidence: 0.8",
+ ),
+ (
+ 0.5101,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: version_control_style | Observation: The user prefers small, focused commits over large feature branches | Confidence: 0.8",
+ ),
+ (
+ 0.482,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: domain_preference | Observation: The user prefers working on backend systems over frontend UI | Confidence: 0.8",
+ ),
+ (
+ 0.4782,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: work_environment | Observation: The user thinks remote work is more productive than office work | Confidence: 0.8",
+ ),
+ ],
+ },
+ "What does the user think about remote work?": {
+ "semantic": [
+ (0.7063, "The user thinks remote work is more productive than office work"),
+ (
+ 0.6583,
+ "Subject: work_environment | Type: belief | Observation: The user thinks remote work is more productive than office work | Quote: I get more done working from home",
+ ),
+ (0.6032, "I get more done working from home"),
+ (0.4997, "The user prefers working on backend systems over frontend UI"),
+ ],
+ "temporal": [
+ (
+ 0.5934,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: work_environment | Observation: The user thinks remote work is more productive than office work | Confidence: 0.8",
+ ),
+ (
+ 0.4173,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: work_schedule | Observation: The user prefers working late at night | Confidence: 0.8",
+ ),
+ (
+ 0.4148,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: collaboration_preference | Observation: The user prefers pair programming for complex problems | Confidence: 0.8",
+ ),
+ (
+ 0.4121,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: testing_philosophy | Observation: The user believes unit tests are a waste of time for prototypes | Confidence: 0.8",
+ ),
+ ],
+ },
+ "What are the user's productivity methods?": {
+ "semantic": [
+ (
+ 0.5723,
+ "Subject: productivity_methods | Type: behavior | Observation: The user takes breaks every 25 minutes using the Pomodoro technique | Quote: I use 25-minute work intervals with 5-minute breaks",
+ ),
+ (
+ 0.5261,
+ "The user takes breaks every 25 minutes using the Pomodoro technique",
+ ),
+ (0.5205, "I use 25-minute work intervals with 5-minute breaks"),
+ (0.5107, "The user thinks remote work is more productive than office work"),
+ ],
+ "temporal": [
+ (
+ 0.5427,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: productivity_methods | Observation: The user takes breaks every 25 minutes using the Pomodoro technique | Confidence: 0.8",
+ ),
+ (
+ 0.4743,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: work_environment | Observation: The user thinks remote work is more productive than office work | Confidence: 0.8",
+ ),
+ (
+ 0.4299,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: collaboration_preference | Observation: The user prefers pair programming for complex problems | Confidence: 0.8",
+ ),
+ (
+ 0.4227,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: version_control_style | Observation: The user prefers small, focused commits over large feature branches | Confidence: 0.8",
+ ),
+ ],
+ },
+ "What technical skills is the user learning?": {
+ "semantic": [
+ (0.5765, "The user is currently learning Rust in their spare time"),
+ (
+ 0.5502,
+ "Subject: learning_activities | Type: general | Observation: The user is currently learning Rust in their spare time | Quote: I'm picking up Rust on weekends",
+ ),
+ (0.5411, "I'm picking up Rust on weekends"),
+ (0.5155, "The user primarily works with Python and JavaScript"),
+ ],
+ "temporal": [
+ (
+ 0.5301,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: learning_activities | Observation: The user is currently learning Rust in their spare time | Confidence: 0.8",
+ ),
+ (
+ 0.4913,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: primary_languages | Observation: The user primarily works with Python and JavaScript | Confidence: 0.8",
+ ),
+ (
+ 0.481,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: experience_level | Observation: The user has 8 years of professional programming experience | Confidence: 0.8",
+ ),
+ (
+ 0.4558,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: education_background | Observation: The user graduated with a Computer Science degree from Stanford | Confidence: 0.8",
+ ),
+ ],
+ },
+ "What does the user think about cooking?": {
+ "semantic": [
+ (0.4888, "I find backend logic more interesting than UI work"),
+ (0.4624, "The user prefers working on backend systems over frontend UI"),
+ (
+ 0.4551,
+ "The user believes functional programming leads to better code quality",
+ ),
+ (0.4547, "The user said pure functions are yucky"),
+ ],
+ "temporal": [
+ (
+ 0.3812,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: pure_functions | Observation: The user said pure functions are yucky | Confidence: 0.8",
+ ),
+ (
+ 0.3773,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: programming_philosophy | Observation: The user believes functional programming leads to better code quality | Confidence: 0.8",
+ ),
+ (
+ 0.3686,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: typescript_opinion | Observation: The user now says they love TypeScript but previously called it verbose | Confidence: 0.8",
+ ),
+ (
+ 0.3649,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: domain_preference | Observation: The user prefers working on backend systems over frontend UI | Confidence: 0.8",
+ ),
+ ],
+ },
+ "What are the user's travel preferences?": {
+ "semantic": [
+ (
+ 0.522,
+ "Subject: domain_preference | Type: preference | Observation: The user prefers working on backend systems over frontend UI | Quote: I find backend logic more interesting than UI work",
+ ),
+ (0.5145, "The user prefers functional programming over OOP"),
+ (0.5079, "The user prefers working on backend systems over frontend UI"),
+ (0.5045, "The user prefers working late at night"),
+ ],
+ "temporal": [
+ (
+ 0.4849,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: domain_preference | Observation: The user prefers working on backend systems over frontend UI | Confidence: 0.8",
+ ),
+ (
+ 0.4779,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: database_preference | Observation: The user prefers PostgreSQL over MongoDB for most applications | Confidence: 0.8",
+ ),
+ (
+ 0.4659,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: collaboration_preference | Observation: The user prefers pair programming for complex problems | Confidence: 0.8",
+ ),
+ (
+ 0.4639,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: programming_paradigms | Observation: The user prefers functional programming over OOP | Confidence: 0.8",
+ ),
+ ],
+ },
+ "What music does the user like?": {
+ "semantic": [
+ (
+ 0.4927,
+ "Subject: domain_preference | Type: preference | Observation: The user prefers working on backend systems over frontend UI | Quote: I find backend logic more interesting than UI work",
+ ),
+ (0.4906, "The user prefers working late at night"),
+ (0.4904, "The user prefers functional programming over OOP"),
+ (0.4894, "The user primarily works with Python and JavaScript"),
+ ],
+ "temporal": [
+ (
+ 0.4674,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: typescript_opinion | Observation: The user now says they love TypeScript but previously called it verbose | Confidence: 0.8",
+ ),
+ (
+ 0.4548,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: primary_languages | Observation: The user primarily works with Python and JavaScript | Confidence: 0.8",
+ ),
+ (
+ 0.4518,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: programming_paradigms | Observation: The user prefers functional programming over OOP | Confidence: 0.8",
+ ),
+ (
+ 0.4496,
+ "Time: 12:00 on Wednesday (afternoon) | Subject: editor_preference | Observation: The user prefers Vim over VS Code for editing | Confidence: 0.8",
+ ),
+ ],
+ },
+}
+
+
+def test_real_observation_embeddings(real_voyage_client, qdrant):
+ beliefs = [
+ ("The user thinks that all men must die.", "All humans are mortal.", "humans"),
+ (
+ "The user believes functional programming leads to better code quality",
+ "Functional programming produces more maintainable code",
+ "programming_philosophy",
+ ),
+ (
+ "The user thinks AI will replace most software developers within 10 years",
+ "AI will make most programmers obsolete by 2035",
+ "ai_future",
+ ),
+ (
+ "The user believes code reviews are essential for quality",
+ "Code reviews catch bugs that automated testing misses",
+ "code_quality",
+ ),
+ (
+ "The user thinks remote work is more productive than office work",
+ "I get more done working from home",
+ "work_environment",
+ ),
+ (
+ "The user believes unit tests are a waste of time for prototypes",
+ "Writing tests for throwaway code slows development",
+ "testing_philosophy",
+ ),
+ ]
+
+ behaviors = [
+ (
+ "The user always refactors to pure functions",
+ "I always refactor to pure functions",
+ "refactoring",
+ ),
+ (
+ "The user writes commit messages in present tense",
+ "Fix bug in parser instead of Fixed bug in parser",
+ "git_habits",
+ ),
+ (
+ "The user prefers working late at night",
+ "I do my best coding between 10pm and 2am",
+ "work_schedule",
+ ),
+ (
+ "The user always writes documentation before implementing features",
+ "I document the API design before writing any code",
+ "documentation_habits",
+ ),
+ (
+ "The user debugs by adding print statements rather than using a debugger",
+ "When debugging, I just add console.log everywhere",
+ "debugging_approach",
+ ),
+ (
+ "The user takes breaks every 25 minutes using the Pomodoro technique",
+ "I use 25-minute work intervals with 5-minute breaks",
+ "productivity_methods",
+ ),
+ ]
+
+ contradictions = [
+ (
+ "The user said pure functions are yucky",
+ "Pure functions are yucky",
+ "pure_functions",
+ ),
+ (
+ "The user now says they love TypeScript but previously called it verbose",
+ "TypeScript has too much boilerplate vs TypeScript makes my code so much cleaner",
+ "typescript_opinion",
+ ),
+ (
+ "The user claims to prefer tabs but their code uses spaces",
+ "Tabs are better than spaces vs code consistently uses 2-space indentation",
+ "indentation_preference",
+ ),
+ ]
+
+ preferences = [
+ (
+ "The user prefers functional programming over OOP",
+ "I prefer functional programming over OOP",
+ "programming_paradigms",
+ ),
+ (
+ "The user prefers Vim over VS Code for editing",
+ "Vim makes me more productive than any modern editor",
+ "editor_preference",
+ ),
+ (
+ "The user prefers working on backend systems over frontend UI",
+ "I find backend logic more interesting than UI work",
+ "domain_preference",
+ ),
+ (
+ "The user prefers small, focused commits over large feature branches",
+ "I like to commit small, logical changes frequently",
+ "version_control_style",
+ ),
+ (
+ "The user prefers PostgreSQL over MongoDB for most applications",
+ "Relational databases handle complex queries better than document stores",
+ "database_preference",
+ ),
+ (
+ "The user prefers pair programming for complex problems",
+ "Two heads are better than one when solving hard problems",
+ "collaboration_preference",
+ ),
+ ]
+
+ general = [
+ ("The user is a human", "The user is a human", "humans"),
+ (
+ "The user has 8 years of professional programming experience",
+ "I've been coding professionally for 8 years",
+ "experience_level",
+ ),
+ (
+ "The user primarily works with Python and JavaScript",
+ "Most of my work is in Python backend and React frontend",
+ "primary_languages",
+ ),
+ (
+ "The user works at a mid-size startup with 50 employees",
+ "Our company has about 50 people",
+ "company_size",
+ ),
+ (
+ "The user graduated with a Computer Science degree from Stanford",
+ "I studied CS at Stanford",
+ "education_background",
+ ),
+ (
+ "The user is currently learning Rust in their spare time",
+ "I'm picking up Rust on weekends",
+ "learning_activities",
+ ),
+ ]
+
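+ # Build one AgentObservation per (content, quote, subject) tuple, grouped by observation type.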
+ ids = itertools.count(1)
+ items = [
+ AgentObservation(
+ id=next(ids),
+ content=content,
+ mime_type="text/html",
+ modality="observation",
+ sha256=hashlib.sha256(content.encode("utf-8")).hexdigest(),
+ size=len(content),
+ tags=["bla"],
+ observation_type=observation_type,
+ subject=subject,
+ confidence=0.8,
+ evidence={
+ "quote": quote,
+ "source": "https://en.wikipedia.org/wiki/Human",
+ },
+ agent_model="gpt-4o",
+ inserted_at=datetime(2025, 1, 1, 12, 0, 0),
+ embed_status="QUEUED",
+ )
+ for observation_type, observations in [
+ ("belief", beliefs),
+ ("behavior", behaviors),
+ ("contradiction", contradictions),
+ ("preference", preferences),
+ ("general", general),
+ ]
+ for content, quote, subject in observations
+ ]
+
+ for item in items:
+ embed_source_item(item)
+ push_to_qdrant(items)
+
+ chunk_map = {str(c.id): c for item in items for c in item.chunks}
+
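+ # Return the top 4 hits as (score rounded to 4 dp, chunk content), best first.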
+ def get_top(vector, search_type: str) -> list[tuple[float, str]]:
+ results = qdrant_tools.search_vectors(qdrant, search_type, vector)
+ return [
+ (round(i.score, 4), chunk_map[str(i.id)].content)
+ for i in sorted(results, key=lambda x: x.score, reverse=True)
+ ][:4]
+
+ for query, expected in EXPECTED_OBSERVATION_RESULTS.items():
+ search_vector = embed_text(
+ [extract.DataChunk(data=[query])], input_type="query"
+ )[0]
+ semantic_results = get_top(search_vector, "semantic")
+ temporal_results = get_top(search_vector, "temporal")
+ assert semantic_results == expected["semantic"]
+ assert temporal_results == expected["temporal"]
diff --git a/tests/memory/common/db/models/test_source_item.py b/tests/memory/common/db/models/test_source_item.py
index 5e12a6c..3759ce2 100644
--- a/tests/memory/common/db/models/test_source_item.py
+++ b/tests/memory/common/db/models/test_source_item.py
@@ -1,23 +1,17 @@
from sqlalchemy.orm import Session
-from unittest.mock import patch, Mock
+from unittest.mock import patch
from typing import cast
import pytest
from PIL import Image
-from datetime import datetime
from memory.common import settings, chunker, extract
-from memory.common.db.models.sources import Book
from memory.common.db.models.source_items import (
Chunk,
MailMessage,
- EmailAttachment,
- BookSection,
- BlogPost,
)
from memory.common.db.models.source_item import (
SourceItem,
image_filenames,
add_pics,
- merge_metadata,
clean_filename,
)
@@ -56,114 +50,6 @@ def test_clean_filename(input_filename, expected):
assert clean_filename(input_filename) == expected
-@pytest.mark.parametrize(
- "dicts,expected",
- [
- # Empty input
- ([], {}),
- # Single dict without tags
- ([{"key": "value"}], {"key": "value"}),
- # Single dict with tags as list
- (
- [{"key": "value", "tags": ["tag1", "tag2"]}],
- {"key": "value", "tags": {"tag1", "tag2"}},
- ),
- # Single dict with tags as set
- (
- [{"key": "value", "tags": {"tag1", "tag2"}}],
- {"key": "value", "tags": {"tag1", "tag2"}},
- ),
- # Multiple dicts without tags
- (
- [{"key1": "value1"}, {"key2": "value2"}],
- {"key1": "value1", "key2": "value2"},
- ),
- # Multiple dicts with non-overlapping tags
- (
- [
- {"key1": "value1", "tags": ["tag1"]},
- {"key2": "value2", "tags": ["tag2"]},
- ],
- {"key1": "value1", "key2": "value2", "tags": {"tag1", "tag2"}},
- ),
- # Multiple dicts with overlapping tags
- (
- [
- {"key1": "value1", "tags": ["tag1", "tag2"]},
- {"key2": "value2", "tags": ["tag2", "tag3"]},
- ],
- {"key1": "value1", "key2": "value2", "tags": {"tag1", "tag2", "tag3"}},
- ),
- # Overlapping keys - later dict wins
- (
- [
- {"key": "value1", "other": "data1"},
- {"key": "value2", "another": "data2"},
- ],
- {"key": "value2", "other": "data1", "another": "data2"},
- ),
- # Mixed tags types (list and set)
- (
- [
- {"key1": "value1", "tags": ["tag1", "tag2"]},
- {"key2": "value2", "tags": {"tag3", "tag4"}},
- ],
- {
- "key1": "value1",
- "key2": "value2",
- "tags": {"tag1", "tag2", "tag3", "tag4"},
- },
- ),
- # Empty tags
- (
- [{"key": "value", "tags": []}, {"key2": "value2", "tags": []}],
- {"key": "value", "key2": "value2"},
- ),
- # None values
- (
- [{"key1": None, "key2": "value"}, {"key3": None}],
- {"key1": None, "key2": "value", "key3": None},
- ),
- # Complex nested structures
- (
- [
- {"nested": {"inner": "value1"}, "list": [1, 2, 3], "tags": ["tag1"]},
- {"nested": {"inner": "value2"}, "list": [4, 5], "tags": ["tag2"]},
- ],
- {"nested": {"inner": "value2"}, "list": [4, 5], "tags": {"tag1", "tag2"}},
- ),
- # Boolean and numeric values
- (
- [
- {"bool": True, "int": 42, "float": 3.14, "tags": ["numeric"]},
- {"bool": False, "int": 100},
- ],
- {"bool": False, "int": 100, "float": 3.14, "tags": {"numeric"}},
- ),
- # Three or more dicts
- (
- [
- {"a": 1, "tags": ["t1"]},
- {"b": 2, "tags": ["t2", "t3"]},
- {"c": 3, "a": 10, "tags": ["t3", "t4"]},
- ],
- {"a": 10, "b": 2, "c": 3, "tags": {"t1", "t2", "t3", "t4"}},
- ),
- # Dict with only tags
- ([{"tags": ["tag1", "tag2"]}], {"tags": {"tag1", "tag2"}}),
- # Empty dicts
- ([{}, {}], {}),
- # Mix of empty and non-empty dicts
- (
- [{}, {"key": "value", "tags": ["tag"]}, {}],
- {"key": "value", "tags": {"tag"}},
- ),
- ],
-)
-def test_merge_metadata(dicts, expected):
- assert merge_metadata(*dicts) == expected
-
-
def test_image_filenames_with_existing_filenames(tmp_path):
"""Test image_filenames when images already have filenames"""
chunk_id = "test_chunk_123"
diff --git a/tests/memory/common/db/models/test_source_item_embeddings.py b/tests/memory/common/db/models/test_source_item_embeddings.py
new file mode 100644
index 0000000..4fa92dd
--- /dev/null
+++ b/tests/memory/common/db/models/test_source_item_embeddings.py
@@ -0,0 +1,626 @@
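+"""Tests for chunking and embedding of SourceItem subclasses.
+
+Each test builds an item, checks the (content, image hashes, metadata) triples
+returned by data_chunks(), and verifies which Voyage endpoint was called.
+"""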
+import hashlib
+from datetime import datetime
+from typing import Sequence, cast
+from unittest.mock import ANY, Mock, call
+
+import pymupdf # PyMuPDF
+import pytest
+
+from memory.common import settings
+from memory.common.db.models.source_item import Chunk, SourceItem
+from memory.common.db.models.source_items import (
+ AgentObservation,
+ BlogPost,
+ BookSection,
+ Comic,
+ EmailAttachment,
+ ForumPost,
+ MailMessage,
+)
+from memory.common.db.models.sources import Book
+from memory.common.embedding import embed_source_item
+from memory.common.extract import page_to_image
+# DATA_DIR, the SECOND_PAGE_* samples, TWO_PAGE_CHUNKS and the image fixtures
+# are assumed to be exported by tests.data.contents as well.
+from tests.data.contents import (
+    CHUNKS,
+    CODE_COMPLEXITY,
+    CODE_COMPLEXITY_HASH,
+    DATA_DIR,
+    LANG_TIMELINE,
+    LANG_TIMELINE_HASH,
+    SAMPLE_MARKDOWN,
+    SAMPLE_TEXT,
+    SECOND_PAGE,
+    SECOND_PAGE_MARKDOWN,
+    SECOND_PAGE_TEXT,
+    TWO_PAGE_CHUNKS,
+    image_hash,
+)
+
+
+def compare_chunks(
+ chunks: Sequence[Chunk],
+ expected: Sequence[tuple[str | None, list[str], dict]],
+):
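+    """Compare chunks against expected (content, image hashes, metadata) triples."""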
+ data = [
+ (c.content, [image_hash(i) for i in c.images], c.item_metadata) for c in chunks
+ ]
+ assert data == expected
+
+
+def test_base_source_item_text_embeddings(mock_voyage_client):
+ item = SourceItem(
+ id=1,
+ content=SAMPLE_MARKDOWN,
+ mime_type="text/html",
+ modality="text",
+ sha256=hashlib.sha256(SAMPLE_MARKDOWN.encode("utf-8")).hexdigest(),
+ size=len(SAMPLE_MARKDOWN),
+ tags=["bla"],
+ )
+ metadata = item.as_payload()
+ metadata["tags"] = {"bla"}
+ expected = [
+ (CHUNKS[0].strip(), cast(list[str], []), metadata),
+ (CHUNKS[1].strip(), cast(list[str], []), metadata),
+ ("test summary", [], metadata | {"tags": {"tag1", "tag2", "bla"}}),
+ ]
+
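+    # Stub both Voyage endpoints with fixed 1024-dim vectors; the tests only
+    # assert call counts and arguments, not the embedding values.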
+ mock_voyage_client.embed = Mock(return_value=Mock(embeddings=[[0.1] * 1024] * 3))
+ mock_voyage_client.multimodal_embed = Mock(
+ return_value=Mock(embeddings=[[0.1] * 1024] * 3)
+ )
+ compare_chunks(item.data_chunks(), expected)
+ compare_chunks(embed_source_item(item), expected)
+
+ assert mock_voyage_client.embed.call_count == 1
+ assert not mock_voyage_client.multimodal_embed.call_count
+
+ assert mock_voyage_client.embed.call_args == call(
+ [CHUNKS[0].strip(), CHUNKS[1].strip(), "test summary"],
+ model=settings.TEXT_EMBEDDING_MODEL,
+ input_type="document",
+ )
+
+
+def test_base_source_item_mixed_embeddings(mock_voyage_client):
+ item = SourceItem(
+ id=1,
+ content=SAMPLE_MARKDOWN,
+ filename=DATA_DIR / "lang_timeline.png",
+ mime_type="image/png",
+ modality="photo",
+ sha256=hashlib.sha256(SAMPLE_MARKDOWN.encode("utf-8")).hexdigest(),
+ size=len(SAMPLE_MARKDOWN),
+ tags=["bla"],
+ )
+ metadata = item.as_payload()
+ metadata["tags"] = {"bla"}
+ expected = [
+ (CHUNKS[0].strip(), [], metadata),
+ (CHUNKS[1].strip(), [], metadata),
+ ("test summary", [], metadata | {"tags": {"tag1", "tag2", "bla"}}),
+ (None, [LANG_TIMELINE_HASH], {"size": 3465, "source_id": 1, "tags": {"bla"}}),
+ ]
+
+ mock_voyage_client.embed = Mock(return_value=Mock(embeddings=[[0.1] * 1024] * 3))
+ mock_voyage_client.multimodal_embed = Mock(
+ return_value=Mock(embeddings=[[0.1] * 1024] * 3)
+ )
+ compare_chunks(item.data_chunks(), expected)
+ compare_chunks(embed_source_item(item), expected)
+
+ assert mock_voyage_client.embed.call_count == 1
+ assert mock_voyage_client.multimodal_embed.call_count == 1
+
+ assert mock_voyage_client.embed.call_args == call(
+ [CHUNKS[0].strip(), CHUNKS[1].strip(), "test summary"],
+ model=settings.TEXT_EMBEDDING_MODEL,
+ input_type="document",
+ )
+ assert mock_voyage_client.multimodal_embed.call_args == call(
+ [[ANY]],
+ model=settings.MIXED_EMBEDDING_MODEL,
+ input_type="document",
+ )
+ assert [
+ image_hash(i) for i in mock_voyage_client.multimodal_embed.call_args[0][0][0]
+ ] == [LANG_TIMELINE_HASH]
+
+
+def test_mail_message_embeddings(mock_voyage_client):
+ item = MailMessage(
+ id=1,
+ content=SAMPLE_MARKDOWN,
+ mime_type="text/html",
+ modality="text",
+ sha256=hashlib.sha256(SAMPLE_MARKDOWN.encode("utf-8")).hexdigest(),
+ size=len(SAMPLE_MARKDOWN),
+ tags=["bla"],
+ message_id="123",
+ subject="Test Subject",
+ sender="test@example.com",
+ recipients=["test@example.com"],
+ folder="INBOX",
+ sent_at=datetime(2025, 1, 1, 12, 0, 0),
+ )
+ metadata = item.as_payload()
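+    # MailMessage also adds the message's e-mail address to the chunk tags.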
+ metadata["tags"] = {"bla", "test@example.com"}
+ expected = [
+ (CHUNKS[0].strip(), [], metadata),
+ (CHUNKS[1].strip(), [], metadata),
+ (
+ "test summary",
+ [],
+ metadata | {"tags": {"tag1", "tag2", "bla", "test@example.com"}},
+ ),
+ ]
+
+ mock_voyage_client.embed = Mock(return_value=Mock(embeddings=[[0.1] * 1024] * 3))
+ mock_voyage_client.multimodal_embed = Mock(
+ return_value=Mock(embeddings=[[0.1] * 1024] * 3)
+ )
+ compare_chunks(item.data_chunks(), expected)
+ compare_chunks(embed_source_item(item), expected)
+
+ assert mock_voyage_client.embed.call_count == 1
+ assert not mock_voyage_client.multimodal_embed.call_count
+
+ assert mock_voyage_client.embed.call_args == call(
+ [CHUNKS[0].strip(), CHUNKS[1].strip(), "test summary"],
+ model=settings.TEXT_EMBEDDING_MODEL,
+ input_type="document",
+ )
+
+
+def test_email_attachment_embeddings_text(mock_voyage_client):
+ item = EmailAttachment(
+ id=1,
+ content=SAMPLE_MARKDOWN,
+ mime_type="text/html",
+ modality="text",
+ sha256=hashlib.sha256(SAMPLE_MARKDOWN.encode("utf-8")).hexdigest(),
+ size=len(SAMPLE_MARKDOWN),
+ tags=["bla"],
+ )
+ metadata = item.as_payload()
+ metadata["tags"] = {"bla"}
+ expected = [
+ (CHUNKS[0].strip(), cast(list[str], []), metadata),
+ (CHUNKS[1].strip(), cast(list[str], []), metadata),
+ (
+ "test summary",
+ [],
+ metadata | {"tags": {"tag1", "tag2", "bla"}},
+ ),
+ ]
+
+ mock_voyage_client.embed = Mock(return_value=Mock(embeddings=[[0.1] * 1024] * 3))
+ mock_voyage_client.multimodal_embed = Mock(
+ return_value=Mock(embeddings=[[0.1] * 1024] * 3)
+ )
+ compare_chunks(item.data_chunks(), expected)
+ compare_chunks(embed_source_item(item), expected)
+
+ assert mock_voyage_client.embed.call_count == 1
+ assert not mock_voyage_client.multimodal_embed.call_count
+
+ assert mock_voyage_client.embed.call_args == call(
+ [CHUNKS[0].strip(), CHUNKS[1].strip(), "test summary"],
+ model=settings.TEXT_EMBEDDING_MODEL,
+ input_type="document",
+ )
+
+
+def test_email_attachment_embeddings_photo(mock_voyage_client):
+ item = EmailAttachment(
+ id=1,
+ content=SAMPLE_MARKDOWN,
+ filename=DATA_DIR / "lang_timeline.png",
+ mime_type="image/png",
+ modality="photo",
+ sha256=hashlib.sha256(SAMPLE_MARKDOWN.encode("utf-8")).hexdigest(),
+ size=len(SAMPLE_MARKDOWN),
+ tags=["bla"],
+ )
+ metadata = item.as_payload()
+ metadata["tags"] = {"bla"}
+ expected = [
+ (None, [LANG_TIMELINE_HASH], metadata),
+ ]
+
+ mock_voyage_client.embed = Mock(return_value=Mock(embeddings=[[0.1] * 1024] * 3))
+ mock_voyage_client.multimodal_embed = Mock(
+ return_value=Mock(embeddings=[[0.1] * 1024] * 3)
+ )
+ compare_chunks(item.data_chunks(), expected)
+ compare_chunks(embed_source_item(item), expected)
+
+ assert mock_voyage_client.embed.call_count == 0
+ assert mock_voyage_client.multimodal_embed.call_count == 1
+
+ assert mock_voyage_client.multimodal_embed.call_args == call(
+ [[ANY]],
+ model=settings.MIXED_EMBEDDING_MODEL,
+ input_type="document",
+ )
+ assert [
+ image_hash(i) for i in mock_voyage_client.multimodal_embed.call_args[0][0][0]
+ ] == [LANG_TIMELINE_HASH]
+
+
+def test_email_attachment_embeddings_pdf(mock_voyage_client):
+ item = EmailAttachment(
+ id=1,
+ content=SAMPLE_MARKDOWN,
+ filename=DATA_DIR / "regulamin.pdf",
+ mime_type="application/pdf",
+ modality="doc",
+ sha256=hashlib.sha256(SAMPLE_MARKDOWN.encode("utf-8")).hexdigest(),
+ size=len(SAMPLE_MARKDOWN),
+ tags=["bla"],
+ )
+ metadata = item.as_payload()
+ metadata["tags"] = {"bla"}
+ with pymupdf.open(item.filename) as pdf:
+ expected = [
+ (
+ None,
+ [image_hash(page_to_image(page))],
+ metadata
+ | {
+ "page": page.number,
+ "width": page.rect.width,
+ "height": page.rect.height,
+ },
+ )
+ for page in pdf.pages()
+ ]
+
+ mock_voyage_client.embed = Mock(return_value=Mock(embeddings=[[0.1] * 1024] * 3))
+ mock_voyage_client.multimodal_embed = Mock(
+ return_value=Mock(embeddings=[[0.1] * 1024] * 3)
+ )
+ compare_chunks(item.data_chunks(), expected)
+ compare_chunks(embed_source_item(item), expected)
+
+ assert mock_voyage_client.embed.call_count == 0
+ assert mock_voyage_client.multimodal_embed.call_count == 1
+
+ assert mock_voyage_client.multimodal_embed.call_args == call(
+ [[ANY], [ANY]],
+ model=settings.MIXED_EMBEDDING_MODEL,
+ input_type="document",
+ )
+ assert [
+ [image_hash(a) for a in i]
+ for i in mock_voyage_client.multimodal_embed.call_args[0][0]
+    ] == [hashes for _, hashes, _ in expected]
+
+
+def test_email_attachment_embeddings_comic(mock_voyage_client):
+ item = Comic(
+ id=1,
+ content=SAMPLE_MARKDOWN,
+ filename=DATA_DIR / "lang_timeline.png",
+ mime_type="image/png",
+ modality="comic",
+ sha256=hashlib.sha256(SAMPLE_MARKDOWN.encode("utf-8")).hexdigest(),
+ size=len(SAMPLE_MARKDOWN),
+ tags=["bla"],
+ title="The Evolution of Programming Languages",
+ author="John Doe",
+ published=datetime(2025, 1, 1, 12, 0, 0),
+ volume="1",
+ issue="1",
+ page=1,
+ )
+ metadata = item.as_payload()
+ metadata["tags"] = {"bla"}
+ expected = [
+ (
+ "The Evolution of Programming Languages by John Doe",
+ [LANG_TIMELINE_HASH],
+ metadata,
+ ),
+ ]
+
+ mock_voyage_client.embed = Mock(return_value=Mock(embeddings=[[0.1] * 1024] * 3))
+ mock_voyage_client.multimodal_embed = Mock(
+ return_value=Mock(embeddings=[[0.1] * 1024] * 3)
+ )
+ compare_chunks(item.data_chunks(), expected)
+ compare_chunks(embed_source_item(item), expected)
+
+ assert mock_voyage_client.embed.call_count == 0
+ assert mock_voyage_client.multimodal_embed.call_count == 1
+
+ assert mock_voyage_client.multimodal_embed.call_args == call(
+ [["The Evolution of Programming Languages by John Doe", ANY]],
+ model=settings.MIXED_EMBEDDING_MODEL,
+ input_type="document",
+ )
+ assert (
+ image_hash(mock_voyage_client.multimodal_embed.call_args[0][0][0][1])
+ == LANG_TIMELINE_HASH
+ )
+
+
+def test_book_section_embeddings_single_page(mock_voyage_client):
+ item = BookSection(
+ id=1,
+ content=SAMPLE_MARKDOWN,
+ mime_type="text/html",
+ modality="text",
+ sha256=hashlib.sha256(SAMPLE_MARKDOWN.encode("utf-8")).hexdigest(),
+ size=len(SAMPLE_MARKDOWN),
+ tags=["bla"],
+ book_id=1,
+ section_title="The Evolution of Programming Languages",
+ section_number=1,
+ section_level=1,
+ start_page=1,
+ end_page=1,
+ pages=[SAMPLE_TEXT],
+ book=Book(
+ id=1,
+ title="Programming Languages",
+ author="John Doe",
+ published=datetime(2025, 1, 1, 12, 0, 0),
+ ),
+ )
+ metadata = item.as_payload()
+ metadata["tags"] = {"bla"}
+ expected = [
+ (CHUNKS[0].strip(), cast(list[str], []), metadata | {"type": "page"}),
+ (CHUNKS[1].strip(), cast(list[str], []), metadata | {"type": "page"}),
+ (
+ "test summary",
+ [],
+ metadata | {"tags": {"tag1", "tag2", "bla"}, "type": "summary"},
+ ),
+ ]
+
+ mock_voyage_client.embed = Mock(return_value=Mock(embeddings=[[0.1] * 1024] * 3))
+ mock_voyage_client.multimodal_embed = Mock(
+ return_value=Mock(embeddings=[[0.1] * 1024] * 3)
+ )
+ compare_chunks(item.data_chunks(), expected)
+ compare_chunks(embed_source_item(item), expected)
+
+ assert mock_voyage_client.embed.call_count == 1
+ assert not mock_voyage_client.multimodal_embed.call_count
+
+ assert mock_voyage_client.embed.call_args == call(
+ [CHUNKS[0].strip(), CHUNKS[1].strip(), "test summary"],
+ model=settings.TEXT_EMBEDDING_MODEL,
+ input_type="document",
+ )
+
+
+def test_book_section_embeddings_multiple_pages(mock_voyage_client):
+ item = BookSection(
+ id=1,
+ content=SAMPLE_MARKDOWN + "\n\n" + SECOND_PAGE,
+ mime_type="text/html",
+ modality="text",
+ sha256=hashlib.sha256(SAMPLE_MARKDOWN.encode("utf-8")).hexdigest(),
+ size=len(SAMPLE_MARKDOWN),
+ tags=["bla"],
+ book_id=1,
+ section_title="The Evolution of Programming Languages",
+ section_number=1,
+ section_level=1,
+ start_page=1,
+ end_page=2,
+ pages=[SAMPLE_TEXT, SECOND_PAGE_TEXT],
+ book=Book(
+ id=1,
+ title="Programming Languages",
+ author="John Doe",
+ published=datetime(2025, 1, 1, 12, 0, 0),
+ ),
+ )
+ metadata = item.as_payload()
+ metadata["tags"] = {"bla", "tag1", "tag2"}
+ expected = [
+ (item.content.strip(), cast(list[str], []), metadata | {"type": "section"}),
+ ("test summary", [], metadata | {"type": "summary"}),
+ ]
+
+ mock_voyage_client.embed = Mock(return_value=Mock(embeddings=[[0.1] * 1024] * 3))
+ mock_voyage_client.multimodal_embed = Mock(
+ return_value=Mock(embeddings=[[0.1] * 1024] * 3)
+ )
+ compare_chunks(item.data_chunks(), expected)
+ compare_chunks(embed_source_item(item), expected)
+
+ assert mock_voyage_client.embed.call_count == 1
+ assert not mock_voyage_client.multimodal_embed.call_count
+
+ assert mock_voyage_client.embed.call_args == call(
+ [item.content.strip(), "test summary"],
+ model=settings.TEXT_EMBEDDING_MODEL,
+ input_type="document",
+ )
+
+
+@pytest.mark.parametrize(
+ "class_, modality",
+ (
+ (BlogPost, "blog"),
+ (ForumPost, "forum"),
+ ),
+)
+def test_post_embeddings_single_page(mock_voyage_client, class_, modality):
+ item = class_(
+ id=1,
+ content=SAMPLE_MARKDOWN,
+ mime_type="text/html",
+ modality=modality,
+ sha256=hashlib.sha256(SAMPLE_MARKDOWN.encode("utf-8")).hexdigest(),
+ size=len(SAMPLE_MARKDOWN),
+ tags=["bla"],
+ images=[LANG_TIMELINE.filename, CODE_COMPLEXITY.filename], # type: ignore
+ )
+ metadata = item.as_payload()
+ metadata["tags"] = {"bla", "tag1", "tag2"}
+ expected = [
+ (item.content.strip(), [LANG_TIMELINE_HASH, CODE_COMPLEXITY_HASH], metadata),
+ ]
+
+ mock_voyage_client.embed = Mock(return_value=Mock(embeddings=[[0.1] * 1024] * 3))
+ mock_voyage_client.multimodal_embed = Mock(
+ return_value=Mock(embeddings=[[0.1] * 1024] * 3)
+ )
+ compare_chunks(item.data_chunks(), expected)
+ compare_chunks(embed_source_item(item), expected)
+
+ assert not mock_voyage_client.embed.call_count
+ assert mock_voyage_client.multimodal_embed.call_count == 1
+
+ assert mock_voyage_client.multimodal_embed.call_args == call(
+ [[item.content.strip(), ANY, ANY]],
+ model=settings.MIXED_EMBEDDING_MODEL,
+ input_type="document",
+ )
+ assert [
+ image_hash(i)
+ for i in mock_voyage_client.multimodal_embed.call_args[0][0][0][1:]
+ ] == [LANG_TIMELINE_HASH, CODE_COMPLEXITY_HASH]
+
+
+@pytest.mark.parametrize(
+ "class_, modality",
+ (
+ (BlogPost, "blog"),
+ (ForumPost, "forum"),
+ ),
+)
+def test_post_embeddings_multi_page(mock_voyage_client, class_, modality):
+ item = class_(
+ id=1,
+ content=SAMPLE_MARKDOWN + "\n\n" + SECOND_PAGE_MARKDOWN,
+ mime_type="text/html",
+ modality=modality,
+ sha256=hashlib.sha256(SAMPLE_MARKDOWN.encode("utf-8")).hexdigest(),
+ size=len(SAMPLE_MARKDOWN + SECOND_PAGE_MARKDOWN),
+ tags=["bla"],
+ images=[LANG_TIMELINE.filename, CODE_COMPLEXITY.filename], # type: ignore
+ )
+ metadata = item.as_payload()
+ metadata["tags"] = {"bla", "tag1", "tag2"}
+
+ all_contents = (
+ item.content.strip(),
+ [LANG_TIMELINE_HASH, CODE_COMPLEXITY_HASH],
+ metadata,
+ )
+ first_chunk = (
+ TWO_PAGE_CHUNKS[0].strip(),
+ [LANG_TIMELINE_HASH, CODE_COMPLEXITY_HASH],
+ metadata,
+ )
+ second_chunk = (TWO_PAGE_CHUNKS[1].strip(), [], metadata)
+ third_chunk = (TWO_PAGE_CHUNKS[2].strip(), [], metadata)
+ summary = ("test summary", [], metadata)
+
+ mock_voyage_client.embed = Mock(return_value=Mock(embeddings=[[0.1] * 1024] * 3))
+ mock_voyage_client.multimodal_embed = Mock(
+ return_value=Mock(embeddings=[[0.1] * 1024] * 3)
+ )
+ compare_chunks(
+ item.data_chunks(),
+ [all_contents, first_chunk, second_chunk, third_chunk, summary],
+ )
+    # embed_source_item embeds all text-only chunks first, then the mixed
+    # (text + image) chunks, so the chunk order differs from data_chunks().
+ compare_chunks(
+ embed_source_item(item),
+ [
+ second_chunk,
+ third_chunk,
+ summary,
+ all_contents,
+ first_chunk,
+ ],
+ )
+
+ assert mock_voyage_client.embed.call_count == 1
+ assert mock_voyage_client.multimodal_embed.call_count == 1
+
+ assert mock_voyage_client.embed.call_args == call(
+ [
+ TWO_PAGE_CHUNKS[1].strip(),
+ TWO_PAGE_CHUNKS[2].strip(),
+ "test summary",
+ ],
+ model=settings.TEXT_EMBEDDING_MODEL,
+ input_type="document",
+ )
+ assert mock_voyage_client.multimodal_embed.call_args == call(
+ [[item.content.strip(), ANY, ANY], [TWO_PAGE_CHUNKS[0].strip(), ANY, ANY]],
+ model=settings.MIXED_EMBEDDING_MODEL,
+ input_type="document",
+ )
+ assert [
+ image_hash(i)
+ for i in mock_voyage_client.multimodal_embed.call_args[0][0][0][1:]
+ ] == [LANG_TIMELINE_HASH, CODE_COMPLEXITY_HASH]
+ assert [
+ image_hash(i)
+ for i in mock_voyage_client.multimodal_embed.call_args[0][0][1][1:]
+ ] == [LANG_TIMELINE_HASH, CODE_COMPLEXITY_HASH]
+
+
+def test_agent_observation_embeddings(mock_voyage_client):
+ item = AgentObservation(
+ id=1,
+ content="The user thinks that all men must die.",
+ mime_type="text/html",
+ modality="observation",
+ sha256=hashlib.sha256(SAMPLE_MARKDOWN.encode("utf-8")).hexdigest(),
+ size=len(SAMPLE_MARKDOWN),
+ tags=["bla"],
+ observation_type="belief",
+ subject="humans",
+ confidence=0.8,
+ evidence={
+ "quote": "All humans are mortal.",
+ "source": "https://en.wikipedia.org/wiki/Human",
+ },
+ agent_model="gpt-4o",
+ inserted_at=datetime(2025, 1, 1, 12, 0, 0),
+ )
+ metadata = item.as_payload()
+ metadata["tags"] = {"bla"}
+ expected = [
+ (
+ "Subject: humans | Type: belief | Observation: The user thinks that all men must die. | Quote: All humans are mortal.",
+ [],
+ metadata | {"embedding_type": "semantic"},
+ ),
+ (
+ "Time: 12:00 on Wednesday (afternoon) | Subject: humans | Observation: The user thinks that all men must die. | Confidence: 0.8",
+ [],
+ metadata | {"embedding_type": "temporal"},
+ ),
+ (
+ "The user thinks that all men must die.",
+ [],
+ metadata | {"embedding_type": "semantic"},
+ ),
+ ("All humans are mortal.", [], metadata | {"embedding_type": "semantic"}),
+ ]
+
+ mock_voyage_client.embed = Mock(return_value=Mock(embeddings=[[0.1] * 1024] * 3))
+ mock_voyage_client.multimodal_embed = Mock(
+ return_value=Mock(embeddings=[[0.1] * 1024] * 3)
+ )
+ compare_chunks(item.data_chunks(), expected)
+ compare_chunks(embed_source_item(item), expected)
+
+ assert mock_voyage_client.embed.call_count == 1
+ assert not mock_voyage_client.multimodal_embed.call_count
+
+ assert mock_voyage_client.embed.call_args == call(
+ [
+ "Subject: humans | Type: belief | Observation: The user thinks that all men must die. | Quote: All humans are mortal.",
+ "Time: 12:00 on Wednesday (afternoon) | Subject: humans | Observation: The user thinks that all men must die. | Confidence: 0.8",
+ "The user thinks that all men must die.",
+ "All humans are mortal.",
+ ],
+ model=settings.TEXT_EMBEDDING_MODEL,
+ input_type="document",
+ )
diff --git a/tests/memory/common/db/models/test_source_items.py b/tests/memory/common/db/models/test_source_items.py
index 1a1bf55..aacd0c9 100644
--- a/tests/memory/common/db/models/test_source_items.py
+++ b/tests/memory/common/db/models/test_source_items.py
@@ -14,7 +14,6 @@ from memory.common.db.models.source_items import (
BlogPost,
AgentObservation,
)
-from memory.common.db.models.source_item import merge_metadata
@pytest.fixture
@@ -356,7 +355,8 @@ def test_book_section_data_chunks(pages, expected_chunks):
chunks = book_section.data_chunks()
expected = [
- (c, merge_metadata(book_section.as_payload(), m)) for c, m in expected_chunks
+ (c, extract.merge_metadata(book_section.as_payload(), m))
+ for c, m in expected_chunks
]
assert [(c.content, c.item_metadata) for c in chunks] == expected
for c in chunks: