diff --git a/requirements-api.txt b/requirements-api.txt index fa6732f..aba6011 100644 --- a/requirements-api.txt +++ b/requirements-api.txt @@ -3,4 +3,5 @@ uvicorn==0.29.0 python-jose==3.3.0 python-multipart==0.0.9 sqladmin -mcp==1.9.2 \ No newline at end of file +mcp==1.9.2 +bm25s[full]==0.2.13 \ No newline at end of file diff --git a/requirements-common.txt b/requirements-common.txt index dbf0da5..d07028b 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -6,5 +6,5 @@ dotenv==0.9.9 voyageai==0.3.2 qdrant-client==1.9.0 anthropic==0.18.1 - -bm25s[full]==0.2.13 \ No newline at end of file +# Pin the httpx version, as newer versions break the anthropic client +httpx==0.27.0 \ No newline at end of file diff --git a/run_celery_task.py b/run_celery_task.py index e9a13f2..146c230 100644 --- a/run_celery_task.py +++ b/run_celery_task.py @@ -38,6 +38,7 @@ from memory.workers.tasks.maintenance import ( CLEAN_COLLECTION, REINGEST_CHUNK, REINGEST_EMPTY_SOURCE_ITEMS, + REINGEST_ALL_EMPTY_SOURCE_ITEMS, REINGEST_ITEM, REINGEST_MISSING_CHUNKS, UPDATE_METADATA_FOR_ITEM, @@ -67,6 +68,7 @@ TASK_MAPPINGS = { "reingest_chunk": REINGEST_CHUNK, "reingest_item": REINGEST_ITEM, "reingest_empty_source_items": REINGEST_EMPTY_SOURCE_ITEMS, + "reingest_all_empty_source_items": REINGEST_ALL_EMPTY_SOURCE_ITEMS, "update_metadata_for_item": UPDATE_METADATA_FOR_ITEM, "update_metadata_for_source_items": UPDATE_METADATA_FOR_SOURCE_ITEMS, }, @@ -316,6 +318,13 @@ def maintenance_reingest_empty_source_items(ctx, item_type): execute_task(ctx, "maintenance", "reingest_empty_source_items", item_type=item_type) +@maintenance.command("reingest-all-empty-source-items") +@click.pass_context +def maintenance_reingest_all_empty_source_items(ctx): + """Reingest all empty source items.""" + execute_task(ctx, "maintenance", "reingest_all_empty_source_items") + + @maintenance.command("reingest-chunk") @click.option("--chunk-id", required=True, help="Chunk ID to reingest") @click.pass_context diff --git a/src/memory/api/MCP/tools.py b/src/memory/api/MCP/tools.py index 77ba76b..f68533c 100644 --- a/src/memory/api/MCP/tools.py +++ b/src/memory/api/MCP/tools.py @@ -15,14 +15,60 @@ from memory.common.db.connection import make_session from memory.common import extract from memory.common.db.models import AgentObservation -from memory.api.search import search, SearchFilters +from memory.api.search.search import search, SearchFilters from memory.common.formatters import observation from memory.workers.tasks.content_processing import process_content_item +from memory.common.collections import ALL_COLLECTIONS, OBSERVATION_COLLECTIONS logger = logging.getLogger(__name__) # Create MCP server instance -mcp = FastMCP("memory", stateless=True) +mcp = FastMCP("memory", stateless_http=True) + + +def filter_observation_source_ids( + tags: list[str] | None = None, observation_types: list[str] | None = None +): + if not tags and not observation_types: + return None + + with make_session() as session: + items_query = session.query(AgentObservation.id) + + if tags: + # Use PostgreSQL array overlap operator with proper array casting + items_query = items_query.filter( + AgentObservation.tags.op("&&")(sql_cast(tags, ARRAY(Text))), + ) + if observation_types: + items_query = items_query.filter( + AgentObservation.observation_type.in_(observation_types) + ) + source_ids = [item.id for item in items_query.all()] + + return source_ids + + +def filter_source_ids( + modalities: set[str], + tags: list[str] | None = None, +): + if not tags: + return 
None + + with make_session() as session: + items_query = session.query(SourceItem.id) + + if tags: + # Use PostgreSQL array overlap operator with proper array casting + items_query = items_query.filter( + SourceItem.tags.op("&&")(sql_cast(tags, ARRAY(Text))), + ) + if modalities: + items_query = items_query.filter(SourceItem.modality.in_(modalities)) + source_ids = [item.id for item in items_query.all()] + + return source_ids @mcp.tool() @@ -48,20 +94,6 @@ async def get_all_tags() -> list[str]: - Projects: "project:website-redesign" - Contexts: "context:work", "context:late-night" - Domains: "domain:finance" - - Example: - # Get all tags to ensure consistency - tags = await get_all_tags() - # Returns: ["ai-safety", "context:work", "functional-programming", - # "machine-learning", "project:thesis", ...] - - # Use to check if a topic has been discussed before - if "quantum-computing" in tags: - # Search for related observations - observations = await search_observations( - query="quantum computing", - tags=["quantum-computing"] - ) """ with make_session() as session: tags_query = session.query(func.unnest(SourceItem.tags)).distinct() @@ -93,26 +125,6 @@ async def get_all_subjects() -> list[str]: - "ai_beliefs", "ai_safety_beliefs" - "learning_preferences" - "communication_style" - - Example: - # Get all subjects to ensure consistency - subjects = await get_all_subjects() - # Returns: ["ai_safety_beliefs", "architecture_preferences", - # "programming_philosophy", "work_schedule", ...] - - # Use to check what we know about the user - if "programming_style" in subjects: - # Get all programming-related observations - observations = await search_observations( - query="programming", - subject="programming_style" - ) - - Best practices: - - Always check existing subjects before creating new ones - - Use snake_case for consistency - - Be specific but not too granular - - Group related observations under same subject """ with make_session() as session: return sorted( @@ -146,20 +158,6 @@ async def get_all_observation_types() -> list[str]: Returns: List of observation types that have actually been used in the system. - - Example: - # Check what types of observations exist - types = await get_all_observation_types() - # Returns: ["behavior", "belief", "contradiction", "preference"] - - # Use to analyze observation distribution - for obs_type in types: - observations = await search_observations( - query="", - observation_types=[obs_type], - limit=100 - ) - print(f"{obs_type}: {len(observations)} observations") """ with make_session() as session: return sorted( @@ -173,7 +171,11 @@ async def get_all_observation_types() -> list[str]: @mcp.tool() async def search_knowledge_base( - query: str, previews: bool = False, modalities: list[str] = [], limit: int = 10 + query: str, + previews: bool = False, + modalities: set[str] = set(), + tags: list[str] = [], + limit: int = 10, ) -> list[dict]: """ Search through the user's stored knowledge and content. 
@@ -283,14 +285,22 @@ async def search_knowledge_base( """ logger.info(f"MCP search for: {query}") + if not modalities: + modalities = set(ALL_COLLECTIONS.keys()) + modalities = set(modalities) & ALL_COLLECTIONS.keys() - OBSERVATION_COLLECTIONS + upload_data = extract.extract_text(query) results = await search( upload_data, previews=previews, modalities=modalities, limit=limit, - min_text_score=0.3, - min_multimodal_score=0.3, + min_text_score=0.4, + min_multimodal_score=0.25, + filters=SearchFilters( + tags=tags, + source_ids=filter_source_ids(tags=tags, modalities=modalities), + ), ) # Convert SearchResult objects to dictionaries for MCP @@ -456,12 +466,13 @@ async def observe( mime_type="text/plain", sha256=sha256(f"{content}{subject}{observation_type}".encode("utf-8")).digest(), inserted_at=datetime.now(timezone.utc), + modality="observation", ) try: with make_session() as session: process_content_item(observation, session) - if not observation.id: + if not cast(int | None, observation.id): raise ValueError("Observation not created") logger.info( @@ -600,24 +611,6 @@ async def search_observations( - Higher confidence observations are more reliable - Recent observations may override older ones on same topic """ - source_ids = None - if tags or observation_types: - with make_session() as session: - items_query = session.query(AgentObservation.id) - - if tags: - # Use PostgreSQL array overlap operator with proper array casting - items_query = items_query.filter( - AgentObservation.tags.op("&&")(sql_cast(tags, ARRAY(Text))), - ) - if observation_types: - items_query = items_query.filter( - AgentObservation.observation_type.in_(observation_types) - ) - source_ids = [item.id for item in items_query.all()] - if not source_ids: - return [] - semantic_text = observation.generate_semantic_text( subject=subject or "", observation_type="".join(observation_types or []), @@ -637,18 +630,24 @@ async def search_observations( extract.DataChunk(data=[temporal]), ], previews=True, - modalities=["semantic", "temporal"], + modalities={"semantic", "temporal"}, limit=limit, - min_text_score=0.8, filters=SearchFilters( subject=subject, confidence=min_confidence, tags=tags, observation_types=observation_types, - source_ids=source_ids, + source_ids=filter_observation_source_ids(tags=tags), ), + timeout=2, ) return [ - cast(dict, cast(dict, result.model_dump()).get("content")) for result in results + { + "content": r.content, + "tags": r.tags, + "created_at": r.created_at.isoformat() if r.created_at else None, + "metadata": r.metadata, + } + for r in results ] diff --git a/src/memory/api/app.py b/src/memory/api/app.py index 4f37c39..51a9d22 100644 --- a/src/memory/api/app.py +++ b/src/memory/api/app.py @@ -3,6 +3,7 @@ FastAPI application for the knowledge base. 
""" import contextlib +import os import pathlib import logging from typing import Annotated, Optional @@ -105,12 +106,16 @@ def get_file_by_path(path: str): return FileResponse(path=file_path, filename=file_path.name) -def main(): +def main(reload: bool = False): """Run the FastAPI server in debug mode with auto-reloading.""" import uvicorn uvicorn.run( - "memory.api.app:app", host="0.0.0.0", port=8000, reload=True, log_level="debug" + "memory.api.app:app", + host="0.0.0.0", + port=8000, + reload=reload, + log_level="debug", ) @@ -118,4 +123,4 @@ if __name__ == "__main__": from memory.common.qdrant import setup_qdrant setup_qdrant() - main() + main(os.getenv("RELOAD", "false") == "true") diff --git a/src/memory/api/search.py b/src/memory/api/search.py deleted file mode 100644 index 6b4b501..0000000 --- a/src/memory/api/search.py +++ /dev/null @@ -1,382 +0,0 @@ -""" -Search endpoints for the knowledge base API. -""" - -import asyncio -import base64 -from hashlib import sha256 -import io -import logging -from collections import defaultdict -from typing import Any, Callable, Optional, TypedDict, NotRequired - -import bm25s -import Stemmer -import qdrant_client -from PIL import Image -from pydantic import BaseModel -from qdrant_client.http import models as qdrant_models - -from memory.common import embedding, extract, qdrant, settings -from memory.common.collections import ( - ALL_COLLECTIONS, - MULTIMODAL_COLLECTIONS, - TEXT_COLLECTIONS, -) -from memory.common.db.connection import make_session -from memory.common.db.models import Chunk - -logger = logging.getLogger(__name__) - - -class AnnotatedChunk(BaseModel): - id: str - score: float - metadata: dict - preview: Optional[str | None] = None - - -class SourceData(BaseModel): - """Holds source item data to avoid SQLAlchemy session issues""" - - id: int - size: int | None - mime_type: str | None - filename: str | None - content: str | dict | None - content_length: int - - -class SearchResponse(BaseModel): - collection: str - results: list[dict] - - -class SearchResult(BaseModel): - id: int - size: int - mime_type: str - chunks: list[AnnotatedChunk] - content: Optional[str | dict] = None - filename: Optional[str] = None - - -class SearchFilters(TypedDict): - subject: NotRequired[str | None] - confidence: NotRequired[float] - tags: NotRequired[list[str] | None] - observation_types: NotRequired[list[str] | None] - source_ids: NotRequired[list[int] | None] - - -async def with_timeout( - call, timeout: int = 2 -) -> list[tuple[SourceData, AnnotatedChunk]]: - """ - Run a function with a timeout. 
- - Args: - call: The function to run - timeout: The timeout in seconds - """ - try: - return await asyncio.wait_for(call, timeout=timeout) - except TimeoutError: - logger.warning(f"Search timed out after {timeout}s") - return [] - except Exception as e: - logger.error(f"Search failed: {e}") - return [] - - -def annotated_chunk( - chunk: Chunk, search_result: qdrant_models.ScoredPoint, previews: bool -) -> tuple[SourceData, AnnotatedChunk]: - def serialize_item(item: bytes | str | Image.Image) -> str | None: - if not previews and not isinstance(item, str): - return None - if not previews and isinstance(item, str): - return item[:100] - - if isinstance(item, Image.Image): - buffer = io.BytesIO() - format = item.format or "PNG" - item.save(buffer, format=format) - mime_type = f"image/{format.lower()}" - return f"data:{mime_type};base64,{base64.b64encode(buffer.getvalue()).decode('utf-8')}" - elif isinstance(item, bytes): - return base64.b64encode(item).decode("utf-8") - elif isinstance(item, str): - return item - else: - raise ValueError(f"Unsupported item type: {type(item)}") - - metadata = search_result.payload or {} - metadata = { - k: v - for k, v in metadata.items() - if k not in ["content", "filename", "size", "content_type", "tags"] - } - - # Prefetch all needed source data while in session - source = chunk.source - source_data = SourceData( - id=source.id, - size=source.size, - mime_type=source.mime_type, - filename=source.filename, - content=source.display_contents, - content_length=len(source.content) if source.content else 0, - ) - - return source_data, AnnotatedChunk( - id=str(chunk.id), - score=search_result.score, - metadata=metadata, - preview=serialize_item(chunk.data[0]) if chunk.data else None, - ) - - -def group_chunks(chunks: list[tuple[SourceData, AnnotatedChunk]]) -> list[SearchResult]: - items = defaultdict(list) - source_lookup = {} - - for source, chunk in chunks: - items[source.id].append(chunk) - source_lookup[source.id] = source - - return [ - SearchResult( - id=source.id, - size=source.size or source.content_length, - mime_type=source.mime_type or "text/plain", - filename=source.filename - and source.filename.replace( - str(settings.FILE_STORAGE_DIR).lstrip("/"), "/files" - ), - content=source.content, - chunks=sorted(chunks, key=lambda x: x.score, reverse=True), - ) - for source_id, chunks in items.items() - for source in [source_lookup[source_id]] - ] - - -def query_chunks( - client: qdrant_client.QdrantClient, - upload_data: list[extract.DataChunk], - allowed_modalities: set[str], - embedder: Callable, - min_score: float = 0.0, - limit: int = 10, - filters: dict[str, Any] | None = None, -) -> dict[str, list[qdrant_models.ScoredPoint]]: - if not upload_data or not allowed_modalities: - return {} - - chunks = [chunk for chunk in upload_data if chunk.data] - if not chunks: - logger.error(f"No chunks to embed for {allowed_modalities}") - return {} - - logger.error(f"Embedding {len(chunks)} chunks for {allowed_modalities}") - for c in chunks: - logger.error(f"Chunk: {c.data}") - vectors = embedder([c.data for c in chunks], input_type="query") - - return { - collection: [ - r - for vector in vectors - for r in qdrant.search_vectors( - client=client, - collection_name=collection, - query_vector=vector, - limit=limit, - filter_params=filters, - ) - if r.score >= min_score - ] - for collection in allowed_modalities - } - - -async def search_bm25( - query: str, - modalities: list[str], - limit: int = 10, - filters: SearchFilters = SearchFilters(), -) -> 
list[tuple[SourceData, AnnotatedChunk]]: - with make_session() as db: - items_query = db.query(Chunk.id, Chunk.content).filter( - Chunk.collection_name.in_(modalities) - ) - if source_ids := filters.get("source_ids"): - items_query = items_query.filter(Chunk.source_id.in_(source_ids)) - items = items_query.all() - item_ids = { - sha256(item.content.lower().strip().encode("utf-8")).hexdigest(): item.id - for item in items - } - corpus = [item.content.lower().strip() for item in items] - - stemmer = Stemmer.Stemmer("english") - corpus_tokens = bm25s.tokenize(corpus, stopwords="en", stemmer=stemmer) - retriever = bm25s.BM25() - retriever.index(corpus_tokens) - - query_tokens = bm25s.tokenize(query, stemmer=stemmer) - results, scores = retriever.retrieve( - query_tokens, k=min(limit, len(corpus)), corpus=corpus - ) - - item_scores = { - item_ids[sha256(doc.encode("utf-8")).hexdigest()]: score - for doc, score in zip(results[0], scores[0]) - } - - with make_session() as db: - chunks = db.query(Chunk).filter(Chunk.id.in_(item_scores.keys())).all() - results = [] - for chunk in chunks: - # Prefetch all needed source data while in session - source = chunk.source - source_data = SourceData( - id=source.id, - size=source.size, - mime_type=source.mime_type, - filename=source.filename, - content=source.display_contents, - content_length=len(source.content) if source.content else 0, - ) - - annotated = AnnotatedChunk( - id=str(chunk.id), - score=item_scores[chunk.id], - metadata=source.as_payload(), - preview=None, - ) - results.append((source_data, annotated)) - - return results - - -async def search_embeddings( - data: list[extract.DataChunk], - previews: Optional[bool] = False, - modalities: set[str] = set(), - limit: int = 10, - min_score: float = 0.3, - filters: SearchFilters = SearchFilters(), - multimodal: bool = False, -) -> list[tuple[SourceData, AnnotatedChunk]]: - """ - Search across knowledge base using text query and optional files. 
- - Parameters: - - data: List of data to search in (e.g., text, images, files) - - previews: Whether to include previews in the search results - - modalities: List of modalities to search in (e.g., "text", "photo", "doc") - - limit: Maximum number of results - - min_score: Minimum score to include in the search results - - filters: Filters to apply to the search results - - multimodal: Whether to search in multimodal collections - """ - query_filters = { - "must": [ - {"key": "confidence", "range": {"gte": filters.get("confidence", 0.5)}}, - ], - } - if tags := filters.get("tags"): - query_filters["must"] += [{"key": "tags", "match": {"any": tags}}] - if observation_types := filters.get("observation_types"): - query_filters["must"] += [ - {"key": "observation_type", "match": {"any": observation_types}} - ] - - client = qdrant.get_qdrant_client() - results = query_chunks( - client, - data, - modalities, - embedding.embed_text if not multimodal else embedding.embed_mixed, - min_score=min_score, - limit=limit, - filters=query_filters, - ) - search_results = {k: results.get(k, []) for k in modalities} - - found_chunks = { - str(r.id): r for results in search_results.values() for r in results - } - with make_session() as db: - chunks = db.query(Chunk).filter(Chunk.id.in_(found_chunks.keys())).all() - return [ - annotated_chunk(chunk, found_chunks[str(chunk.id)], previews or False) - for chunk in chunks - ] - - -async def search( - data: list[extract.DataChunk], - previews: Optional[bool] = False, - modalities: list[str] = [], - limit: int = 10, - min_text_score: float = 0.3, - min_multimodal_score: float = 0.3, - filters: SearchFilters = {}, -) -> list[SearchResult]: - """ - Search across knowledge base using text query and optional files. - - Parameters: - - query: Optional text search query - - modalities: List of modalities to search in (e.g., "text", "photo", "doc") - - files: Optional files to include in the search context - - limit: Maximum number of results per modality - - Returns: - - List of search results sorted by score - """ - allowed_modalities = set(modalities or ALL_COLLECTIONS.keys()) - - text_embeddings_results = with_timeout( - search_embeddings( - data, - previews, - allowed_modalities & TEXT_COLLECTIONS, - limit, - min_text_score, - filters, - multimodal=False, - ) - ) - multimodal_embeddings_results = with_timeout( - search_embeddings( - data, - previews, - allowed_modalities & MULTIMODAL_COLLECTIONS, - limit, - min_multimodal_score, - filters, - multimodal=True, - ) - ) - bm25_results = with_timeout( - search_bm25( - " ".join([c for chunk in data for c in chunk.data if isinstance(c, str)]), - modalities, - limit=limit, - filters=filters, - ) - ) - - results = await asyncio.gather( - text_embeddings_results, - multimodal_embeddings_results, - bm25_results, - return_exceptions=False, - ) - - results = group_chunks([c for r in results for c in r]) - return sorted(results, key=lambda x: max(c.score for c in x.chunks), reverse=True) diff --git a/src/memory/api/search/__init__.py b/src/memory/api/search/__init__.py new file mode 100644 index 0000000..f3b1cf1 --- /dev/null +++ b/src/memory/api/search/__init__.py @@ -0,0 +1,4 @@ +from .search import search +from .utils import SearchResult, SearchFilters + +__all__ = ["search", "SearchResult", "SearchFilters"] diff --git a/src/memory/api/search/bm25.py b/src/memory/api/search/bm25.py new file mode 100644 index 0000000..9acb098 --- /dev/null +++ b/src/memory/api/search/bm25.py @@ -0,0 +1,68 @@ +""" +Search endpoints for the 
knowledge base API. +""" + +from hashlib import sha256 +import logging + +import bm25s +import Stemmer +from memory.api.search.utils import SourceData, AnnotatedChunk, SearchFilters + +from memory.common.db.connection import make_session +from memory.common.db.models import Chunk + +logger = logging.getLogger(__name__) + + +async def search_bm25( + query: str, + modalities: set[str], + limit: int = 10, + filters: SearchFilters = SearchFilters(), +) -> list[tuple[SourceData, AnnotatedChunk]]: + with make_session() as db: + items_query = db.query(Chunk.id, Chunk.content).filter( + Chunk.collection_name.in_(modalities) + ) + if source_ids := filters.get("source_ids"): + items_query = items_query.filter(Chunk.source_id.in_(source_ids)) + items = items_query.all() + item_ids = { + sha256(item.content.lower().strip().encode("utf-8")).hexdigest(): item.id + for item in items + } + corpus = [item.content.lower().strip() for item in items] + + stemmer = Stemmer.Stemmer("english") + corpus_tokens = bm25s.tokenize(corpus, stopwords="en", stemmer=stemmer) + retriever = bm25s.BM25() + retriever.index(corpus_tokens) + + query_tokens = bm25s.tokenize(query, stemmer=stemmer) + results, scores = retriever.retrieve( + query_tokens, k=min(limit, len(corpus)), corpus=corpus + ) + + item_scores = { + item_ids[sha256(doc.encode("utf-8")).hexdigest()]: score + for doc, score in zip(results[0], scores[0]) + } + + with make_session() as db: + chunks = db.query(Chunk).filter(Chunk.id.in_(item_scores.keys())).all() + results = [] + for chunk in chunks: + # Prefetch all needed source data while in session + source_data = SourceData.from_chunk(chunk) + + annotated = AnnotatedChunk( + id=str(chunk.id), + score=item_scores[chunk.id], + metadata=chunk.source.as_payload(), + preview=None, + search_method="bm25", + ) + results.append((source_data, annotated)) + + return results diff --git a/src/memory/api/search/embeddings.py b/src/memory/api/search/embeddings.py new file mode 100644 index 0000000..132b6bb --- /dev/null +++ b/src/memory/api/search/embeddings.py @@ -0,0 +1,144 @@ +import base64 +import io +import logging +from typing import Any, Callable, Optional + +import qdrant_client +from PIL import Image +from qdrant_client.http import models as qdrant_models + +from memory.common import embedding, extract, qdrant +from memory.common.db.connection import make_session +from memory.common.db.models import Chunk +from memory.api.search.utils import SourceData, AnnotatedChunk, SearchFilters + +logger = logging.getLogger(__name__) + + +def annotated_chunk( + chunk: Chunk, search_result: qdrant_models.ScoredPoint, previews: bool +) -> tuple[SourceData, AnnotatedChunk]: + def serialize_item(item: bytes | str | Image.Image) -> str | None: + if not previews and not isinstance(item, str): + return None + if not previews and isinstance(item, str): + return item[:100] + + if isinstance(item, Image.Image): + buffer = io.BytesIO() + format = item.format or "PNG" + item.save(buffer, format=format) + mime_type = f"image/{format.lower()}" + return f"data:{mime_type};base64,{base64.b64encode(buffer.getvalue()).decode('utf-8')}" + elif isinstance(item, bytes): + return base64.b64encode(item).decode("utf-8") + elif isinstance(item, str): + return item + else: + raise ValueError(f"Unsupported item type: {type(item)}") + + metadata = search_result.payload or {} + metadata = { + k: v + for k, v in metadata.items() + if k not in ["content", "filename", "size", "content_type", "tags"] + } + + # Prefetch all needed source data while in session 
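+    # (from_chunk copies the needed ORM fields into a plain pydantic object,
+    # so the result stays usable after the SQLAlchemy session is closed.)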
+    return SourceData.from_chunk(chunk), AnnotatedChunk(
+        id=str(chunk.id),
+        score=search_result.score,
+        metadata=metadata,
+        preview=serialize_item(chunk.data[0]) if chunk.data else None,
+        search_method="embeddings",
+    )
+
+
+def query_chunks(
+    client: qdrant_client.QdrantClient,
+    upload_data: list[extract.DataChunk],
+    allowed_modalities: set[str],
+    embedder: Callable,
+    min_score: float = 0.3,
+    limit: int = 10,
+    filters: dict[str, Any] | None = None,
+) -> dict[str, list[qdrant_models.ScoredPoint]]:
+    if not upload_data or not allowed_modalities:
+        return {}
+
+    chunks = [chunk for chunk in upload_data if chunk.data]
+    if not chunks:
+        logger.error(f"No chunks to embed for {allowed_modalities}")
+        return {}
+
+    vectors = embedder(chunks, input_type="query")
+
+    return {
+        collection: [
+            r
+            for vector in vectors
+            for r in qdrant.search_vectors(
+                client=client,
+                collection_name=collection,
+                query_vector=vector,
+                limit=limit,
+                filter_params=filters,
+            )
+            if r.score >= min_score
+        ]
+        for collection in allowed_modalities
+    }
+
+
+async def search_embeddings(
+    data: list[extract.DataChunk],
+    previews: Optional[bool] = False,
+    modalities: set[str] = set(),
+    limit: int = 10,
+    min_score: float = 0.3,
+    filters: SearchFilters = SearchFilters(),
+    multimodal: bool = False,
+) -> list[tuple[SourceData, AnnotatedChunk]]:
+    """
+    Search across the knowledge base using embedding similarity.
+
+    Parameters:
+    - data: List of data to search in (e.g., text, images, files)
+    - previews: Whether to include previews in the search results
+    - modalities: Set of modalities to search in (e.g., "text", "photo", "doc")
+    - limit: Maximum number of results
+    - min_score: Minimum score to include in the search results
+    - filters: Filters to apply to the search results
+    - multimodal: Whether to search in multimodal collections
+    """
+    query_filters: dict[str, list[dict]] = {"must": []}
+    if confidence := filters.get("confidence"):
+        query_filters["must"].append(
+            {"key": "confidence", "range": {"gte": confidence}}
+        )
+    if tags := filters.get("tags"):
+        query_filters["must"].append({"key": "tags", "match": {"any": tags}})
+    if observation_types := filters.get("observation_types"):
+        query_filters["must"].append(
+            {"key": "observation_type", "match": {"any": observation_types}}
+        )
+
+    client = qdrant.get_qdrant_client()
+    results = query_chunks(
+        client,
+        data,
+        modalities,
+        embedding.embed_text if not multimodal else embedding.embed_mixed,
+        min_score=min_score,
+        limit=limit,
+        filters=query_filters if query_filters["must"] else None,
+    )
+    search_results = {k: results.get(k, []) for k in modalities}
+
+    found_chunks = {
+        str(r.id): r for results in search_results.values() for r in results
+    }
+    with make_session() as db:
+        chunks = db.query(Chunk).filter(Chunk.id.in_(found_chunks.keys())).all()
+        return [
+            annotated_chunk(chunk, found_chunks[str(chunk.id)], previews or False)
+            for chunk in chunks
+        ]
diff --git a/src/memory/api/search/search.py b/src/memory/api/search/search.py
new file mode 100644
index 0000000..2e07d5d
--- /dev/null
+++ b/src/memory/api/search/search.py
@@ -0,0 +1,94 @@
+"""
+Search endpoints for the knowledge base API.
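+
+Combines embedding search (over text and multimodal collections) with BM25
+keyword search; BM25 hits are only appended when the embedding searches
+return fewer than `limit` results.
+
+A minimal usage sketch (the query string and modality names are illustrative,
+not part of this change):
+
+    from memory.api.search import search
+    from memory.common import extract
+
+    results = await search(
+        extract.extract_text("functional programming"),
+        modalities={"doc", "blog"},
+        limit=5,
+    )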
+""" + +import asyncio +import logging +from typing import Optional + +from memory.api.search.embeddings import search_embeddings +from memory.api.search.bm25 import search_bm25 +from memory.api.search.utils import SearchFilters, SearchResult + +from memory.api.search.utils import group_chunks, with_timeout +from memory.common import extract +from memory.common.collections import ( + ALL_COLLECTIONS, + MULTIMODAL_COLLECTIONS, + TEXT_COLLECTIONS, +) + +logger = logging.getLogger(__name__) + + +async def search( + data: list[extract.DataChunk], + previews: Optional[bool] = False, + modalities: set[str] = set(), + limit: int = 10, + min_text_score: float = 0.4, + min_multimodal_score: float = 0.25, + filters: SearchFilters = {}, + timeout: int = 2, +) -> list[SearchResult]: + """ + Search across knowledge base using text query and optional files. + + Parameters: + - query: Optional text search query + - modalities: List of modalities to search in (e.g., "text", "photo", "doc") + - files: Optional files to include in the search context + - limit: Maximum number of results per modality + + Returns: + - List of search results sorted by score + """ + allowed_modalities = modalities & ALL_COLLECTIONS.keys() + + text_embeddings_results = with_timeout( + search_embeddings( + data, + previews, + allowed_modalities & TEXT_COLLECTIONS, + limit, + min_text_score, + filters, + multimodal=False, + ), + timeout, + ) + multimodal_embeddings_results = with_timeout( + search_embeddings( + data, + previews, + allowed_modalities & MULTIMODAL_COLLECTIONS, + limit, + min_multimodal_score, + filters, + multimodal=True, + ), + timeout, + ) + bm25_results = with_timeout( + search_bm25( + " ".join([c for chunk in data for c in chunk.data if isinstance(c, str)]), + modalities, + limit=limit, + filters=filters, + ), + timeout, + ) + + results = await asyncio.gather( + text_embeddings_results, + multimodal_embeddings_results, + bm25_results, + return_exceptions=False, + ) + text_results, multi_results, bm25_results = results + all_results = text_results + multi_results + if len(all_results) < limit: + all_results += bm25_results + + results = group_chunks(all_results, previews or False) + return sorted(results, key=lambda x: max(c.score for c in x.chunks), reverse=True) diff --git a/src/memory/api/search/utils.py b/src/memory/api/search/utils.py new file mode 100644 index 0000000..cf8fe20 --- /dev/null +++ b/src/memory/api/search/utils.py @@ -0,0 +1,134 @@ +import asyncio +from datetime import datetime +import logging +from collections import defaultdict +from typing import Optional, TypedDict, NotRequired + +from pydantic import BaseModel + +from memory.common import settings +from memory.common.db.models import Chunk + +logger = logging.getLogger(__name__) + + +class AnnotatedChunk(BaseModel): + id: str + score: float + metadata: dict + preview: Optional[str | None] = None + search_method: str | None = None + + +class SourceData(BaseModel): + """Holds source item data to avoid SQLAlchemy session issues""" + + id: int + size: int | None + mime_type: str | None + filename: str | None + content_length: int + contents: dict | None + created_at: datetime | None + + @staticmethod + def from_chunk(chunk: Chunk) -> "SourceData": + source = chunk.source + display_contents = source.display_contents or {} + return SourceData( + id=source.id, + size=source.size, + mime_type=source.mime_type, + filename=source.filename, + content_length=len(source.content) if source.content else 0, + contents=display_contents, + 
created_at=source.inserted_at, + ) + + +class SearchResponse(BaseModel): + collection: str + results: list[dict] + + +class SearchResult(BaseModel): + id: int + size: int + mime_type: str + chunks: list[AnnotatedChunk] + content: Optional[str | dict] = None + filename: Optional[str] = None + tags: list[str] | None = None + metadata: dict | None = None + created_at: datetime | None = None + + +class SearchFilters(TypedDict): + subject: NotRequired[str | None] + confidence: NotRequired[float] + tags: NotRequired[list[str] | None] + observation_types: NotRequired[list[str] | None] + source_ids: NotRequired[list[int] | None] + + +async def with_timeout( + call, timeout: int = 2 +) -> list[tuple[SourceData, AnnotatedChunk]]: + """ + Run a function with a timeout. + + Args: + call: The function to run + timeout: The timeout in seconds + """ + try: + return await asyncio.wait_for(call, timeout=timeout) + except TimeoutError: + logger.warning(f"Search timed out after {timeout}s") + return [] + except Exception as e: + logger.error(f"Search failed: {e}") + return [] + + +def group_chunks( + chunks: list[tuple[SourceData, AnnotatedChunk]], preview: bool = False +) -> list[SearchResult]: + items = defaultdict(list) + source_lookup = {} + + for source, chunk in chunks: + items[source.id].append(chunk) + source_lookup[source.id] = source + + def get_content(text: str | dict | None) -> str | dict | None: + if preview or not text or not isinstance(text, str) or len(text) < 250: + return text + + return text[:250] + "..." + + def make_result(source: SourceData, chunks: list[AnnotatedChunk]) -> SearchResult: + contents = source.contents or {} + tags = contents.pop("tags", []) + content = contents.pop("content", None) + + return SearchResult( + id=source.id, + size=source.size or source.content_length, + mime_type=source.mime_type or "text/plain", + filename=source.filename + and source.filename.replace( + str(settings.FILE_STORAGE_DIR).lstrip("/"), "/files" + ), + content=get_content(content), + tags=tags, + metadata=contents, + chunks=sorted(chunks, key=lambda x: x.score, reverse=True), + created_at=source.created_at, + ) + + return [ + make_result(source, chunks) + for source_id, chunks in items.items() + for source in [source_lookup[source_id]] + ] diff --git a/src/memory/common/collections.py b/src/memory/common/collections.py index f2aee53..98fc2cc 100644 --- a/src/memory/common/collections.py +++ b/src/memory/common/collections.py @@ -102,6 +102,7 @@ TEXT_COLLECTIONS = { MULTIMODAL_COLLECTIONS = { coll for coll, params in ALL_COLLECTIONS.items() if params.get("multimodal") } +OBSERVATION_COLLECTIONS = {"semantic", "temporal"} TYPES = { "doc": ["application/pdf", "application/docx", "application/msword"], diff --git a/src/memory/common/db/models/source_item.py b/src/memory/common/db/models/source_item.py index 56e9a44..798954e 100644 --- a/src/memory/common/db/models/source_item.py +++ b/src/memory/common/db/models/source_item.py @@ -84,7 +84,7 @@ def clean_filename(filename: str) -> str: def image_filenames(chunk_id: str, images: list[Image.Image]) -> list[str]: for i, image in enumerate(images): - if not image.filename: # type: ignore + if not getattr(image, "filename", None): # type: ignore filename = settings.CHUNK_STORAGE_DIR / f"{chunk_id}_{i}.{image.format}" # type: ignore image.save(filename) image.filename = str(filename) # type: ignore @@ -100,16 +100,6 @@ def add_pics(chunk: str, images: list[Image.Image]) -> list[extract.MulitmodalCh ] -def merge_metadata(*metadata: dict[str, Any]) -> 
dict[str, Any]: - final = {} - for m in metadata: - data = m.copy() - if tags := set(data.pop("tags", [])): - final["tags"] = tags | final.get("tags", set()) - final |= data - return final - - def chunk_mixed(content: str, image_paths: Sequence[str]) -> list[extract.DataChunk]: if not content.strip(): return [] @@ -241,14 +231,11 @@ class SourceItem(Base): return [chunk.id for chunk in self.chunks] def _chunk_contents(self) -> Sequence[extract.DataChunk]: - chunks: list[extract.DataChunk] = [] content = cast(str | None, self.content) if content: - chunks = [extract.DataChunk(data=[c]) for c in chunker.chunk_text(content)] - - if content and len(content) > chunker.DEFAULT_CHUNK_TOKENS * 2: - summary, tags = summarizer.summarize(content) - chunks.append(extract.DataChunk(data=[summary], metadata={"tags": tags})) + chunks = extract.extract_text(content) + else: + chunks = [] mime_type = cast(str | None, self.mime_type) if mime_type and mime_type.startswith("image/"): @@ -272,12 +259,14 @@ class SourceItem(Base): file_paths=image_names, collection_name=modality, embedding_model=collections.collection_model(modality, text, images), - item_metadata=merge_metadata(self.as_payload(), data.metadata, metadata), + item_metadata=extract.merge_metadata( + self.as_payload(), data.metadata, metadata + ), ) return chunk def data_chunks(self, metadata: dict[str, Any] = {}) -> Sequence[Chunk]: - return [self._make_chunk(data) for data in self._chunk_contents()] + return [self._make_chunk(data, metadata) for data in self._chunk_contents()] def as_payload(self) -> dict: return { @@ -291,4 +280,7 @@ class SourceItem(Base): return { "tags": self.tags, "size": self.size, + "content": self.content, + "filename": self.filename, + "mime_type": self.mime_type, } diff --git a/src/memory/common/db/models/source_items.py b/src/memory/common/db/models/source_items.py index 985a2a6..1b1dac5 100644 --- a/src/memory/common/db/models/source_items.py +++ b/src/memory/common/db/models/source_items.py @@ -33,7 +33,6 @@ from memory.common.db.models.source_item import ( SourceItem, Chunk, clean_filename, - merge_metadata, chunk_mixed, ) @@ -326,27 +325,24 @@ class BookSection(SourceItem): } return {k: v for k, v in vals.items() if v} - def data_chunks(self, metadata: dict[str, Any] = {}) -> Sequence[Chunk]: + def _chunk_contents(self) -> Sequence[extract.DataChunk]: content = cast(str, self.content.strip()) if not content: return [] if len([p for p in self.pages if p.strip()]) == 1: - return [ - self._make_chunk( - extract.DataChunk(data=[content]), metadata | {"type": "page"} - ) - ] + chunks = extract.extract_text(content, metadata={"type": "page"}) + if len(chunks) > 1: + chunks[-1].metadata["type"] = "summary" + return chunks summary, tags = summarizer.summarize(content) return [ - self._make_chunk( - extract.DataChunk(data=[content]), - merge_metadata(metadata, {"type": "section", "tags": tags}), + extract.DataChunk( + data=[content], metadata={"type": "section", "tags": tags} ), - self._make_chunk( - extract.DataChunk(data=[summary]), - merge_metadata(metadata, {"type": "summary", "tags": tags}), + extract.DataChunk( + data=[summary], metadata={"type": "summary", "tags": tags} ), ] @@ -596,7 +592,7 @@ class AgentObservation(SourceItem): ) semantic_chunk = extract.DataChunk( data=[semantic_text], - metadata=merge_metadata(metadata, {"embedding_type": "semantic"}), + metadata=extract.merge_metadata(metadata, {"embedding_type": "semantic"}), modality="semantic", ) @@ -609,7 +605,7 @@ class AgentObservation(SourceItem): ) 
temporal_chunk = extract.DataChunk( data=[temporal_text], - metadata=merge_metadata(metadata, {"embedding_type": "temporal"}), + metadata=extract.merge_metadata(metadata, {"embedding_type": "temporal"}), modality="temporal", ) @@ -617,14 +613,14 @@ class AgentObservation(SourceItem): self._make_chunk( extract.DataChunk( data=[i], - metadata=merge_metadata(metadata, {"embedding_type": "semantic"}), + metadata=extract.merge_metadata( + metadata, {"embedding_type": "semantic"} + ), modality="semantic", ) ) for i in [ self.content, - self.subject, - self.observation_type, self.evidence.get("quote", ""), ] if i diff --git a/src/memory/common/embedding.py b/src/memory/common/embedding.py index 44ae4ea..694d04d 100644 --- a/src/memory/common/embedding.py +++ b/src/memory/common/embedding.py @@ -1,5 +1,5 @@ import logging -from typing import Iterable, Literal, cast +from typing import Literal, cast import voyageai @@ -15,12 +15,22 @@ from memory.common.db.models import Chunk, SourceItem logger = logging.getLogger(__name__) +def as_string( + chunk: extract.MulitmodalChunk | list[extract.MulitmodalChunk], +) -> str: + if isinstance(chunk, str): + return chunk.strip() + if isinstance(chunk, list): + return "\n".join(as_string(i) for i in chunk).strip() + return "" + + def embed_chunks( chunks: list[list[extract.MulitmodalChunk]], model: str = settings.TEXT_EMBEDDING_MODEL, input_type: Literal["document", "query"] = "document", ) -> list[Vector]: - logger.debug(f"Embedding chunks: {model} - {str(chunks)[:100]} {len(chunks)}") + logger.debug(f"Embedding chunks: {model} - {str(chunks)} {len(chunks)}") vo = voyageai.Client() # type: ignore if model == settings.MIXED_EMBEDDING_MODEL: return vo.multimodal_embed( @@ -29,17 +39,18 @@ def embed_chunks( input_type=input_type, ).embeddings - texts = ["\n".join(i for i in c if isinstance(i, str)) for c in chunks] + texts = [as_string(c) for c in chunks] + logger.debug(f"Embedding texts: {texts}") return cast( list[Vector], vo.embed(texts, model=model, input_type=input_type).embeddings ) def break_chunk( - chunk: list[extract.MulitmodalChunk], chunk_size: int = DEFAULT_CHUNK_TOKENS + chunk: extract.DataChunk, chunk_size: int = DEFAULT_CHUNK_TOKENS ) -> list[extract.MulitmodalChunk]: result = [] - for c in chunk: + for c in chunk.data: if isinstance(c, str): result += chunk_text(c, chunk_size, OVERLAP_TOKENS) else: @@ -48,12 +59,12 @@ def break_chunk( def embed_text( - chunks: list[list[extract.MulitmodalChunk]], + chunks: list[extract.DataChunk], model: str = settings.TEXT_EMBEDDING_MODEL, input_type: Literal["document", "query"] = "document", chunk_size: int = DEFAULT_CHUNK_TOKENS, ) -> list[Vector]: - chunked_chunks = [break_chunk(chunk, chunk_size) for chunk in chunks] + chunked_chunks = [break_chunk(chunk, chunk_size) for chunk in chunks if chunk.data] if not any(chunked_chunks): return [] @@ -61,12 +72,12 @@ def embed_text( def embed_mixed( - items: list[list[extract.MulitmodalChunk]], + items: list[extract.DataChunk], model: str = settings.MIXED_EMBEDDING_MODEL, input_type: Literal["document", "query"] = "document", chunk_size: int = DEFAULT_CHUNK_TOKENS, ) -> list[Vector]: - chunked_chunks = [break_chunk(item, chunk_size) for item in items] + chunked_chunks = [break_chunk(item, chunk_size) for item in items if item.data] return embed_chunks(chunked_chunks, model, input_type) diff --git a/src/memory/common/extract.py b/src/memory/common/extract.py index 18880eb..3df07df 100644 --- a/src/memory/common/extract.py +++ b/src/memory/common/extract.py @@ -6,7 
+6,7 @@ import tempfile from contextlib import contextmanager from typing import Any, Generator, Sequence, cast -from memory.common import chunker +from memory.common import chunker, summarizer import pymupdf # PyMuPDF import pypandoc from PIL import Image @@ -16,6 +16,16 @@ logger = logging.getLogger(__name__) MulitmodalChunk = Image.Image | str +def merge_metadata(*metadata: dict[str, Any]) -> dict[str, Any]: + final = {} + for m in metadata: + data = m.copy() + if tags := set(data.pop("tags", []) or []): + final["tags"] = tags | final.get("tags", set()) + final |= data + return final + + @dataclass class DataChunk: data: Sequence[MulitmodalChunk] @@ -109,7 +119,9 @@ def extract_image(content: bytes | str | pathlib.Path) -> list[DataChunk]: def extract_text( - content: bytes | str | pathlib.Path, chunk_size: int | None = None + content: bytes | str | pathlib.Path, + chunk_size: int | None = None, + metadata: dict[str, Any] = {}, ) -> list[DataChunk]: if isinstance(content, pathlib.Path): content = content.read_text() @@ -117,8 +129,20 @@ def extract_text( content = content.decode("utf-8") content = cast(str, content) - chunks = chunker.chunk_text(content, chunk_size or chunker.DEFAULT_CHUNK_TOKENS) - return [DataChunk(data=[c], mime_type="text/plain") for c in chunks if c.strip()] + chunks = [ + DataChunk(data=[c], modality="text", metadata=metadata) + for c in chunker.chunk_text(content, chunk_size or chunker.DEFAULT_CHUNK_TOKENS) + ] + if content and len(content) > chunker.DEFAULT_CHUNK_TOKENS * 2: + summary, tags = summarizer.summarize(content) + chunks.append( + DataChunk( + data=[summary], + metadata=merge_metadata(metadata, {"tags": tags}), + modality="text", + ) + ) + return chunks def extract_data_chunks( diff --git a/src/memory/common/qdrant.py b/src/memory/common/qdrant.py index 106ee22..5e0121e 100644 --- a/src/memory/common/qdrant.py +++ b/src/memory/common/qdrant.py @@ -3,7 +3,7 @@ from typing import Any, cast, Generator, Sequence import qdrant_client from qdrant_client.http import models as qdrant_models -from qdrant_client.http.exceptions import UnexpectedResponse +from qdrant_client.http.exceptions import UnexpectedResponse, ApiException from memory.common import settings from memory.common.collections import ALL_COLLECTIONS, Collection, DistanceType, Vector @@ -193,14 +193,18 @@ def delete_points( collection_name: Name of the collection ids: List of vector IDs to delete """ - client.delete( - collection_name=collection_name, - points_selector=qdrant_models.PointIdsList( - points=ids, # type: ignore - ), - ) + try: + client.delete( + collection_name=collection_name, + points_selector=qdrant_models.PointIdsList( + points=ids, # type: ignore + ), + ) - logger.debug(f"Deleted {len(ids)} vectors from {collection_name}") + logger.debug(f"Deleted {len(ids)} vectors from {collection_name}") + except (ApiException, UnexpectedResponse) as e: + logger.error(f"Error deleting points from {collection_name}: {e}") + raise IOError(f"Error deleting points from {collection_name}: {e}") def get_collection_info( diff --git a/src/memory/common/summarizer.py b/src/memory/common/summarizer.py index 8d6fff6..981ced6 100644 --- a/src/memory/common/summarizer.py +++ b/src/memory/common/summarizer.py @@ -1,5 +1,6 @@ import json import logging +import traceback from typing import Any from memory.common import settings, chunker @@ -131,6 +132,7 @@ def summarize(content: str, target_tokens: int | None = None) -> tuple[str, list summary = result.get("summary", "") tags = result.get("tags", []) except 
Exception as e:
+        traceback.print_exc()
         logger.error(f"Summarization failed: {e}")
 
     tokens = chunker.approx_token_count(summary)
diff --git a/src/memory/workers/tasks/ebook.py b/src/memory/workers/tasks/ebook.py
index 189f243..16cafdd 100644
--- a/src/memory/workers/tasks/ebook.py
+++ b/src/memory/workers/tasks/ebook.py
@@ -60,6 +60,7 @@ def section_processor(
             end_page=section.end_page,
             parent_section_id=None,  # Will be set after flush
             content=content,
+            filename=book.file_path,
             size=len(content),
             mime_type="text/plain",
             sha256=create_content_hash(
diff --git a/src/memory/workers/tasks/maintenance.py b/src/memory/workers/tasks/maintenance.py
index 998de9b..c831b11 100644
--- a/src/memory/workers/tasks/maintenance.py
+++ b/src/memory/workers/tasks/maintenance.py
@@ -21,6 +21,7 @@ REINGEST_MISSING_CHUNKS = f"{MAINTENANCE_ROOT}.reingest_missing_chunks"
 REINGEST_CHUNK = f"{MAINTENANCE_ROOT}.reingest_chunk"
 REINGEST_ITEM = f"{MAINTENANCE_ROOT}.reingest_item"
 REINGEST_EMPTY_SOURCE_ITEMS = f"{MAINTENANCE_ROOT}.reingest_empty_source_items"
+REINGEST_ALL_EMPTY_SOURCE_ITEMS = f"{MAINTENANCE_ROOT}.reingest_all_empty_source_items"
 UPDATE_METADATA_FOR_SOURCE_ITEMS = (
     f"{MAINTENANCE_ROOT}.update_metadata_for_source_items"
 )
@@ -76,9 +77,9 @@ def reingest_chunk(chunk_id: str, collection: str):
         data = chunk.data
 
         if collection in collections.MULTIMODAL_COLLECTIONS:
-            vector = embedding.embed_mixed(data)[0]
-        elif len(data) == 1 and isinstance(data[0], str):
-            vector = embedding.embed_text([data[0]])[0]
+            vector = embedding.embed_mixed([extract.DataChunk(data=data)])[0]
+        elif collection in collections.TEXT_COLLECTIONS:
+            vector = embedding.embed_text([extract.DataChunk(data=data)])[0]
         else:
             raise ValueError(f"Unsupported data type for collection {collection}")
 
@@ -123,7 +124,10 @@ def reingest_item(item_id: str, item_type: str):
         chunk_ids = [str(c.id) for c in item.chunks if c.id]
         if chunk_ids:
             client = qdrant.get_qdrant_client()
-            qdrant.delete_points(client, item.modality, chunk_ids)
+            try:
+                qdrant.delete_points(client, item.modality, chunk_ids)
+            except IOError as e:
+                logger.error(f"Error deleting chunks for {item_id}: {e}")
 
         for chunk in item.chunks:
             session.delete(chunk)
@@ -151,6 +155,13 @@ def reingest_empty_source_items(item_type: str):
     return {"status": "success", "items": len(item_ids)}
 
 
+@app.task(name=REINGEST_ALL_EMPTY_SOURCE_ITEMS)
+def reingest_all_empty_source_items():
+    logger.info("Reingesting all empty source items")
+    for item_type in SourceItem.registry._class_registry.keys():
+        reingest_empty_source_items.delay(item_type)  # type: ignore
+
+
 def check_batch(batch: Sequence[Chunk]) -> dict:
     client = qdrant.get_qdrant_client()
     by_collection = defaultdict(list)
diff --git a/tests/conftest.py b/tests/conftest.py
index aae9b51..e90eec2 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -234,8 +234,10 @@ def mock_voyage_client():
     def embeder(chunks, *args, **kwargs):
         return Mock(embeddings=[[0.1] * 1024] * len(chunks))
 
+    real_client = voyageai.Client
     with patch.object(voyageai, "Client", autospec=True) as mock_client:
         client = mock_client()
+        client.real_client = real_client
         client.embed = embeder
         client.multimodal_embed = embeder
         yield client
@@ -251,7 +253,7 @@ def mock_openai_client():
             choices=[
                 Mock(
                     message=Mock(
-                        content='{"summary": "test", "tags": ["tag1", "tag2"]}'
+                        content='{"summary": "test summary", "tags": ["tag1", "tag2"]}'
                     )
                 )
             ]
@@ -267,7 +269,9 @@ def mock_anthropic_client():
         client.messages = Mock()
         client.messages.create = Mock(
             return_value=Mock(
-                content=[Mock(text='{"summary": "test", "tags": ["tag1", 
"tag2"]}')] + content=[ + Mock(text='{"summary": "test summary", "tags": ["tag1", "tag2"]}') + ] ) ) yield client diff --git a/tests/data/code_complexity.jpg b/tests/data/code_complexity.jpg new file mode 100644 index 0000000..0d0cb0e Binary files /dev/null and b/tests/data/code_complexity.jpg differ diff --git a/tests/data/contents.py b/tests/data/contents.py new file mode 100644 index 0000000..44ce152 --- /dev/null +++ b/tests/data/contents.py @@ -0,0 +1,249 @@ +import hashlib +import pathlib +from bs4 import BeautifulSoup +from markdownify import markdownify +from PIL import Image + +DATA_DIR = pathlib.Path(__file__).parent + +SAMPLE_HTML = f""" + + +

+<html>
+<body>
+
+<h1>The Evolution of Programming Languages</h1>
+
+<p>Programming languages have undergone tremendous evolution since the early days of computing.
+ From the machine code and assembly languages of the 1940s to the high-level, expressive languages
+ we use today, each generation has built upon the lessons learned from its predecessors. Languages
+ like FORTRAN and COBOL pioneered the concept of human-readable code, while later innovations like
+ object-oriented programming in languages such as Smalltalk and C++ revolutionized how we structure
+ and organize our programs.</p>
+
+<img src="{DATA_DIR}/lang_timeline.png" alt="Timeline of programming language evolution">
+
+<p>The rise of functional programming paradigms has brought mathematical rigor and immutability
+ to the forefront of software development. Languages like Haskell, Lisp, and more recently Rust
+ and Elm have demonstrated the power of pure functions and type systems in creating more reliable
+ and maintainable code. These paradigms emphasize the elimination of side effects and the treatment
+ of computation as the evaluation of mathematical functions.</p>
+
+<p>Modern development has also seen the emergence of domain-specific languages and the resurgence
+ of interest in memory safety. The advent of languages like Python and JavaScript has democratized
+ programming by lowering the barrier to entry, while systems languages like Rust have proven that
+ performance and safety need not be mutually exclusive. The ongoing development of WebAssembly
+ promises to bring high-performance computing to web browsers in ways previously unimaginable.</p>
+
+<img src="{DATA_DIR}/code_complexity.jpg" alt="Visual representation of code complexity over time">
+
+<p>Looking toward the future, we see emerging trends in quantum programming languages, AI-assisted
+ code generation, and the continued evolution toward more expressive type systems. The challenge
+ for tomorrow's language designers will be balancing expressiveness with simplicity, performance
+ with safety, and innovation with backward compatibility. As computing continues to permeate every
+ aspect of human life, the languages we use to command these machines will undoubtedly continue
+ to evolve and shape the digital landscape.</p>
+
+<p>The emergence of cloud computing and distributed systems has also driven new paradigms in
+ language design. Languages like Go and Elixir have been specifically crafted to excel in
+ concurrent and distributed environments, while the rise of microservices has renewed interest
+ in polyglot programming approaches. These developments reflect a broader shift toward languages
+ that are not just powerful tools for individual developers, but robust foundations for building
+ scalable, resilient systems that can handle the demands of modern internet-scale applications.</p>
+
+<p>Perhaps most intriguingly, the intersection of programming languages with artificial intelligence
+ is opening entirely new frontiers. Differentiable programming languages are enabling new forms of
+ machine learning research, while large language models are beginning to reshape how we think about
+ code generation and developer tooling. As we stand on the brink of an era where AI systems may
+ become active participants in the programming process itself, the very nature of what constitutes
+ a programming language—and who or what programs in it—may be fundamentally transformed.</p>
+
+</body>
+</html>
+"""
+
+SECOND_PAGE = """
+<div>
+
+<h2>The Impact of Open Source on Language Development</h2>
+
+<p>The open source movement has fundamentally transformed how programming languages are developed,
+ distributed, and evolved. Unlike the proprietary languages of earlier decades, modern language
+ development often occurs in public repositories where thousands of contributors can participate
+ in the design process. Languages like Python, JavaScript, and Rust have benefited enormously
+ from this collaborative approach, with their ecosystems growing rapidly through community-driven
+ package managers and extensive third-party libraries.</p>
+
+<p>This democratization of language development has led to faster innovation cycles and more
+ responsive adaptation to developer needs. When a language feature proves problematic or a new
+ paradigm emerges, open source languages can quickly incorporate changes through their community
+ governance processes. The result has been an unprecedented period of language experimentation
+ and refinement, where ideas can be tested, refined, and adopted across multiple language
+ communities simultaneously.</p>
+
+<p>Furthermore, the open source model has enabled the rise of domain-specific languages that
+ might never have been commercially viable under traditional development models. From specialized
+ query languages for databases to configuration management tools, the low barrier to entry for
+ language creation has fostered an explosion of linguistic diversity in computing, each tool
+ optimized for specific problem domains and user communities.</p>
+
+<p>The collaborative nature of open source development has also revolutionized language tooling
+ and developer experience. Modern languages benefit from rich ecosystems of editors, debuggers,
+ profilers, and static analysis tools, all developed by passionate communities who understand
+ the daily challenges faced by practitioners. This has created a virtuous cycle where better
+ tooling attracts more developers, who in turn contribute improvements that make the language
+ even more accessible and powerful.</p>
+
+<p>Version control systems like Git have enabled unprecedented transparency in language evolution,
+ allowing developers to trace the reasoning behind every design decision through detailed commit
+ histories and issue discussions. This historical record serves not only as documentation but as
+ a learning resource for future language designers, helping them understand the trade-offs and
+ considerations that shaped successful language features.</p>
+
+<p>The economic implications of open source language development cannot be overstated. By removing
+ licensing barriers and vendor lock-in, open source languages have democratized access to powerful
+ programming tools across the globe. This has enabled innovation in regions and sectors that might
+ otherwise have been excluded from the software revolution, fostering a truly global community of
+ software creators and problem solvers.</p>
+
+</div>
+""" + +CHUNKS: list[str] = [ + """The Evolution of Programming Languages +====================================== +Programming languages have undergone tremendous evolution since the early days of computing. + From the machine code and assembly languages of the 1940s to the high\\-level, expressive languages + we use today, each generation has built upon the lessons learned from its predecessors. Languages + like FORTRAN and COBOL pioneered the concept of human\\-readable code, while later innovations like + object\\-oriented programming in languages such as Smalltalk and C\\+\\+ revolutionized how we structure + and organize our programs. +![Timeline of programming language evolution](/Users/dan/code/memory/tests/data/lang_timeline.png) +The rise of functional programming paradigms has brought mathematical rigor and immutability + to the forefront of software development. Languages like Haskell, Lisp, and more recently Rust + and Elm have demonstrated the power of pure functions and type systems in creating more reliable + and maintainable code. These paradigms emphasize the elimination of side effects and the treatment + of computation as the evaluation of mathematical functions. +Modern development has also seen the emergence of domain\\-specific languages and the resurgence + of interest in memory safety. The advent of languages like Python and JavaScript has democratized + programming by lowering the barrier to entry, while systems languages like Rust have proven that + performance and safety need not be mutually exclusive. The ongoing development of WebAssembly + promises to bring high\\-performance computing to web browsers in ways previously unimaginable. +![Visual representation of code complexity over time](/Users/dan/code/memory/tests/data/code_complexity.jpg) +Looking toward the future, we see emerging trends in quantum programming languages, AI\\-assisted + code generation, and the continued evolution toward more expressive type systems. The challenge + for tomorrow's language designers will be balancing expressiveness with simplicity, performance + with safety, and innovation with backward compatibility. As computing continues to permeate every + aspect of human life, the languages we use to command these machines will undoubtedly continue + to evolve and shape the digital landscape.""", + """ +As computing continues to permeate every + aspect of human life, the languages we use to command these machines will undoubtedly continue + to evolve and shape the digital landscape. +The emergence of cloud computing and distributed systems has also driven new paradigms in + language design. Languages like Go and Elixir have been specifically crafted to excel in + concurrent and distributed environments, while the rise of microservices has renewed interest + in polyglot programming approaches. These developments reflect a broader shift toward languages + that are not just powerful tools for individual developers, but robust foundations for building + scalable, resilient systems that can handle the demands of modern internet\\-scale applications. +Perhaps most intriguingly, the intersection of programming languages with artificial intelligence + is opening entirely new frontiers. Differentiable programming languages are enabling new forms of + machine learning research, while large language models are beginning to reshape how we think about + code generation and developer tooling. 
As we stand on the brink of an era where AI systems may + become active participants in the programming process itself, the very nature of what constitutes + a programming language—and who or what programs in it—may be fundamentally transformed.""", +] +TWO_PAGE_CHUNKS: list[str] = [ + """ +The Evolution of Programming Languages +====================================== +Programming languages have undergone tremendous evolution since the early days of computing. + From the machine code and assembly languages of the 1940s to the high\-level, expressive languages + we use today, each generation has built upon the lessons learned from its predecessors. Languages + like FORTRAN and COBOL pioneered the concept of human\-readable code, while later innovations like + object\-oriented programming in languages such as Smalltalk and C\+\+ revolutionized how we structure + and organize our programs. +![Timeline of programming language evolution](/Users/dan/code/memory/tests/data/lang_timeline.png) +The rise of functional programming paradigms has brought mathematical rigor and immutability + to the forefront of software development. Languages like Haskell, Lisp, and more recently Rust + and Elm have demonstrated the power of pure functions and type systems in creating more reliable + and maintainable code. These paradigms emphasize the elimination of side effects and the treatment + of computation as the evaluation of mathematical functions. +Modern development has also seen the emergence of domain\-specific languages and the resurgence + of interest in memory safety. The advent of languages like Python and JavaScript has democratized + programming by lowering the barrier to entry, while systems languages like Rust have proven that + performance and safety need not be mutually exclusive. The ongoing development of WebAssembly + promises to bring high\-performance computing to web browsers in ways previously unimaginable. +![Visual representation of code complexity over time](/Users/dan/code/memory/tests/data/code_complexity.jpg) +Looking toward the future, we see emerging trends in quantum programming languages, AI\-assisted + code generation, and the continued evolution toward more expressive type systems. The challenge + for tomorrow's language designers will be balancing expressiveness with simplicity, performance + with safety, and innovation with backward compatibility. As computing continues to permeate every + aspect of human life, the languages we use to command these machines will undoubtedly continue + to evolve and shape the digital landscape. +""", + """ +As computing continues to permeate every + aspect of human life, the languages we use to command these machines will undoubtedly continue + to evolve and shape the digital landscape. +The emergence of cloud computing and distributed systems has also driven new paradigms in + language design. Languages like Go and Elixir have been specifically crafted to excel in + concurrent and distributed environments, while the rise of microservices has renewed interest + in polyglot programming approaches. These developments reflect a broader shift toward languages + that are not just powerful tools for individual developers, but robust foundations for building + scalable, resilient systems that can handle the demands of modern internet\-scale applications. +Perhaps most intriguingly, the intersection of programming languages with artificial intelligence + is opening entirely new frontiers. 
Differentiable programming languages are enabling new forms of + machine learning research, while large language models are beginning to reshape how we think about + code generation and developer tooling. As we stand on the brink of an era where AI systems may + become active participants in the programming process itself, the very nature of what constitutes + a programming language—and who or what programs in it—may be fundamentally transformed. +The Impact of Open Source on Language Development +------------------------------------------------- +The open source movement has fundamentally transformed how programming languages are developed, + distributed, and evolved. Unlike the proprietary languages of earlier decades, modern language + development often occurs in public repositories where thousands of contributors can participate + in the design process. Languages like Python, JavaScript, and Rust have benefited enormously + from this collaborative approach, with their ecosystems growing rapidly through community\-driven + package managers and extensive third\-party libraries. +This democratization of language development has led to faster innovation cycles and more + responsive adaptation to developer needs. When a language feature proves problematic or a new + paradigm emerges, open source languages can quickly incorporate changes through their community + governance processes. The result has been an unprecedented period of language experimentation + and refinement, where ideas can be tested, refined, and adopted across multiple language + communities simultaneously.""", + """ +The result has been an unprecedented period of language experimentation + and refinement, where ideas can be tested, refined, and adopted across multiple language + communities simultaneously. +Furthermore, the open source model has enabled the rise of domain\-specific languages that + might never have been commercially viable under traditional development models. From specialized + query languages for databases to configuration management tools, the low barrier to entry for + language creation has fostered an explosion of linguistic diversity in computing, each tool + optimized for specific problem domains and user communities. +The collaborative nature of open source development has also revolutionized language tooling + and developer experience. Modern languages benefit from rich ecosystems of editors, debuggers, + profilers, and static analysis tools, all developed by passionate communities who understand + the daily challenges faced by practitioners. This has created a virtuous cycle where better + tooling attracts more developers, who in turn contribute improvements that make the language + even more accessible and powerful. +Version control systems like Git have enabled unprecedented transparency in language evolution, + allowing developers to trace the reasoning behind every design decision through detailed commit + histories and issue discussions. This historical record serves not only as documentation but as + a learning resource for future language designers, helping them understand the trade\-offs and + considerations that shaped successful language features. +The economic implications of open source language development cannot be overstated. By removing + licensing barriers and vendor lock\-in, open source languages have democratized access to powerful + programming tools across the globe. 
This has enabled innovation in regions and sectors that might + otherwise have been excluded from the software revolution, fostering a truly global community of + software creators and problem solvers. +""", +] + +SAMPLE_MARKDOWN = markdownify(SAMPLE_HTML) +SAMPLE_TEXT = BeautifulSoup(SAMPLE_HTML, "html.parser").get_text() +SECOND_PAGE_MARKDOWN = markdownify(SECOND_PAGE) +SECOND_PAGE_TEXT = BeautifulSoup(SECOND_PAGE, "html.parser").get_text() + + +def image_hash(image: Image.Image) -> str: + return hashlib.sha256(image.tobytes()).hexdigest() + + +LANG_TIMELINE = Image.open(DATA_DIR / "lang_timeline.png") +CODE_COMPLEXITY = Image.open(DATA_DIR / "code_complexity.jpg") +LANG_TIMELINE_HASH = image_hash(LANG_TIMELINE) +CODE_COMPLEXITY_HASH = image_hash(CODE_COMPLEXITY) diff --git a/tests/data/lang_timeline.png b/tests/data/lang_timeline.png new file mode 100644 index 0000000..5201eb9 Binary files /dev/null and b/tests/data/lang_timeline.png differ diff --git a/tests/integration/test_real_queries.py b/tests/integration/test_real_queries.py new file mode 100644 index 0000000..36f0415 --- /dev/null +++ b/tests/integration/test_real_queries.py @@ -0,0 +1,1118 @@ +import hashlib +import itertools +from datetime import datetime +from unittest.mock import patch + +import pytest +import voyageai + +import memory.common.qdrant as qdrant_tools +from memory.common import extract +from memory.common.db.models.source_item import SourceItem +from memory.common.db.models.source_items import ( + AgentObservation, +) +from memory.common.embedding import embed_source_item, embed_text +from memory.workers.tasks.content_processing import push_to_qdrant +from tests.data.contents import SAMPLE_MARKDOWN + + +@pytest.fixture +def real_voyage_client(mock_voyage_client): + real_client = mock_voyage_client.real_client + with patch.object(voyageai, "Client", real_client): + yield real_client + + +def test_real_source_item_embeddings(real_voyage_client, qdrant): + item = SourceItem( + id=1, + content=SAMPLE_MARKDOWN, + mime_type="text/html", + modality="text", + sha256=hashlib.sha256(SAMPLE_MARKDOWN.encode("utf-8")).hexdigest(), + size=len(SAMPLE_MARKDOWN), + tags=["bla"], + embed_status="QUEUED", + ) + + part1, part2, summary = embed_source_item(item) + part1.id = "00000000-0000-0000-0000-000000000000" # type: ignore + part2.id = "00000000-0000-0000-0000-000000000001" # type: ignore + summary.id = "00000000-0000-0000-0000-000000000002" # type: ignore + push_to_qdrant([item]) + + queries = { + "how have programming languages changed?": [0.6756747, 0.6319432, 0.26348075], + "evolution of programming languages since 1940s": [0.690, 0.594, 0.330], + "functional programming paradigms and immutability": [0.507, 0.412, 0.276], + "memory safety in systems programming languages": [0.487, 0.458, 0.348], + "FORTRAN and COBOL pioneering human-readable code": [0.535, 0.458, 0.296], + "Rust and type systems for reliable code": [0.585, 0.506, 0.456], + "WebAssembly high-performance web computing": [0.469, 0.426, 0.296], + "object-oriented programming innovations": [0.510, 0.492, 0.333], + "cloud computing and distributed systems": [0.40005407, 0.56048, 0.37348732], + "AI-assisted code generation trends": [0.51078045, 0.5828345, 0.31309962], + "microservices and polyglot programming": [0.5072756, 0.63991153, 0.38507754], + "Python JavaScript democratizing programming": [0.524, 0.517, 0.320], + "software development methodologies": [0.454, 0.440, 0.356], + "computer science history": [0.517, 0.454, 0.299], + "programming paradigms 
comparison": [0.589, 0.525, 0.352], + "developer tools and ecosystems": [0.42297083, 0.52246743, 0.39521465], + "modern computing trends": [0.5172996, 0.5883902, 0.30886292], + "database query languages": [0.47420773, 0.48987937, 0.41980737], + "network programming protocols": [0.3547029, 0.42228842, 0.39325726], + "machine learning algorithms": [0.39660394, 0.47512275, 0.45423454], + "web browser technologies": [0.467, 0.449, 0.439], + "software architecture patterns": [0.4430701, 0.4969077, 0.3775082], + "mobile app user interface design": [0.2754, 0.332, 0.3863], + "cybersecurity threat detection": [0.3436677, 0.38349956, 0.36111486], + "project management methodologies": [0.3377, 0.34, 0.3573], + "cooking Italian pasta recipes": [0.2627, 0.2388, 0.3065], + "professional basketball statistics": [0.2811, 0.2454, 0.3411], + "gardening tips for beginners": [0.2953, 0.2848, 0.3309], + "travel destinations in Europe": [0.2595, 0.2514, 0.3039], + "classical music composers": [0.3066, 0.2838, 0.3173], + } + for query, (p1, p2, s) in queries.items(): + search_vector = embed_text( + [extract.DataChunk(data=[query])], input_type="query" + )[0] + results = qdrant_tools.search_vectors(qdrant, "text", search_vector) + expected = sorted( + [ + (part1.id, p1), + (part2.id, p2), + (summary.id, s), + ], + key=lambda x: x[1], + reverse=True, + ) + assert [(r.id, pytest.approx(r.score, abs=0.1)) for r in results] == expected + + +EXPECTED_OBSERVATION_RESULTS = { + "What does the user think about functional programming?": { + "semantic": [ + ( + 0.7104, + "The user believes functional programming leads to better code quality", + ), + (0.6792, "I prefer functional programming over OOP"), + ( + 0.6772, + "Subject: programming_philosophy | Type: belief | Observation: The user believes functional programming leads to better code quality | Quote: Functional programming produces more maintainable code", + ), + ( + 0.6677, + "Subject: programming_paradigms | Type: preference | Observation: The user prefers functional programming over OOP | Quote: I prefer functional programming over OOP", + ), + ], + "temporal": [ + ( + 0.5816, + "Time: 12:00 on Wednesday (afternoon) | Subject: programming_philosophy | Observation: The user believes functional programming leads to better code quality | Confidence: 0.8", + ), + ( + 0.5246, + "Time: 12:00 on Wednesday (afternoon) | Subject: programming_paradigms | Observation: The user prefers functional programming over OOP | Confidence: 0.8", + ), + ( + 0.5214, + "Time: 12:00 on Wednesday (afternoon) | Subject: pure_functions | Observation: The user said pure functions are yucky | Confidence: 0.8", + ), + ( + 0.4645, + "Time: 12:00 on Wednesday (afternoon) | Subject: refactoring | Observation: The user always refactors to pure functions | Confidence: 0.8", + ), + ], + }, + "Does the user prefer functional or object-oriented programming?": { + "semantic": [ + (0.7718, "The user prefers functional programming over OOP"), + ( + 0.754, + "Subject: programming_paradigms | Type: preference | Observation: The user prefers functional programming over OOP | Quote: I prefer functional programming over OOP", + ), + (0.7454, "I prefer functional programming over OOP"), + ( + 0.6541, + "Subject: programming_philosophy | Type: belief | Observation: The user believes functional programming leads to better code quality | Quote: Functional programming produces more maintainable code", + ), + ], + "temporal": [ + ( + 0.6188, + "Time: 12:00 on Wednesday (afternoon) | Subject: programming_paradigms 
| Observation: The user prefers functional programming over OOP | Confidence: 0.8", + ), + ( + 0.5902, + "Time: 12:00 on Wednesday (afternoon) | Subject: programming_philosophy | Observation: The user believes functional programming leads to better code quality | Confidence: 0.8", + ), + ( + 0.5144, + "Time: 12:00 on Wednesday (afternoon) | Subject: pure_functions | Observation: The user said pure functions are yucky | Confidence: 0.8", + ), + ( + 0.4989, + "Time: 12:00 on Wednesday (afternoon) | Subject: refactoring | Observation: The user always refactors to pure functions | Confidence: 0.8", + ), + ], + }, + "What are the user's beliefs about code quality?": { + "semantic": [ + (0.6925, "The user believes code reviews are essential for quality"), + ( + 0.68, + "The user believes functional programming leads to better code quality", + ), + ( + 0.6524, + "Subject: code_quality | Type: belief | Observation: The user believes code reviews are essential for quality | Quote: Code reviews catch bugs that automated testing misses", + ), + ( + 0.6466, + "Subject: programming_philosophy | Type: belief | Observation: The user believes functional programming leads to better code quality | Quote: Functional programming produces more maintainable code", + ), + ], + "temporal": [ + ( + 0.5544, + "Time: 12:00 on Wednesday (afternoon) | Subject: code_quality | Observation: The user believes code reviews are essential for quality | Confidence: 0.8", + ), + ( + 0.5397, + "Time: 12:00 on Wednesday (afternoon) | Subject: programming_philosophy | Observation: The user believes functional programming leads to better code quality | Confidence: 0.8", + ), + ( + 0.4931, + "Time: 12:00 on Wednesday (afternoon) | Subject: testing_philosophy | Observation: The user believes unit tests are a waste of time for prototypes | Confidence: 0.8", + ), + ( + 0.4674, + "Time: 12:00 on Wednesday (afternoon) | Subject: pure_functions | Observation: The user said pure functions are yucky | Confidence: 0.8", + ), + ], + }, + "How does the user approach debugging code?": { + "semantic": [ + ( + 0.7011, + "Subject: debugging_approach | Type: behavior | Observation: The user debugs by adding print statements rather than using a debugger | Quote: When debugging, I just add console.log everywhere", + ), + ( + 0.6962, + "The user debugs by adding print statements rather than using a debugger", + ), + (0.6788, "When debugging, I just add console.log everywhere"), + ( + 0.5357, + "Subject: code_quality | Type: belief | Observation: The user believes code reviews are essential for quality | Quote: Code reviews catch bugs that automated testing misses", + ), + ], + "temporal": [ + ( + 0.6252, + "Time: 12:00 on Wednesday (afternoon) | Subject: debugging_approach | Observation: The user debugs by adding print statements rather than using a debugger | Confidence: 0.8", + ), + ( + 0.476, + "Time: 12:00 on Wednesday (afternoon) | Subject: indentation_preference | Observation: The user claims to prefer tabs but their code uses spaces | Confidence: 0.8", + ), + ( + 0.4424, + "Time: 12:00 on Wednesday (afternoon) | Subject: version_control_style | Observation: The user prefers small, focused commits over large feature branches | Confidence: 0.8", + ), + ( + 0.4402, + "Time: 12:00 on Wednesday (afternoon) | Subject: testing_philosophy | Observation: The user believes unit tests are a waste of time for prototypes | Confidence: 0.8", + ), + ], + }, + "What are the user's git and version control habits?": { + "semantic": [ + ( + 0.6474, + "Subject: 
version_control_style | Type: preference | Observation: The user prefers small, focused commits over large feature branches | Quote: I like to commit small, logical changes frequently", + ), + (0.6424, "I like to commit small, logical changes frequently"), + ( + 0.5961, + "The user prefers small, focused commits over large feature branches", + ), + ( + 0.5806, + "Subject: git_habits | Type: behavior | Observation: The user writes commit messages in present tense | Quote: Fix bug in parser instead of Fixed bug in parser", + ), + ], + "temporal": [ + ( + 0.6174, + "Time: 12:00 on Wednesday (afternoon) | Subject: version_control_style | Observation: The user prefers small, focused commits over large feature branches | Confidence: 0.8", + ), + ( + 0.5733, + "Time: 12:00 on Wednesday (afternoon) | Subject: git_habits | Observation: The user writes commit messages in present tense | Confidence: 0.8", + ), + ( + 0.4848, + "Time: 12:00 on Wednesday (afternoon) | Subject: editor_preference | Observation: The user prefers Vim over VS Code for editing | Confidence: 0.8", + ), + ( + 0.4604, + "Time: 12:00 on Wednesday (afternoon) | Subject: indentation_preference | Observation: The user claims to prefer tabs but their code uses spaces | Confidence: 0.8", + ), + ], + }, + "When does the user prefer to work?": { + "semantic": [ + (0.6806, "The user prefers working late at night"), + ( + 0.6792, + "Subject: work_schedule | Type: behavior | Observation: The user prefers working late at night | Quote: I do my best coding between 10pm and 2am", + ), + (0.6439, "I do my best coding between 10pm and 2am"), + (0.5528, "I use 25-minute work intervals with 5-minute breaks"), + ], + "temporal": [ + ( + 0.7023, + "Time: 12:00 on Wednesday (afternoon) | Subject: work_schedule | Observation: The user prefers working late at night | Confidence: 0.8", + ), + ( + 0.6395, + "Time: 12:00 on Wednesday (afternoon) | Subject: domain_preference | Observation: The user prefers working on backend systems over frontend UI | Confidence: 0.8", + ), + ( + 0.6375, + "Time: 12:00 on Wednesday (afternoon) | Subject: work_environment | Observation: The user thinks remote work is more productive than office work | Confidence: 0.8", + ), + ( + 0.6254, + "Time: 12:00 on Wednesday (afternoon) | Subject: collaboration_preference | Observation: The user prefers pair programming for complex problems | Confidence: 0.8", + ), + ], + }, + "How does the user handle productivity and time management?": { + "semantic": [ + ( + 0.579, + "Subject: productivity_methods | Type: behavior | Observation: The user takes breaks every 25 minutes using the Pomodoro technique | Quote: I use 25-minute work intervals with 5-minute breaks", + ), + (0.5731, "I use 25-minute work intervals with 5-minute breaks"), + ( + 0.5284, + "The user takes breaks every 25 minutes using the Pomodoro technique", + ), + (0.5153, "I do my best coding between 10pm and 2am"), + ], + "temporal": [ + ( + 0.5705, + "Time: 12:00 on Wednesday (afternoon) | Subject: productivity_methods | Observation: The user takes breaks every 25 minutes using the Pomodoro technique | Confidence: 0.8", + ), + ( + 0.5023, + "Time: 12:00 on Wednesday (afternoon) | Subject: work_environment | Observation: The user thinks remote work is more productive than office work | Confidence: 0.8", + ), + ( + 0.4631, + "Time: 12:00 on Wednesday (afternoon) | Subject: work_schedule | Observation: The user prefers working late at night | Confidence: 0.8", + ), + ( + 0.4626, + "Time: 12:00 on Wednesday (afternoon) | 
Subject: documentation_habits | Observation: The user always writes documentation before implementing features | Confidence: 0.8", + ), + ], + }, + "What editor does the user prefer?": { + "semantic": [ + ( + 0.6394, + "Subject: editor_preference | Type: preference | Observation: The user prefers Vim over VS Code for editing | Quote: Vim makes me more productive than any modern editor", + ), + (0.6241, "The user prefers Vim over VS Code for editing"), + (0.5528, "Vim makes me more productive than any modern editor"), + (0.4887, "The user claims to prefer tabs but their code uses spaces"), + ], + "temporal": [ + ( + 0.5701, + "Time: 12:00 on Wednesday (afternoon) | Subject: editor_preference | Observation: The user prefers Vim over VS Code for editing | Confidence: 0.8", + ), + ( + 0.4557, + "Time: 12:00 on Wednesday (afternoon) | Subject: indentation_preference | Observation: The user claims to prefer tabs but their code uses spaces | Confidence: 0.8", + ), + ( + 0.4322, + "Time: 12:00 on Wednesday (afternoon) | Subject: domain_preference | Observation: The user prefers working on backend systems over frontend UI | Confidence: 0.8", + ), + ( + 0.4283, + "Time: 12:00 on Wednesday (afternoon) | Subject: database_preference | Observation: The user prefers PostgreSQL over MongoDB for most applications | Confidence: 0.8", + ), + ], + }, + "What databases does the user like to use?": { + "semantic": [ + ( + 0.6328, + "Subject: database_preference | Type: preference | Observation: The user prefers PostgreSQL over MongoDB for most applications | Quote: Relational databases handle complex queries better than document stores", + ), + (0.5992, "The user prefers PostgreSQL over MongoDB for most applications"), + ( + 0.5352, + "Subject: domain_preference | Type: preference | Observation: The user prefers working on backend systems over frontend UI | Quote: I find backend logic more interesting than UI work", + ), + (0.5186, "The user prefers working on backend systems over frontend UI"), + ], + "temporal": [ + ( + 0.5599, + "Time: 12:00 on Wednesday (afternoon) | Subject: database_preference | Observation: The user prefers PostgreSQL over MongoDB for most applications | Confidence: 0.8", + ), + ( + 0.4617, + "Time: 12:00 on Wednesday (afternoon) | Subject: domain_preference | Observation: The user prefers working on backend systems over frontend UI | Confidence: 0.8", + ), + ( + 0.4445, + "Time: 12:00 on Wednesday (afternoon) | Subject: primary_languages | Observation: The user primarily works with Python and JavaScript | Confidence: 0.8", + ), + ( + 0.4365, + "Time: 12:00 on Wednesday (afternoon) | Subject: editor_preference | Observation: The user prefers Vim over VS Code for editing | Confidence: 0.8", + ), + ], + }, + "What programming languages does the user work with?": { + "semantic": [ + (0.7255, "The user primarily works with Python and JavaScript"), + (0.6954, "Most of my work is in Python backend and React frontend"), + ( + 0.6874, + "Subject: primary_languages | Type: general | Observation: The user primarily works with Python and JavaScript | Quote: Most of my work is in Python backend and React frontend", + ), + (0.6098, "I'm picking up Rust on weekends"), + ], + "temporal": [ + ( + 0.5939, + "Time: 12:00 on Wednesday (afternoon) | Subject: primary_languages | Observation: The user primarily works with Python and JavaScript | Confidence: 0.8", + ), + ( + 0.4679, + "Time: 12:00 on Wednesday (afternoon) | Subject: experience_level | Observation: The user has 8 years of professional 
programming experience | Confidence: 0.8", + ), + ( + 0.4623, + "Time: 12:00 on Wednesday (afternoon) | Subject: learning_activities | Observation: The user is currently learning Rust in their spare time | Confidence: 0.8", + ), + ( + 0.4514, + "Time: 12:00 on Wednesday (afternoon) | Subject: programming_philosophy | Observation: The user believes functional programming leads to better code quality | Confidence: 0.8", + ), + ], + }, + "What is the user's programming experience level?": { + "semantic": [ + (0.6664, "The user has 8 years of professional programming experience"), + ( + 0.6565, + "Subject: experience_level | Type: general | Observation: The user has 8 years of professional programming experience | Quote: I've been coding professionally for 8 years", + ), + (0.5949, "I've been coding professionally for 8 years"), + (0.5641, "The user is currently learning Rust in their spare time"), + ], + "temporal": [ + ( + 0.5991, + "Time: 12:00 on Wednesday (afternoon) | Subject: experience_level | Observation: The user has 8 years of professional programming experience | Confidence: 0.8", + ), + ( + 0.5041, + "Time: 12:00 on Wednesday (afternoon) | Subject: primary_languages | Observation: The user primarily works with Python and JavaScript | Confidence: 0.8", + ), + ( + 0.4917, + "Time: 12:00 on Wednesday (afternoon) | Subject: programming_philosophy | Observation: The user believes functional programming leads to better code quality | Confidence: 0.8", + ), + ( + 0.4817, + "Time: 12:00 on Wednesday (afternoon) | Subject: programming_paradigms | Observation: The user prefers functional programming over OOP | Confidence: 0.8", + ), + ], + }, + "Where did the user study computer science?": { + "semantic": [ + (0.6863, "I studied CS at Stanford"), + (0.649, "The user graduated with a Computer Science degree from Stanford"), + ( + 0.6344, + "Subject: education_background | Type: general | Observation: The user graduated with a Computer Science degree from Stanford | Quote: I studied CS at Stanford", + ), + (0.4592, "The user is currently learning Rust in their spare time"), + ], + "temporal": [ + ( + 0.5455, + "Time: 12:00 on Wednesday (afternoon) | Subject: education_background | Observation: The user graduated with a Computer Science degree from Stanford | Confidence: 0.8", + ), + ( + 0.3842, + "Time: 12:00 on Wednesday (afternoon) | Subject: experience_level | Observation: The user has 8 years of professional programming experience | Confidence: 0.8", + ), + ( + 0.3792, + "Time: 12:00 on Wednesday (afternoon) | Subject: primary_languages | Observation: The user primarily works with Python and JavaScript | Confidence: 0.8", + ), + ( + 0.3781, + "Time: 12:00 on Wednesday (afternoon) | Subject: learning_activities | Observation: The user is currently learning Rust in their spare time | Confidence: 0.8", + ), + ], + }, + "What kind of company does the user work at?": { + "semantic": [ + (0.6308, "The user works at a mid-size startup with 50 employees"), + ( + 0.5371, + "Subject: company_size | Type: general | Observation: The user works at a mid-size startup with 50 employees | Quote: Our company has about 50 people", + ), + (0.5253, "Most of my work is in Python backend and React frontend"), + (0.4902, "I've been coding professionally for 8 years"), + ], + "temporal": [ + ( + 0.5309, + "Time: 12:00 on Wednesday (afternoon) | Subject: company_size | Observation: The user works at a mid-size startup with 50 employees | Confidence: 0.8", + ), + ( + 0.4329, + "Time: 12:00 on Wednesday (afternoon) 
| Subject: work_environment | Observation: The user thinks remote work is more productive than office work | Confidence: 0.8", + ), + ( + 0.4323, + "Time: 12:00 on Wednesday (afternoon) | Subject: education_background | Observation: The user graduated with a Computer Science degree from Stanford | Confidence: 0.8", + ), + ( + 0.419, + "Time: 12:00 on Wednesday (afternoon) | Subject: work_schedule | Observation: The user prefers working late at night | Confidence: 0.8", + ), + ], + }, + "What does the user think about AI replacing programmers?": { + "semantic": [ + ( + 0.5965, + "Subject: ai_future | Type: belief | Observation: The user thinks AI will replace most software developers within 10 years | Quote: AI will make most programmers obsolete by 2035", + ), + ( + 0.572, + "The user thinks AI will replace most software developers within 10 years", + ), + (0.5715, "AI will make most programmers obsolete by 2035"), + ( + 0.4344, + "The user believes functional programming leads to better code quality", + ), + ], + "temporal": [ + ( + 0.4629, + "Time: 12:00 on Wednesday (afternoon) | Subject: ai_future | Observation: The user thinks AI will replace most software developers within 10 years | Confidence: 0.8", + ), + ( + 0.362, + "Time: 12:00 on Wednesday (afternoon) | Subject: programming_philosophy | Observation: The user believes functional programming leads to better code quality | Confidence: 0.8", + ), + ( + 0.3308, + "Time: 12:00 on Wednesday (afternoon) | Subject: testing_philosophy | Observation: The user believes unit tests are a waste of time for prototypes | Confidence: 0.8", + ), + ( + 0.328, + "Time: 12:00 on Wednesday (afternoon) | Subject: typescript_opinion | Observation: The user now says they love TypeScript but previously called it verbose | Confidence: 0.8", + ), + ], + }, + "What are the user's views on artificial intelligence?": { + "semantic": [ + ( + 0.5885, + "Subject: ai_future | Type: belief | Observation: The user thinks AI will replace most software developers within 10 years | Quote: AI will make most programmers obsolete by 2035", + ), + ( + 0.5661, + "The user thinks AI will replace most software developers within 10 years", + ), + (0.5133, "AI will make most programmers obsolete by 2035"), + (0.4927, "I find backend logic more interesting than UI work"), + ], + "temporal": [ + ( + 0.5399, + "Time: 12:00 on Wednesday (afternoon) | Subject: ai_future | Observation: The user thinks AI will replace most software developers within 10 years | Confidence: 0.8", + ), + ( + 0.4353, + "Time: 12:00 on Wednesday (afternoon) | Subject: programming_philosophy | Observation: The user believes functional programming leads to better code quality | Confidence: 0.8", + ), + ( + 0.4223, + "Time: 12:00 on Wednesday (afternoon) | Subject: humans | Observation: The user thinks that all men must die. 
| Confidence: 0.8", + ), + ( + 0.4219, + "Time: 12:00 on Wednesday (afternoon) | Subject: pure_functions | Observation: The user said pure functions are yucky | Confidence: 0.8", + ), + ], + }, + "Has the user changed their mind about TypeScript?": { + "semantic": [ + ( + 0.6174, + "The user now says they love TypeScript but previously called it verbose", + ), + ( + 0.5757, + "Subject: typescript_opinion | Type: contradiction | Observation: The user now says they love TypeScript but previously called it verbose | Quote: TypeScript has too much boilerplate vs TypeScript makes my code so much cleaner", + ), + ( + 0.4924, + "TypeScript has too much boilerplate vs TypeScript makes my code so much cleaner", + ), + (0.4157, "The user always refactors to pure functions"), + ], + "temporal": [ + ( + 0.5631, + "Time: 12:00 on Wednesday (afternoon) | Subject: typescript_opinion | Observation: The user now says they love TypeScript but previously called it verbose | Confidence: 0.8", + ), + ( + 0.4016, + "Time: 12:00 on Wednesday (afternoon) | Subject: indentation_preference | Observation: The user claims to prefer tabs but their code uses spaces | Confidence: 0.8", + ), + ( + 0.3827, + "Time: 12:00 on Wednesday (afternoon) | Subject: primary_languages | Observation: The user primarily works with Python and JavaScript | Confidence: 0.8", + ), + ( + 0.3825, + "Time: 12:00 on Wednesday (afternoon) | Subject: editor_preference | Observation: The user prefers Vim over VS Code for editing | Confidence: 0.8", + ), + ], + }, + "Are there any contradictions in the user's preferences?": { + "semantic": [ + (0.536, "The user claims to prefer tabs but their code uses spaces"), + ( + 0.5353, + "Subject: indentation_preference | Type: contradiction | Observation: The user claims to prefer tabs but their code uses spaces | Quote: Tabs are better than spaces vs code consistently uses 2-space indentation", + ), + ( + 0.5321, + "Subject: pure_functions | Type: contradiction | Observation: The user said pure functions are yucky | Quote: Pure functions are yucky", + ), + ( + 0.5058, + "Subject: typescript_opinion | Type: contradiction | Observation: The user now says they love TypeScript but previously called it verbose | Quote: TypeScript has too much boilerplate vs TypeScript makes my code so much cleaner", + ), + ], + "temporal": [ + ( + 0.4763, + "Time: 12:00 on Wednesday (afternoon) | Subject: indentation_preference | Observation: The user claims to prefer tabs but their code uses spaces | Confidence: 0.8", + ), + ( + 0.4693, + "Time: 12:00 on Wednesday (afternoon) | Subject: domain_preference | Observation: The user prefers working on backend systems over frontend UI | Confidence: 0.8", + ), + ( + 0.4681, + "Time: 12:00 on Wednesday (afternoon) | Subject: pure_functions | Observation: The user said pure functions are yucky | Confidence: 0.8", + ), + ( + 0.4586, + "Time: 12:00 on Wednesday (afternoon) | Subject: typescript_opinion | Observation: The user now says they love TypeScript but previously called it verbose | Confidence: 0.8", + ), + ], + }, + "What does the user think about software testing?": { + "semantic": [ + ( + 0.6386, + "Subject: testing_philosophy | Type: belief | Observation: The user believes unit tests are a waste of time for prototypes | Quote: Writing tests for throwaway code slows development", + ), + (0.6222, "The user believes unit tests are a waste of time for prototypes"), + ( + 0.6152, + "Subject: code_quality | Type: belief | Observation: The user believes code reviews are essential 
for quality | Quote: Code reviews catch bugs that automated testing misses", + ), + (0.6036, "The user believes code reviews are essential for quality"), + ], + "temporal": [ + ( + 0.5881, + "Time: 12:00 on Wednesday (afternoon) | Subject: testing_philosophy | Observation: The user believes unit tests are a waste of time for prototypes | Confidence: 0.8", + ), + ( + 0.5074, + "Time: 12:00 on Wednesday (afternoon) | Subject: code_quality | Observation: The user believes code reviews are essential for quality | Confidence: 0.8", + ), + ( + 0.4863, + "Time: 12:00 on Wednesday (afternoon) | Subject: programming_philosophy | Observation: The user believes functional programming leads to better code quality | Confidence: 0.8", + ), + ( + 0.4748, + "Time: 12:00 on Wednesday (afternoon) | Subject: debugging_approach | Observation: The user debugs by adding print statements rather than using a debugger | Confidence: 0.8", + ), + ], + }, + "How does the user approach documentation?": { + "semantic": [ + ( + 0.5966, + "Subject: documentation_habits | Type: behavior | Observation: The user always writes documentation before implementing features | Quote: I document the API design before writing any code", + ), + ( + 0.5473, + "The user always writes documentation before implementing features", + ), + (0.5207, "I document the API design before writing any code"), + ( + 0.4954, + "Subject: debugging_approach | Type: behavior | Observation: The user debugs by adding print statements rather than using a debugger | Quote: When debugging, I just add console.log everywhere", + ), + ], + "temporal": [ + ( + 0.4988, + "Time: 12:00 on Wednesday (afternoon) | Subject: documentation_habits | Observation: The user always writes documentation before implementing features | Confidence: 0.8", + ), + ( + 0.4335, + "Time: 12:00 on Wednesday (afternoon) | Subject: indentation_preference | Observation: The user claims to prefer tabs but their code uses spaces | Confidence: 0.8", + ), + ( + 0.4316, + "Time: 12:00 on Wednesday (afternoon) | Subject: debugging_approach | Observation: The user debugs by adding print statements rather than using a debugger | Confidence: 0.8", + ), + ( + 0.4307, + "Time: 12:00 on Wednesday (afternoon) | Subject: domain_preference | Observation: The user prefers working on backend systems over frontend UI | Confidence: 0.8", + ), + ], + }, + "What are the user's collaboration preferences?": { + "semantic": [ + ( + 0.651, + "Subject: collaboration_preference | Type: preference | Observation: The user prefers pair programming for complex problems | Quote: Two heads are better than one when solving hard problems", + ), + (0.5848, "The user prefers pair programming for complex problems"), + ( + 0.5355, + "Subject: version_control_style | Type: preference | Observation: The user prefers small, focused commits over large feature branches | Quote: I like to commit small, logical changes frequently", + ), + ( + 0.5216, + "Subject: domain_preference | Type: preference | Observation: The user prefers working on backend systems over frontend UI | Quote: I find backend logic more interesting than UI work", + ), + ], + "temporal": [ + ( + 0.6027, + "Time: 12:00 on Wednesday (afternoon) | Subject: collaboration_preference | Observation: The user prefers pair programming for complex problems | Confidence: 0.8", + ), + ( + 0.5101, + "Time: 12:00 on Wednesday (afternoon) | Subject: version_control_style | Observation: The user prefers small, focused commits over large feature branches | Confidence: 0.8", + ), + 
( + 0.482, + "Time: 12:00 on Wednesday (afternoon) | Subject: domain_preference | Observation: The user prefers working on backend systems over frontend UI | Confidence: 0.8", + ), + ( + 0.4782, + "Time: 12:00 on Wednesday (afternoon) | Subject: work_environment | Observation: The user thinks remote work is more productive than office work | Confidence: 0.8", + ), + ], + }, + "What does the user think about remote work?": { + "semantic": [ + (0.7063, "The user thinks remote work is more productive than office work"), + ( + 0.6583, + "Subject: work_environment | Type: belief | Observation: The user thinks remote work is more productive than office work | Quote: I get more done working from home", + ), + (0.6032, "I get more done working from home"), + (0.4997, "The user prefers working on backend systems over frontend UI"), + ], + "temporal": [ + ( + 0.5934, + "Time: 12:00 on Wednesday (afternoon) | Subject: work_environment | Observation: The user thinks remote work is more productive than office work | Confidence: 0.8", + ), + ( + 0.4173, + "Time: 12:00 on Wednesday (afternoon) | Subject: work_schedule | Observation: The user prefers working late at night | Confidence: 0.8", + ), + ( + 0.4148, + "Time: 12:00 on Wednesday (afternoon) | Subject: collaboration_preference | Observation: The user prefers pair programming for complex problems | Confidence: 0.8", + ), + ( + 0.4121, + "Time: 12:00 on Wednesday (afternoon) | Subject: testing_philosophy | Observation: The user believes unit tests are a waste of time for prototypes | Confidence: 0.8", + ), + ], + }, + "What are the user's productivity methods?": { + "semantic": [ + ( + 0.5723, + "Subject: productivity_methods | Type: behavior | Observation: The user takes breaks every 25 minutes using the Pomodoro technique | Quote: I use 25-minute work intervals with 5-minute breaks", + ), + ( + 0.5261, + "The user takes breaks every 25 minutes using the Pomodoro technique", + ), + (0.5205, "I use 25-minute work intervals with 5-minute breaks"), + (0.5107, "The user thinks remote work is more productive than office work"), + ], + "temporal": [ + ( + 0.5427, + "Time: 12:00 on Wednesday (afternoon) | Subject: productivity_methods | Observation: The user takes breaks every 25 minutes using the Pomodoro technique | Confidence: 0.8", + ), + ( + 0.4743, + "Time: 12:00 on Wednesday (afternoon) | Subject: work_environment | Observation: The user thinks remote work is more productive than office work | Confidence: 0.8", + ), + ( + 0.4299, + "Time: 12:00 on Wednesday (afternoon) | Subject: collaboration_preference | Observation: The user prefers pair programming for complex problems | Confidence: 0.8", + ), + ( + 0.4227, + "Time: 12:00 on Wednesday (afternoon) | Subject: version_control_style | Observation: The user prefers small, focused commits over large feature branches | Confidence: 0.8", + ), + ], + }, + "What technical skills is the user learning?": { + "semantic": [ + (0.5765, "The user is currently learning Rust in their spare time"), + ( + 0.5502, + "Subject: learning_activities | Type: general | Observation: The user is currently learning Rust in their spare time | Quote: I'm picking up Rust on weekends", + ), + (0.5411, "I'm picking up Rust on weekends"), + (0.5155, "The user primarily works with Python and JavaScript"), + ], + "temporal": [ + ( + 0.5301, + "Time: 12:00 on Wednesday (afternoon) | Subject: learning_activities | Observation: The user is currently learning Rust in their spare time | Confidence: 0.8", + ), + ( + 0.4913, + "Time: 
12:00 on Wednesday (afternoon) | Subject: primary_languages | Observation: The user primarily works with Python and JavaScript | Confidence: 0.8", + ), + ( + 0.481, + "Time: 12:00 on Wednesday (afternoon) | Subject: experience_level | Observation: The user has 8 years of professional programming experience | Confidence: 0.8", + ), + ( + 0.4558, + "Time: 12:00 on Wednesday (afternoon) | Subject: education_background | Observation: The user graduated with a Computer Science degree from Stanford | Confidence: 0.8", + ), + ], + }, + "What does the user think about cooking?": { + "semantic": [ + (0.4888, "I find backend logic more interesting than UI work"), + (0.4624, "The user prefers working on backend systems over frontend UI"), + ( + 0.4551, + "The user believes functional programming leads to better code quality", + ), + (0.4547, "The user said pure functions are yucky"), + ], + "temporal": [ + ( + 0.3812, + "Time: 12:00 on Wednesday (afternoon) | Subject: pure_functions | Observation: The user said pure functions are yucky | Confidence: 0.8", + ), + ( + 0.3773, + "Time: 12:00 on Wednesday (afternoon) | Subject: programming_philosophy | Observation: The user believes functional programming leads to better code quality | Confidence: 0.8", + ), + ( + 0.3686, + "Time: 12:00 on Wednesday (afternoon) | Subject: typescript_opinion | Observation: The user now says they love TypeScript but previously called it verbose | Confidence: 0.8", + ), + ( + 0.3649, + "Time: 12:00 on Wednesday (afternoon) | Subject: domain_preference | Observation: The user prefers working on backend systems over frontend UI | Confidence: 0.8", + ), + ], + }, + "What are the user's travel preferences?": { + "semantic": [ + ( + 0.522, + "Subject: domain_preference | Type: preference | Observation: The user prefers working on backend systems over frontend UI | Quote: I find backend logic more interesting than UI work", + ), + (0.5145, "The user prefers functional programming over OOP"), + (0.5079, "The user prefers working on backend systems over frontend UI"), + (0.5045, "The user prefers working late at night"), + ], + "temporal": [ + ( + 0.4849, + "Time: 12:00 on Wednesday (afternoon) | Subject: domain_preference | Observation: The user prefers working on backend systems over frontend UI | Confidence: 0.8", + ), + ( + 0.4779, + "Time: 12:00 on Wednesday (afternoon) | Subject: database_preference | Observation: The user prefers PostgreSQL over MongoDB for most applications | Confidence: 0.8", + ), + ( + 0.4659, + "Time: 12:00 on Wednesday (afternoon) | Subject: collaboration_preference | Observation: The user prefers pair programming for complex problems | Confidence: 0.8", + ), + ( + 0.4639, + "Time: 12:00 on Wednesday (afternoon) | Subject: programming_paradigms | Observation: The user prefers functional programming over OOP | Confidence: 0.8", + ), + ], + }, + "What music does the user like?": { + "semantic": [ + ( + 0.4927, + "Subject: domain_preference | Type: preference | Observation: The user prefers working on backend systems over frontend UI | Quote: I find backend logic more interesting than UI work", + ), + (0.4906, "The user prefers working late at night"), + (0.4904, "The user prefers functional programming over OOP"), + (0.4894, "The user primarily works with Python and JavaScript"), + ], + "temporal": [ + ( + 0.4674, + "Time: 12:00 on Wednesday (afternoon) | Subject: typescript_opinion | Observation: The user now says they love TypeScript but previously called it verbose | Confidence: 0.8", + ), + ( + 
0.4548, + "Time: 12:00 on Wednesday (afternoon) | Subject: primary_languages | Observation: The user primarily works with Python and JavaScript | Confidence: 0.8", + ), + ( + 0.4518, + "Time: 12:00 on Wednesday (afternoon) | Subject: programming_paradigms | Observation: The user prefers functional programming over OOP | Confidence: 0.8", + ), + ( + 0.4496, + "Time: 12:00 on Wednesday (afternoon) | Subject: editor_preference | Observation: The user prefers Vim over VS Code for editing | Confidence: 0.8", + ), + ], + }, +} + + +def test_real_observation_embeddings(real_voyage_client, qdrant): + beliefs = [ + ("The user thinks that all men must die.", "All humans are mortal.", "humans"), + ( + "The user believes functional programming leads to better code quality", + "Functional programming produces more maintainable code", + "programming_philosophy", + ), + ( + "The user thinks AI will replace most software developers within 10 years", + "AI will make most programmers obsolete by 2035", + "ai_future", + ), + ( + "The user believes code reviews are essential for quality", + "Code reviews catch bugs that automated testing misses", + "code_quality", + ), + ( + "The user thinks remote work is more productive than office work", + "I get more done working from home", + "work_environment", + ), + ( + "The user believes unit tests are a waste of time for prototypes", + "Writing tests for throwaway code slows development", + "testing_philosophy", + ), + ] + + behaviors = [ + ( + "The user always refactors to pure functions", + "I always refactor to pure functions", + "refactoring", + ), + ( + "The user writes commit messages in present tense", + "Fix bug in parser instead of Fixed bug in parser", + "git_habits", + ), + ( + "The user prefers working late at night", + "I do my best coding between 10pm and 2am", + "work_schedule", + ), + ( + "The user always writes documentation before implementing features", + "I document the API design before writing any code", + "documentation_habits", + ), + ( + "The user debugs by adding print statements rather than using a debugger", + "When debugging, I just add console.log everywhere", + "debugging_approach", + ), + ( + "The user takes breaks every 25 minutes using the Pomodoro technique", + "I use 25-minute work intervals with 5-minute breaks", + "productivity_methods", + ), + ] + + contradictions = [ + ( + "The user said pure functions are yucky", + "Pure functions are yucky", + "pure_functions", + ), + ( + "The user now says they love TypeScript but previously called it verbose", + "TypeScript has too much boilerplate vs TypeScript makes my code so much cleaner", + "typescript_opinion", + ), + ( + "The user claims to prefer tabs but their code uses spaces", + "Tabs are better than spaces vs code consistently uses 2-space indentation", + "indentation_preference", + ), + ] + + preferences = [ + ( + "The user prefers functional programming over OOP", + "I prefer functional programming over OOP", + "programming_paradigms", + ), + ( + "The user prefers Vim over VS Code for editing", + "Vim makes me more productive than any modern editor", + "editor_preference", + ), + ( + "The user prefers working on backend systems over frontend UI", + "I find backend logic more interesting than UI work", + "domain_preference", + ), + ( + "The user prefers small, focused commits over large feature branches", + "I like to commit small, logical changes frequently", + "version_control_style", + ), + ( + "The user prefers PostgreSQL over MongoDB for most applications", + "Relational 
databases handle complex queries better than document stores", + "database_preference", + ), + ( + "The user prefers pair programming for complex problems", + "Two heads are better than one when solving hard problems", + "collaboration_preference", + ), + ] + + general = [ + ("The user is a human", "The user is a human", "humans"), + ( + "The user has 8 years of professional programming experience", + "I've been coding professionally for 8 years", + "experience_level", + ), + ( + "The user primarily works with Python and JavaScript", + "Most of my work is in Python backend and React frontend", + "primary_languages", + ), + ( + "The user works at a mid-size startup with 50 employees", + "Our company has about 50 people", + "company_size", + ), + ( + "The user graduated with a Computer Science degree from Stanford", + "I studied CS at Stanford", + "education_background", + ), + ( + "The user is currently learning Rust in their spare time", + "I'm picking up Rust on weekends", + "learning_activities", + ), + ] + + ids = itertools.count(1) + items = [ + AgentObservation( + id=next(ids), + content=content, + mime_type="text/html", + modality="observation", + sha256=hashlib.sha256(content.encode("utf-8")).hexdigest(), + size=len(content), + tags=["bla"], + observation_type=observation_type, + subject=subject, + confidence=0.8, + evidence={ + "quote": quote, + "source": "https://en.wikipedia.org/wiki/Human", + }, + agent_model="gpt-4o", + inserted_at=datetime(2025, 1, 1, 12, 0, 0), + embed_status="QUEUED", + ) + for observation_type, observations in [ + ("belief", beliefs), + ("behavior", behaviors), + ("contradiction", contradictions), + ("preference", preferences), + ("general", general), + ] + for content, quote, subject in observations + ] + + for item in items: + embed_source_item(item) + push_to_qdrant(items) + + chunk_map = {str(c.id): c for item in items for c in item.chunks} + + def get_top(vector, search_type: str) -> list[tuple[float, str]]: + results = qdrant_tools.search_vectors(qdrant, search_type, vector) + return [ + (round(i.score, 4), chunk_map[str(i.id)].content) + for i in sorted(results, key=lambda x: x.score, reverse=True) + ][:4] + + for query, expected in EXPECTED_OBSERVATION_RESULTS.items(): + search_vector = embed_text( + [extract.DataChunk(data=[query])], input_type="query" + )[0] + semantic_results = get_top(search_vector, "semantic") + temporal_results = get_top(search_vector, "temporal") + assert semantic_results == expected["semantic"] + assert temporal_results == expected["temporal"] diff --git a/tests/memory/common/db/models/test_source_item.py b/tests/memory/common/db/models/test_source_item.py index 5e12a6c..3759ce2 100644 --- a/tests/memory/common/db/models/test_source_item.py +++ b/tests/memory/common/db/models/test_source_item.py @@ -1,23 +1,17 @@ from sqlalchemy.orm import Session -from unittest.mock import patch, Mock +from unittest.mock import patch from typing import cast import pytest from PIL import Image -from datetime import datetime from memory.common import settings, chunker, extract -from memory.common.db.models.sources import Book from memory.common.db.models.source_items import ( Chunk, MailMessage, - EmailAttachment, - BookSection, - BlogPost, ) from memory.common.db.models.source_item import ( SourceItem, image_filenames, add_pics, - merge_metadata, clean_filename, ) @@ -56,114 +50,6 @@ def test_clean_filename(input_filename, expected): assert clean_filename(input_filename) == expected -@pytest.mark.parametrize( - "dicts,expected", - [ - # 
Empty input - ([], {}), - # Single dict without tags - ([{"key": "value"}], {"key": "value"}), - # Single dict with tags as list - ( - [{"key": "value", "tags": ["tag1", "tag2"]}], - {"key": "value", "tags": {"tag1", "tag2"}}, - ), - # Single dict with tags as set - ( - [{"key": "value", "tags": {"tag1", "tag2"}}], - {"key": "value", "tags": {"tag1", "tag2"}}, - ), - # Multiple dicts without tags - ( - [{"key1": "value1"}, {"key2": "value2"}], - {"key1": "value1", "key2": "value2"}, - ), - # Multiple dicts with non-overlapping tags - ( - [ - {"key1": "value1", "tags": ["tag1"]}, - {"key2": "value2", "tags": ["tag2"]}, - ], - {"key1": "value1", "key2": "value2", "tags": {"tag1", "tag2"}}, - ), - # Multiple dicts with overlapping tags - ( - [ - {"key1": "value1", "tags": ["tag1", "tag2"]}, - {"key2": "value2", "tags": ["tag2", "tag3"]}, - ], - {"key1": "value1", "key2": "value2", "tags": {"tag1", "tag2", "tag3"}}, - ), - # Overlapping keys - later dict wins - ( - [ - {"key": "value1", "other": "data1"}, - {"key": "value2", "another": "data2"}, - ], - {"key": "value2", "other": "data1", "another": "data2"}, - ), - # Mixed tags types (list and set) - ( - [ - {"key1": "value1", "tags": ["tag1", "tag2"]}, - {"key2": "value2", "tags": {"tag3", "tag4"}}, - ], - { - "key1": "value1", - "key2": "value2", - "tags": {"tag1", "tag2", "tag3", "tag4"}, - }, - ), - # Empty tags - ( - [{"key": "value", "tags": []}, {"key2": "value2", "tags": []}], - {"key": "value", "key2": "value2"}, - ), - # None values - ( - [{"key1": None, "key2": "value"}, {"key3": None}], - {"key1": None, "key2": "value", "key3": None}, - ), - # Complex nested structures - ( - [ - {"nested": {"inner": "value1"}, "list": [1, 2, 3], "tags": ["tag1"]}, - {"nested": {"inner": "value2"}, "list": [4, 5], "tags": ["tag2"]}, - ], - {"nested": {"inner": "value2"}, "list": [4, 5], "tags": {"tag1", "tag2"}}, - ), - # Boolean and numeric values - ( - [ - {"bool": True, "int": 42, "float": 3.14, "tags": ["numeric"]}, - {"bool": False, "int": 100}, - ], - {"bool": False, "int": 100, "float": 3.14, "tags": {"numeric"}}, - ), - # Three or more dicts - ( - [ - {"a": 1, "tags": ["t1"]}, - {"b": 2, "tags": ["t2", "t3"]}, - {"c": 3, "a": 10, "tags": ["t3", "t4"]}, - ], - {"a": 10, "b": 2, "c": 3, "tags": {"t1", "t2", "t3", "t4"}}, - ), - # Dict with only tags - ([{"tags": ["tag1", "tag2"]}], {"tags": {"tag1", "tag2"}}), - # Empty dicts - ([{}, {}], {}), - # Mix of empty and non-empty dicts - ( - [{}, {"key": "value", "tags": ["tag"]}, {}], - {"key": "value", "tags": {"tag"}}, - ), - ], -) -def test_merge_metadata(dicts, expected): - assert merge_metadata(*dicts) == expected - - def test_image_filenames_with_existing_filenames(tmp_path): """Test image_filenames when images already have filenames""" chunk_id = "test_chunk_123" diff --git a/tests/memory/common/db/models/test_source_item_embeddings.py b/tests/memory/common/db/models/test_source_item_embeddings.py new file mode 100644 index 0000000..4fa92dd --- /dev/null +++ b/tests/memory/common/db/models/test_source_item_embeddings.py @@ -0,0 +1,626 @@ +import hashlib +from datetime import datetime +from typing import Sequence, cast +from unittest.mock import ANY, Mock, call + +import pymupdf # PyMuPDF +import pytest + +from memory.common import settings +from memory.common.db.models.source_item import Chunk, SourceItem +from memory.common.db.models.source_items import ( + AgentObservation, + BlogPost, + BookSection, + Comic, + EmailAttachment, + ForumPost, + MailMessage, +) +from 
memory.common.db.models.sources import Book
+from memory.common.embedding import embed_source_item
+from memory.common.extract import page_to_image
+from tests.data.contents import (
+    CHUNKS,
+    DATA_DIR,
+    LANG_TIMELINE_HASH,
+    SAMPLE_MARKDOWN,
+    SAMPLE_TEXT,
+    image_hash,
+)
+
+
+def compare_chunks(
+    chunks: Sequence[Chunk],
+    expected: Sequence[tuple[str | None, list[str], dict]],
+):
+    """Compare chunks by their content, image hashes and metadata."""
+    data = [
+        (c.content, [image_hash(i) for i in c.images], c.item_metadata) for c in chunks
+    ]
+    assert data == expected
+
+
+def test_base_source_item_text_embeddings(mock_voyage_client):
+    """Text-only items embed two content chunks plus a summary via the text model."""
+    item = SourceItem(
+        id=1,
+        content=SAMPLE_MARKDOWN,
+        mime_type="text/html",
+        modality="text",
+        sha256=hashlib.sha256(SAMPLE_MARKDOWN.encode("utf-8")).hexdigest(),
+        size=len(SAMPLE_MARKDOWN),
+        tags=["bla"],
+    )
+    metadata = item.as_payload()
+    metadata["tags"] = {"bla"}
+    expected = [
+        (CHUNKS[0].strip(), cast(list[str], []), metadata),
+        (CHUNKS[1].strip(), cast(list[str], []), metadata),
+        ("test summary", [], metadata | {"tags": {"tag1", "tag2", "bla"}}),
+    ]
+
+    mock_voyage_client.embed = Mock(return_value=Mock(embeddings=[[0.1] * 1024] * 3))
+    mock_voyage_client.multimodal_embed = Mock(
+        return_value=Mock(embeddings=[[0.1] * 1024] * 3)
+    )
+    compare_chunks(item.data_chunks(), expected)
+    compare_chunks(embed_source_item(item), expected)
+
+    assert mock_voyage_client.embed.call_count == 1
+    assert not mock_voyage_client.multimodal_embed.call_count
+
+    assert mock_voyage_client.embed.call_args == call(
+        [CHUNKS[0].strip(), CHUNKS[1].strip(), "test summary"],
+        model=settings.TEXT_EMBEDDING_MODEL,
+        input_type="document",
+    )
+
+
+def test_base_source_item_mixed_embeddings(mock_voyage_client):
+    """Items with an image also get an image-only chunk via the multimodal model."""
+    item = SourceItem(
+        id=1,
+        content=SAMPLE_MARKDOWN,
+        filename=DATA_DIR / "lang_timeline.png",
+        mime_type="image/png",
+        modality="photo",
+        sha256=hashlib.sha256(SAMPLE_MARKDOWN.encode("utf-8")).hexdigest(),
+        size=len(SAMPLE_MARKDOWN),
+        tags=["bla"],
+    )
+    metadata = item.as_payload()
+    metadata["tags"] = {"bla"}
+    expected = [
+        (CHUNKS[0].strip(), [], metadata),
+        (CHUNKS[1].strip(), [], metadata),
+        ("test summary", [], metadata | {"tags": {"tag1", "tag2", "bla"}}),
+        (None, [LANG_TIMELINE_HASH], {"size": 3465, "source_id": 1, "tags": {"bla"}}),
+    ]
+
+    mock_voyage_client.embed = Mock(return_value=Mock(embeddings=[[0.1] * 1024] * 3))
+    mock_voyage_client.multimodal_embed = Mock(
+        return_value=Mock(embeddings=[[0.1] * 1024] * 3)
+    )
+    compare_chunks(item.data_chunks(), expected)
+    compare_chunks(embed_source_item(item), expected)
+
+    assert mock_voyage_client.embed.call_count == 1
+    assert mock_voyage_client.multimodal_embed.call_count == 1
+
+    assert mock_voyage_client.embed.call_args == call(
+        [CHUNKS[0].strip(), CHUNKS[1].strip(), "test summary"],
+        model=settings.TEXT_EMBEDDING_MODEL,
+        input_type="document",
+    )
+    assert mock_voyage_client.multimodal_embed.call_args == call(
+        [[ANY]],
+        model=settings.MIXED_EMBEDDING_MODEL,
+        input_type="document",
+    )
+    assert [
+        image_hash(i) for i in mock_voyage_client.multimodal_embed.call_args[0][0][0]
+    ] == [LANG_TIMELINE_HASH]
+
+
+def test_mail_message_embeddings(mock_voyage_client):
+    """Mail messages are chunked and embedded like plain text items."""
+    item = MailMessage(
+        id=1,
+        content=SAMPLE_MARKDOWN,
+        mime_type="text/html",
+        modality="text",
+        sha256=hashlib.sha256(SAMPLE_MARKDOWN.encode("utf-8")).hexdigest(),
+        size=len(SAMPLE_MARKDOWN),
+        tags=["bla"],
+        message_id="123",
+        subject="Test Subject",
+        sender="test@example.com",
+        recipients=["test@example.com"],
+        folder="INBOX",
+        sent_at=datetime(2025, 1, 1, 12, 0, 0),
+    )
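+    # Unlike the base SourceItem tests above, the expected tags here also carry
+    # the sender address: MailMessage chunks are expected to be tagged with the
+    # sender's email alongside the item's own tags.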
+    metadata = item.as_payload()
+    metadata["tags"] = {"bla", "test@example.com"}
+    expected = [
+        (CHUNKS[0].strip(), [], metadata),
+        (CHUNKS[1].strip(), [], metadata),
+        (
+            "test summary",
+            [],
+            metadata | {"tags": {"tag1", "tag2", "bla", "test@example.com"}},
+        ),
+    ]
+
+    mock_voyage_client.embed = Mock(return_value=Mock(embeddings=[[0.1] * 1024] * 3))
+    mock_voyage_client.multimodal_embed = Mock(
+        return_value=Mock(embeddings=[[0.1] * 1024] * 3)
+    )
+    compare_chunks(item.data_chunks(), expected)
+    compare_chunks(embed_source_item(item), expected)
+
+    assert mock_voyage_client.embed.call_count == 1
+    assert not mock_voyage_client.multimodal_embed.call_count
+
+    assert mock_voyage_client.embed.call_args == call(
+        [CHUNKS[0].strip(), CHUNKS[1].strip(), "test summary"],
+        model=settings.TEXT_EMBEDDING_MODEL,
+        input_type="document",
+    )
+
+
+def test_email_attachment_embeddings_text(mock_voyage_client):
+    item = EmailAttachment(
+        id=1,
+        content=SAMPLE_MARKDOWN,
+        mime_type="text/html",
+        modality="text",
+        sha256=hashlib.sha256(SAMPLE_MARKDOWN.encode("utf-8")).hexdigest(),
+        size=len(SAMPLE_MARKDOWN),
+        tags=["bla"],
+    )
+    metadata = item.as_payload()
+    metadata["tags"] = {"bla"}
+    expected = [
+        (CHUNKS[0].strip(), cast(list[str], []), metadata),
+        (CHUNKS[1].strip(), cast(list[str], []), metadata),
+        (
+            "test summary",
+            [],
+            metadata | {"tags": {"tag1", "tag2", "bla"}},
+        ),
+    ]
+
+    mock_voyage_client.embed = Mock(return_value=Mock(embeddings=[[0.1] * 1024] * 3))
+    mock_voyage_client.multimodal_embed = Mock(
+        return_value=Mock(embeddings=[[0.1] * 1024] * 3)
+    )
+    compare_chunks(item.data_chunks(), expected)
+    compare_chunks(embed_source_item(item), expected)
+
+    assert mock_voyage_client.embed.call_count == 1
+    assert not mock_voyage_client.multimodal_embed.call_count
+
+    assert mock_voyage_client.embed.call_args == call(
+        [CHUNKS[0].strip(), CHUNKS[1].strip(), "test summary"],
+        model=settings.TEXT_EMBEDDING_MODEL,
+        input_type="document",
+    )
+
+
+def test_email_attachment_embeddings_photo(mock_voyage_client):
+    item = EmailAttachment(
+        id=1,
+        content=SAMPLE_MARKDOWN,
+        filename=DATA_DIR / "lang_timeline.png",
+        mime_type="image/png",
+        modality="photo",
+        sha256=hashlib.sha256(SAMPLE_MARKDOWN.encode("utf-8")).hexdigest(),
+        size=len(SAMPLE_MARKDOWN),
+        tags=["bla"],
+    )
+    metadata = item.as_payload()
+    metadata["tags"] = {"bla"}
+    expected = [
+        (None, [LANG_TIMELINE_HASH], metadata),
+    ]
+
+    mock_voyage_client.embed = Mock(return_value=Mock(embeddings=[[0.1] * 1024] * 3))
+    mock_voyage_client.multimodal_embed = Mock(
+        return_value=Mock(embeddings=[[0.1] * 1024] * 3)
+    )
+    compare_chunks(item.data_chunks(), expected)
+    compare_chunks(embed_source_item(item), expected)
+
+    assert mock_voyage_client.embed.call_count == 0
+    assert mock_voyage_client.multimodal_embed.call_count == 1
+
+    assert mock_voyage_client.multimodal_embed.call_args == call(
+        [[ANY]],
+        model=settings.MIXED_EMBEDDING_MODEL,
+        input_type="document",
+    )
+    assert [
+        image_hash(i) for i in mock_voyage_client.multimodal_embed.call_args[0][0][0]
+    ] == [LANG_TIMELINE_HASH]
+
+
+def test_email_attachment_embeddings_pdf(mock_voyage_client):
+    item = EmailAttachment(
+        id=1,
+        content=SAMPLE_MARKDOWN,
+        filename=DATA_DIR / "regulamin.pdf",
+        mime_type="application/pdf",
+        modality="doc",
+        sha256=hashlib.sha256(SAMPLE_MARKDOWN.encode("utf-8")).hexdigest(),
+        size=len(SAMPLE_MARKDOWN),
+        tags=["bla"],
+    )
+    metadata = item.as_payload()
+    metadata["tags"] = {"bla"}
+    with pymupdf.open(item.filename) as pdf:
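+        # Each PDF page is rendered to an image, so every page becomes its own
+        # image-only chunk, with the page number and dimensions recorded in the
+        # chunk metadata.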
+        expected = [
+            (
+                None,
+                [image_hash(page_to_image(page))],
+                metadata
+                | {
+                    "page": page.number,
+                    "width": page.rect.width,
+                    "height": page.rect.height,
+                },
+            )
+            for page in pdf.pages()
+        ]
+
+    mock_voyage_client.embed = Mock(return_value=Mock(embeddings=[[0.1] * 1024] * 3))
+    mock_voyage_client.multimodal_embed = Mock(
+        return_value=Mock(embeddings=[[0.1] * 1024] * 3)
+    )
+    compare_chunks(item.data_chunks(), expected)
+    compare_chunks(embed_source_item(item), expected)
+
+    assert mock_voyage_client.embed.call_count == 0
+    assert mock_voyage_client.multimodal_embed.call_count == 1
+
+    assert mock_voyage_client.multimodal_embed.call_args == call(
+        [[ANY], [ANY]],
+        model=settings.MIXED_EMBEDDING_MODEL,
+        input_type="document",
+    )
+    assert [
+        [image_hash(a) for a in i]
+        for i in mock_voyage_client.multimodal_embed.call_args[0][0]
+    ] == [images for _, images, _ in expected]
+
+
+def test_comic_embeddings(mock_voyage_client):
+    item = Comic(
+        id=1,
+        content=SAMPLE_MARKDOWN,
+        filename=DATA_DIR / "lang_timeline.png",
+        mime_type="image/png",
+        modality="comic",
+        sha256=hashlib.sha256(SAMPLE_MARKDOWN.encode("utf-8")).hexdigest(),
+        size=len(SAMPLE_MARKDOWN),
+        tags=["bla"],
+        title="The Evolution of Programming Languages",
+        author="John Doe",
+        published=datetime(2025, 1, 1, 12, 0, 0),
+        volume="1",
+        issue="1",
+        page=1,
+    )
+    metadata = item.as_payload()
+    metadata["tags"] = {"bla"}
+    expected = [
+        (
+            "The Evolution of Programming Languages by John Doe",
+            [LANG_TIMELINE_HASH],
+            metadata,
+        ),
+    ]
+
+    mock_voyage_client.embed = Mock(return_value=Mock(embeddings=[[0.1] * 1024] * 3))
+    mock_voyage_client.multimodal_embed = Mock(
+        return_value=Mock(embeddings=[[0.1] * 1024] * 3)
+    )
+    compare_chunks(item.data_chunks(), expected)
+    compare_chunks(embed_source_item(item), expected)
+
+    assert mock_voyage_client.embed.call_count == 0
+    assert mock_voyage_client.multimodal_embed.call_count == 1
+
+    assert mock_voyage_client.multimodal_embed.call_args == call(
+        [["The Evolution of Programming Languages by John Doe", ANY]],
+        model=settings.MIXED_EMBEDDING_MODEL,
+        input_type="document",
+    )
+    assert (
+        image_hash(mock_voyage_client.multimodal_embed.call_args[0][0][0][1])
+        == LANG_TIMELINE_HASH
+    )
+
+
+def test_book_section_embeddings_single_page(mock_voyage_client):
+    item = BookSection(
+        id=1,
+        content=SAMPLE_MARKDOWN,
+        mime_type="text/html",
+        modality="text",
+        sha256=hashlib.sha256(SAMPLE_MARKDOWN.encode("utf-8")).hexdigest(),
+        size=len(SAMPLE_MARKDOWN),
+        tags=["bla"],
+        book_id=1,
+        section_title="The Evolution of Programming Languages",
+        section_number=1,
+        section_level=1,
+        start_page=1,
+        end_page=1,
+        pages=[SAMPLE_TEXT],
+        book=Book(
+            id=1,
+            title="Programming Languages",
+            author="John Doe",
+            published=datetime(2025, 1, 1, 12, 0, 0),
+        ),
+    )
+    metadata = item.as_payload()
+    metadata["tags"] = {"bla"}
+    expected = [
+        (CHUNKS[0].strip(), cast(list[str], []), metadata | {"type": "page"}),
+        (CHUNKS[1].strip(), cast(list[str], []), metadata | {"type": "page"}),
+        (
+            "test summary",
+            [],
+            metadata | {"tags": {"tag1", "tag2", "bla"}, "type": "summary"},
+        ),
+    ]
+
+    mock_voyage_client.embed = Mock(return_value=Mock(embeddings=[[0.1] * 1024] * 3))
+    mock_voyage_client.multimodal_embed = Mock(
+        return_value=Mock(embeddings=[[0.1] * 1024] * 3)
+    )
+    compare_chunks(item.data_chunks(), expected)
+    compare_chunks(embed_source_item(item), expected)
+
+    assert mock_voyage_client.embed.call_count == 1
+    assert not mock_voyage_client.multimodal_embed.call_count
+
+    assert mock_voyage_client.embed.call_args == call(
+        [CHUNKS[0].strip(), CHUNKS[1].strip(), "test summary"],
+        model=settings.TEXT_EMBEDDING_MODEL,
+        input_type="document",
+    )
+
+
+def test_book_section_embeddings_multiple_pages(mock_voyage_client):
+    item = BookSection(
+        id=1,
+        content=SAMPLE_MARKDOWN + "\n\n" + SECOND_PAGE,
+        mime_type="text/html",
+        modality="text",
+        sha256=hashlib.sha256(SAMPLE_MARKDOWN.encode("utf-8")).hexdigest(),
+        size=len(SAMPLE_MARKDOWN),
+        tags=["bla"],
+        book_id=1,
+        section_title="The Evolution of Programming Languages",
+        section_number=1,
+        section_level=1,
+        start_page=1,
+        end_page=2,
+        pages=[SAMPLE_TEXT, SECOND_PAGE_TEXT],
+        book=Book(
+            id=1,
+            title="Programming Languages",
+            author="John Doe",
+            published=datetime(2025, 1, 1, 12, 0, 0),
+        ),
+    )
+    metadata = item.as_payload()
+    metadata["tags"] = {"bla", "tag1", "tag2"}
+    expected = [
+        (item.content.strip(), cast(list[str], []), metadata | {"type": "section"}),
+        ("test summary", [], metadata | {"type": "summary"}),
+    ]
+
+    mock_voyage_client.embed = Mock(return_value=Mock(embeddings=[[0.1] * 1024] * 3))
+    mock_voyage_client.multimodal_embed = Mock(
+        return_value=Mock(embeddings=[[0.1] * 1024] * 3)
+    )
+    compare_chunks(item.data_chunks(), expected)
+    compare_chunks(embed_source_item(item), expected)
+
+    assert mock_voyage_client.embed.call_count == 1
+    assert not mock_voyage_client.multimodal_embed.call_count
+
+    assert mock_voyage_client.embed.call_args == call(
+        [item.content.strip(), "test summary"],
+        model=settings.TEXT_EMBEDDING_MODEL,
+        input_type="document",
+    )
+
+
+@pytest.mark.parametrize(
+    "class_, modality",
+    (
+        (BlogPost, "blog"),
+        (ForumPost, "forum"),
+    ),
+)
+def test_post_embeddings_single_page(mock_voyage_client, class_, modality):
+    item = class_(
+        id=1,
+        content=SAMPLE_MARKDOWN,
+        mime_type="text/html",
+        modality=modality,
+        sha256=hashlib.sha256(SAMPLE_MARKDOWN.encode("utf-8")).hexdigest(),
+        size=len(SAMPLE_MARKDOWN),
+        tags=["bla"],
+        images=[LANG_TIMELINE.filename, CODE_COMPLEXITY.filename],  # type: ignore
+    )
+    metadata = item.as_payload()
+    metadata["tags"] = {"bla", "tag1", "tag2"}
+    expected = [
+        (item.content.strip(), [LANG_TIMELINE_HASH, CODE_COMPLEXITY_HASH], metadata),
+    ]
+
+    mock_voyage_client.embed = Mock(return_value=Mock(embeddings=[[0.1] * 1024] * 3))
+    mock_voyage_client.multimodal_embed = Mock(
+        return_value=Mock(embeddings=[[0.1] * 1024] * 3)
+    )
+    compare_chunks(item.data_chunks(), expected)
+    compare_chunks(embed_source_item(item), expected)
+
+    assert not mock_voyage_client.embed.call_count
+    assert mock_voyage_client.multimodal_embed.call_count == 1
+
+    assert mock_voyage_client.multimodal_embed.call_args == call(
+        [[item.content.strip(), ANY, ANY]],
+        model=settings.MIXED_EMBEDDING_MODEL,
+        input_type="document",
+    )
+    assert [
+        image_hash(i)
+        for i in mock_voyage_client.multimodal_embed.call_args[0][0][0][1:]
+    ] == [LANG_TIMELINE_HASH, CODE_COMPLEXITY_HASH]
+
+
+@pytest.mark.parametrize(
+    "class_, modality",
+    (
+        (BlogPost, "blog"),
+        (ForumPost, "forum"),
+    ),
+)
+def test_post_embeddings_multi_page(mock_voyage_client, class_, modality):
+    item = class_(
+        id=1,
+        content=SAMPLE_MARKDOWN + "\n\n" + SECOND_PAGE_MARKDOWN,
+        mime_type="text/html",
+        modality=modality,
+        sha256=hashlib.sha256(SAMPLE_MARKDOWN.encode("utf-8")).hexdigest(),
+        size=len(SAMPLE_MARKDOWN + SECOND_PAGE_MARKDOWN),
+        tags=["bla"],
+        images=[LANG_TIMELINE.filename, CODE_COMPLEXITY.filename],  # type: ignore
+    )
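+    # A multi-page post yields one chunk for the whole document plus one chunk
+    # per page; the images ride along with the full document and the first
+    # page's chunk, while the remaining pages and the summary stay text-only.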
+    metadata = item.as_payload()
+    metadata["tags"] = {"bla", "tag1", "tag2"}
+
+    all_contents = (
+        item.content.strip(),
+        [LANG_TIMELINE_HASH, CODE_COMPLEXITY_HASH],
+        metadata,
+    )
+    first_chunk = (
+        TWO_PAGE_CHUNKS[0].strip(),
+        [LANG_TIMELINE_HASH, CODE_COMPLEXITY_HASH],
+        metadata,
+    )
+    second_chunk = (TWO_PAGE_CHUNKS[1].strip(), [], metadata)
+    third_chunk = (TWO_PAGE_CHUNKS[2].strip(), [], metadata)
+    summary = ("test summary", [], metadata)
+
+    mock_voyage_client.embed = Mock(return_value=Mock(embeddings=[[0.1] * 1024] * 3))
+    mock_voyage_client.multimodal_embed = Mock(
+        return_value=Mock(embeddings=[[0.1] * 1024] * 3)
+    )
+    compare_chunks(
+        item.data_chunks(),
+        [all_contents, first_chunk, second_chunk, third_chunk, summary],
+    )
+    # embed_source_item first does text embedding, then mixed embedding,
+    # so the order of chunks is different than in data_chunks()
+    compare_chunks(
+        embed_source_item(item),
+        [
+            second_chunk,
+            third_chunk,
+            summary,
+            all_contents,
+            first_chunk,
+        ],
+    )
+
+    assert mock_voyage_client.embed.call_count == 1
+    assert mock_voyage_client.multimodal_embed.call_count == 1
+
+    assert mock_voyage_client.embed.call_args == call(
+        [
+            TWO_PAGE_CHUNKS[1].strip(),
+            TWO_PAGE_CHUNKS[2].strip(),
+            "test summary",
+        ],
+        model=settings.TEXT_EMBEDDING_MODEL,
+        input_type="document",
+    )
+    assert mock_voyage_client.multimodal_embed.call_args == call(
+        [[item.content.strip(), ANY, ANY], [TWO_PAGE_CHUNKS[0].strip(), ANY, ANY]],
+        model=settings.MIXED_EMBEDDING_MODEL,
+        input_type="document",
+    )
+    assert [
+        image_hash(i)
+        for i in mock_voyage_client.multimodal_embed.call_args[0][0][0][1:]
+    ] == [LANG_TIMELINE_HASH, CODE_COMPLEXITY_HASH]
+    assert [
+        image_hash(i)
+        for i in mock_voyage_client.multimodal_embed.call_args[0][0][1][1:]
+    ] == [LANG_TIMELINE_HASH, CODE_COMPLEXITY_HASH]
+
+
+def test_agent_observation_embeddings(mock_voyage_client):
+    item = AgentObservation(
+        id=1,
+        content="The user thinks that all men must die.",
+        mime_type="text/html",
+        modality="observation",
+        sha256=hashlib.sha256(SAMPLE_MARKDOWN.encode("utf-8")).hexdigest(),
+        size=len(SAMPLE_MARKDOWN),
+        tags=["bla"],
+        observation_type="belief",
+        subject="humans",
+        confidence=0.8,
+        evidence={
+            "quote": "All humans are mortal.",
+            "source": "https://en.wikipedia.org/wiki/Human",
+        },
+        agent_model="gpt-4o",
+        inserted_at=datetime(2025, 1, 1, 12, 0, 0),
+    )
+    metadata = item.as_payload()
+    metadata["tags"] = {"bla"}
+    expected = [
+        (
+            "Subject: humans | Type: belief | Observation: The user thinks that all men must die. | Quote: All humans are mortal.",
+            [],
+            metadata | {"embedding_type": "semantic"},
+        ),
+        (
+            "Time: 12:00 on Wednesday (afternoon) | Subject: humans | Observation: The user thinks that all men must die. | Confidence: 0.8",
+            [],
+            metadata | {"embedding_type": "temporal"},
+        ),
+        (
+            "The user thinks that all men must die.",
+            [],
+            metadata | {"embedding_type": "semantic"},
+        ),
+        ("All humans are mortal.", [], metadata | {"embedding_type": "semantic"}),
+    ]
+
+    mock_voyage_client.embed = Mock(return_value=Mock(embeddings=[[0.1] * 1024] * 3))
+    mock_voyage_client.multimodal_embed = Mock(
+        return_value=Mock(embeddings=[[0.1] * 1024] * 3)
+    )
+    compare_chunks(item.data_chunks(), expected)
+    compare_chunks(embed_source_item(item), expected)
+
+    assert mock_voyage_client.embed.call_count == 1
+    assert not mock_voyage_client.multimodal_embed.call_count
+
+    assert mock_voyage_client.embed.call_args == call(
+        [
+            "Subject: humans | Type: belief | Observation: The user thinks that all men must die. | Quote: All humans are mortal.",
+            "Time: 12:00 on Wednesday (afternoon) | Subject: humans | Observation: The user thinks that all men must die. | Confidence: 0.8",
+            "The user thinks that all men must die.",
+            "All humans are mortal.",
+        ],
+        model=settings.TEXT_EMBEDDING_MODEL,
+        input_type="document",
+    )
diff --git a/tests/memory/common/db/models/test_source_items.py b/tests/memory/common/db/models/test_source_items.py
index 1a1bf55..aacd0c9 100644
--- a/tests/memory/common/db/models/test_source_items.py
+++ b/tests/memory/common/db/models/test_source_items.py
@@ -14,7 +14,6 @@ from memory.common.db.models.source_items import (
     BlogPost,
     AgentObservation,
 )
-from memory.common.db.models.source_item import merge_metadata
 
 
 @pytest.fixture
@@ -356,7 +355,8 @@ def test_book_section_data_chunks(pages, expected_chunks):
     chunks = book_section.data_chunks()
 
     expected = [
-        (c, merge_metadata(book_section.as_payload(), m)) for c, m in expected_chunks
+        (c, extract.merge_metadata(book_section.as_payload(), m))
+        for c, m in expected_chunks
     ]
     assert [(c.content, c.item_metadata) for c in chunks] == expected
     for c in chunks: