From e5da3714dec14efc60089e5168db31342e4577db Mon Sep 17 00:00:00 2001 From: Daniel O'Connell Date: Tue, 3 Jun 2025 12:18:20 +0200 Subject: [PATCH] multiple dimensions for confidence values --- .../20250603_115642_add_confidences.py | 79 +++ src/memory/api/MCP/tools.py | 6 +- src/memory/common/db/models/__init__.py | 2 + src/memory/common/db/models/source_item.py | 71 +++ src/memory/common/db/models/source_items.py | 13 +- src/memory/common/formatters/observation.py | 3 - src/memory/workers/tasks/notes.py | 9 +- src/memory/workers/tasks/observations.py | 4 +- tests/integration/test_real_queries.py | 600 +++++++++--------- .../db/models/test_source_item_embeddings.py | 6 +- .../common/db/models/test_source_items.py | 27 +- .../common/formatters/test_observation.py | 24 +- .../memory/workers/tasks/test_notes_tasks.py | 17 +- 13 files changed, 499 insertions(+), 362 deletions(-) create mode 100644 db/migrations/versions/20250603_115642_add_confidences.py diff --git a/db/migrations/versions/20250603_115642_add_confidences.py b/db/migrations/versions/20250603_115642_add_confidences.py new file mode 100644 index 0000000..a600bf8 --- /dev/null +++ b/db/migrations/versions/20250603_115642_add_confidences.py @@ -0,0 +1,79 @@ +"""Add confidences + +Revision ID: 152f8b4b52e8 +Revises: ba301527a2eb +Create Date: 2025-06-03 11:56:42.302327 + +""" + +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision: str = "152f8b4b52e8" +down_revision: Union[str, None] = "ba301527a2eb" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.create_table( + "confidence_score", + sa.Column("id", sa.BigInteger(), nullable=False), + sa.Column("source_item_id", sa.BigInteger(), nullable=False), + sa.Column("confidence_type", sa.Text(), nullable=False), + sa.Column("score", sa.Numeric(precision=3, scale=2), nullable=False), + sa.CheckConstraint("score >= 0.0 AND score <= 1.0", name="score_range_check"), + sa.ForeignKeyConstraint( + ["source_item_id"], ["source_item.id"], ondelete="CASCADE" + ), + sa.PrimaryKeyConstraint("id"), + sa.UniqueConstraint( + "source_item_id", "confidence_type", name="unique_source_confidence_type" + ), + ) + op.create_index("confidence_score_idx", "confidence_score", ["score"], unique=False) + op.create_index( + "confidence_source_idx", "confidence_score", ["source_item_id"], unique=False + ) + op.create_index( + "confidence_type_idx", "confidence_score", ["confidence_type"], unique=False + ) + op.drop_index("agent_obs_confidence_idx", table_name="agent_observation") + op.drop_column("agent_observation", "confidence") + op.drop_index("note_confidence_idx", table_name="notes") + op.drop_column("notes", "confidence") + + +def downgrade() -> None: + op.add_column( + "notes", + sa.Column( + "confidence", + sa.NUMERIC(precision=3, scale=2), + server_default=sa.text("0.5"), + autoincrement=False, + nullable=False, + ), + ) + op.create_index("note_confidence_idx", "notes", ["confidence"], unique=False) + op.add_column( + "agent_observation", + sa.Column( + "confidence", + sa.NUMERIC(precision=3, scale=2), + server_default=sa.text("0.5"), + autoincrement=False, + nullable=False, + ), + ) + op.create_index( + "agent_obs_confidence_idx", "agent_observation", ["confidence"], unique=False + ) + op.drop_index("confidence_type_idx", table_name="confidence_score") + 
op.drop_index("confidence_source_idx", table_name="confidence_score") + op.drop_index("confidence_score_idx", table_name="confidence_score") + op.drop_table("confidence_score") diff --git a/src/memory/api/MCP/tools.py b/src/memory/api/MCP/tools.py index c7c24d9..57ee9dd 100644 --- a/src/memory/api/MCP/tools.py +++ b/src/memory/api/MCP/tools.py @@ -654,11 +654,9 @@ async def create_note( """ if filename: path = pathlib.Path(filename) - if path.is_absolute(): - path = path.relative_to(settings.NOTES_STORAGE_DIR) - else: + if not path.is_absolute(): path = pathlib.Path(settings.NOTES_STORAGE_DIR) / path - filename = path.as_posix() + filename = path.relative_to(settings.NOTES_STORAGE_DIR).as_posix() try: task = celery_app.send_task( diff --git a/src/memory/common/db/models/__init__.py b/src/memory/common/db/models/__init__.py index b69fc04..4882b11 100644 --- a/src/memory/common/db/models/__init__.py +++ b/src/memory/common/db/models/__init__.py @@ -2,6 +2,7 @@ from memory.common.db.models.base import Base from memory.common.db.models.source_item import ( Chunk, SourceItem, + ConfidenceScore, clean_filename, ) from memory.common.db.models.source_items import ( @@ -37,6 +38,7 @@ __all__ = [ "Chunk", "clean_filename", "SourceItem", + "ConfidenceScore", "MailMessage", "EmailAttachment", "AgentObservation", diff --git a/src/memory/common/db/models/source_item.py b/src/memory/common/db/models/source_item.py index 6522f64..2062605 100644 --- a/src/memory/common/db/models/source_item.py +++ b/src/memory/common/db/models/source_item.py @@ -22,9 +22,11 @@ from sqlalchemy import ( Text, event, func, + UniqueConstraint, ) from sqlalchemy.dialects.postgresql import BYTEA from sqlalchemy.orm import Session, relationship +from sqlalchemy.types import Numeric from memory.common import settings import memory.common.extract as extract @@ -191,6 +193,41 @@ class Chunk(Base): return items +class ConfidenceScore(Base): + """ + Stores structured confidence scores for source items. 
+ Provides detailed confidence dimensions instead of a single score. + """ + + __tablename__ = "confidence_score" + + id = Column(BigInteger, primary_key=True) + source_item_id = Column( + BigInteger, ForeignKey("source_item.id", ondelete="CASCADE"), nullable=False + ) + confidence_type = Column( + Text, nullable=False + ) # e.g., "observation_accuracy", "interpretation", "predictive_value" + score = Column(Numeric(3, 2), nullable=False) # 0.0-1.0 + + # Relationship back to source item + source_item = relationship("SourceItem", back_populates="confidence_scores") + + __table_args__ = ( + Index("confidence_source_idx", "source_item_id"), + Index("confidence_type_idx", "confidence_type"), + Index("confidence_score_idx", "score"), + CheckConstraint("score >= 0.0 AND score <= 1.0", name="score_range_check"), + # Ensure each source_item can only have one score per confidence_type + UniqueConstraint( + "source_item_id", "confidence_type", name="unique_source_confidence_type" + ), + ) + + def __repr__(self) -> str: + return f"" + + class SourceItem(Base): """Base class for all content in the system using SQLAlchemy's joined table inheritance.""" @@ -216,6 +253,11 @@ class SourceItem(Base): embed_status = Column(Text, nullable=False, server_default="RAW") chunks = relationship("Chunk", backref="source", cascade="all, delete-orphan") + # Confidence scores relationship + confidence_scores = relationship( + "ConfidenceScore", back_populates="source_item", cascade="all, delete-orphan" + ) + # Discriminator column for SQLAlchemy inheritance type = Column(String(50)) @@ -235,6 +277,35 @@ class SourceItem(Base): """Get vector IDs from associated chunks.""" return [chunk.id for chunk in self.chunks] + @property + def confidence_dict(self) -> dict[str, float]: + return { + score.confidence_type: float(score.score) + for score in self.confidence_scores + } + + def update_confidences(self, confidence_updates: dict[str, float]) -> None: + """ + Update confidence scores for this source 
item. + Merges new scores with existing ones, overwriting duplicates. + + Args: + confidence_updates: Dict mapping confidence_type to score (0.0-1.0) + """ + if not confidence_updates: + return + + current = {s.confidence_type: s for s in self.confidence_scores} + + for confidence_type, score in confidence_updates.items(): + if current_score := current.get(confidence_type): + current_score.score = score + else: + new_score = ConfidenceScore( + source_item_id=self.id, confidence_type=confidence_type, score=score + ) + self.confidence_scores.append(new_score) + def _chunk_contents(self) -> Sequence[extract.DataChunk]: content = cast(str | None, self.content) if content: diff --git a/src/memory/common/db/models/source_items.py b/src/memory/common/db/models/source_items.py index b8cf52a..3cdbdc5 100644 --- a/src/memory/common/db/models/source_items.py +++ b/src/memory/common/db/models/source_items.py @@ -505,7 +505,6 @@ class Note(SourceItem): ) note_type = Column(Text, nullable=True) subject = Column(Text, nullable=True) - confidence = Column(Numeric(3, 2), nullable=False, default=0.5) # 0.0-1.0 __mapper_args__ = { "polymorphic_identity": "note", @@ -514,7 +513,6 @@ class Note(SourceItem): __table_args__ = ( Index("note_type_idx", "note_type"), Index("note_subject_idx", "subject"), - Index("note_confidence_idx", "confidence"), ) def as_payload(self) -> dict: @@ -522,7 +520,7 @@ class Note(SourceItem): **super().as_payload(), "note_type": self.note_type, "subject": self.subject, - "confidence": float(cast(Any, self.confidence)), + "confidence": self.confidence_dict, } @property @@ -531,7 +529,7 @@ class Note(SourceItem): "subject": self.subject, "content": self.content, "note_type": self.note_type, - "confidence": self.confidence, + "confidence": self.confidence_dict, "tags": self.tags, } @@ -573,7 +571,6 @@ class AgentObservation(SourceItem): Text, nullable=False ) # belief, preference, pattern, contradiction, behavior subject = Column(Text, nullable=False) # What/who 
the observation is about - confidence = Column(Numeric(3, 2), nullable=False, default=0.8) # 0.0-1.0 evidence = Column(JSONB) # Supporting context, quotes, etc. agent_model = Column(Text, nullable=False) # Which AI model made this observation @@ -599,7 +596,6 @@ class AgentObservation(SourceItem): Index("agent_obs_session_idx", "session_id"), Index("agent_obs_type_idx", "observation_type"), Index("agent_obs_subject_idx", "subject"), - Index("agent_obs_confidence_idx", "confidence"), Index("agent_obs_model_idx", "agent_model"), ) @@ -613,7 +609,7 @@ class AgentObservation(SourceItem): **super().as_payload(), "observation_type": self.observation_type, "subject": self.subject, - "confidence": float(cast(Any, self.confidence)), + "confidence": self.confidence_dict, "evidence": self.evidence, "agent_model": self.agent_model, } @@ -633,7 +629,7 @@ class AgentObservation(SourceItem): "content": self.content, "observation_type": self.observation_type, "evidence": self.evidence, - "confidence": self.confidence, + "confidence": self.confidence_dict, "agent_model": self.agent_model, "tags": self.tags, } @@ -664,7 +660,6 @@ class AgentObservation(SourceItem): temporal_text = observation.generate_temporal_text( cast(str, self.subject), cast(str, self.content), - cast(float, self.confidence), cast(datetime, self.inserted_at), ) if temporal_text: diff --git a/src/memory/common/formatters/observation.py b/src/memory/common/formatters/observation.py index c89d37a..a644df5 100644 --- a/src/memory/common/formatters/observation.py +++ b/src/memory/common/formatters/observation.py @@ -31,7 +31,6 @@ def generate_semantic_text( def generate_temporal_text( subject: str, content: str, - confidence: float, created_at: datetime, ) -> str: """Generate text with temporal context for time-pattern search.""" @@ -55,8 +54,6 @@ def generate_temporal_text( f"Subject: {subject}", f"Observation: {content}", ] - if confidence is not None: - parts.append(f"Confidence: {confidence}") return " | 
".join(parts) diff --git a/src/memory/workers/tasks/notes.py b/src/memory/workers/tasks/notes.py index b76c284..8941ca4 100644 --- a/src/memory/workers/tasks/notes.py +++ b/src/memory/workers/tasks/notes.py @@ -1,7 +1,6 @@ import logging import pathlib -from memory.common import settings from memory.common.db.connection import make_session from memory.common.db.models import Note from memory.common.celery_app import app, SYNC_NOTE, SYNC_NOTES @@ -23,7 +22,7 @@ def sync_note( content: str, filename: str | None = None, note_type: str | None = None, - confidence: float | None = None, + confidences: dict[str, float] = {}, tags: list[str] = [], ): logger.info(f"Syncing note {subject}") @@ -32,6 +31,8 @@ def sync_note( if filename: filename = filename.lstrip("/") + if not filename.endswith(".md"): + filename = f"{filename}.md" with make_session() as session: existing_note = check_content_exists(session, Note, sha256=sha256) @@ -45,7 +46,6 @@ def sync_note( note = Note( modality="note", mime_type="text/markdown", - confidence=confidence or 0.5, ) else: logger.info("Editing preexisting note") @@ -58,11 +58,10 @@ def sync_note( if note_type: note.note_type = note_type # type: ignore - if confidence: - note.confidence = confidence # type: ignore if tags: note.tags = tags # type: ignore + note.update_confidences(confidences) note.save_to_file() return process_content_item(note, session) diff --git a/src/memory/workers/tasks/observations.py b/src/memory/workers/tasks/observations.py index ef6c096..c5d95fc 100644 --- a/src/memory/workers/tasks/observations.py +++ b/src/memory/workers/tasks/observations.py @@ -21,7 +21,7 @@ def sync_observation( content: str, observation_type: str, evidence: dict | None = None, - confidence: float = 0.5, + confidences: dict[str, float] = {}, session_id: str | None = None, agent_model: str = "unknown", tags: list[str] = [], @@ -33,7 +33,6 @@ def sync_observation( content=content, subject=subject, observation_type=observation_type, - 
confidence=confidence, evidence=evidence, tags=tags or [], session_id=session_id, @@ -43,6 +42,7 @@ def sync_observation( sha256=sha256, modality="observation", ) + observation.update_confidences(confidences) with make_session() as session: existing_observation = check_content_exists( diff --git a/tests/integration/test_real_queries.py b/tests/integration/test_real_queries.py index 36f0415..cdf3cad 100644 --- a/tests/integration/test_real_queries.py +++ b/tests/integration/test_real_queries.py @@ -98,64 +98,64 @@ EXPECTED_OBSERVATION_RESULTS = { 0.7104, "The user believes functional programming leads to better code quality", ), - (0.6792, "I prefer functional programming over OOP"), + (0.6788, "I prefer functional programming over OOP"), ( - 0.6772, + 0.6759, "Subject: programming_philosophy | Type: belief | Observation: The user believes functional programming leads to better code quality | Quote: Functional programming produces more maintainable code", ), ( - 0.6677, + 0.6678, "Subject: programming_paradigms | Type: preference | Observation: The user prefers functional programming over OOP | Quote: I prefer functional programming over OOP", ), ], "temporal": [ ( - 0.5816, - "Time: 12:00 on Wednesday (afternoon) | Subject: programming_philosophy | Observation: The user believes functional programming leads to better code quality | Confidence: 0.8", + 0.5971, + "Time: 12:00 on Wednesday (afternoon) | Subject: programming_philosophy | Observation: The user believes functional programming leads to better code quality", ), ( - 0.5246, - "Time: 12:00 on Wednesday (afternoon) | Subject: programming_paradigms | Observation: The user prefers functional programming over OOP | Confidence: 0.8", + 0.5308, + "Time: 12:00 on Wednesday (afternoon) | Subject: programming_paradigms | Observation: The user prefers functional programming over OOP", ), ( - 0.5214, - "Time: 12:00 on Wednesday (afternoon) | Subject: pure_functions | Observation: The user said pure functions are yucky 
| Confidence: 0.8", + 0.5167, + "Time: 12:00 on Wednesday (afternoon) | Subject: pure_functions | Observation: The user said pure functions are yucky", ), ( - 0.4645, - "Time: 12:00 on Wednesday (afternoon) | Subject: refactoring | Observation: The user always refactors to pure functions | Confidence: 0.8", + 0.4702, + "Time: 12:00 on Wednesday (afternoon) | Subject: refactoring | Observation: The user always refactors to pure functions", ), ], }, "Does the user prefer functional or object-oriented programming?": { "semantic": [ - (0.7718, "The user prefers functional programming over OOP"), + (0.7719, "The user prefers functional programming over OOP"), ( - 0.754, + 0.7541, "Subject: programming_paradigms | Type: preference | Observation: The user prefers functional programming over OOP | Quote: I prefer functional programming over OOP", ), - (0.7454, "I prefer functional programming over OOP"), + (0.7455, "I prefer functional programming over OOP"), ( - 0.6541, - "Subject: programming_philosophy | Type: belief | Observation: The user believes functional programming leads to better code quality | Quote: Functional programming produces more maintainable code", + 0.6536, + "The user believes functional programming leads to better code quality", ), ], "temporal": [ ( - 0.6188, - "Time: 12:00 on Wednesday (afternoon) | Subject: programming_paradigms | Observation: The user prefers functional programming over OOP | Confidence: 0.8", + 0.6251, + "Time: 12:00 on Wednesday (afternoon) | Subject: programming_paradigms | Observation: The user prefers functional programming over OOP", ), ( - 0.5902, - "Time: 12:00 on Wednesday (afternoon) | Subject: programming_philosophy | Observation: The user believes functional programming leads to better code quality | Confidence: 0.8", + 0.6062, + "Time: 12:00 on Wednesday (afternoon) | Subject: programming_philosophy | Observation: The user believes functional programming leads to better code quality", ), ( - 0.5144, - "Time: 12:00 on 
Wednesday (afternoon) | Subject: pure_functions | Observation: The user said pure functions are yucky | Confidence: 0.8", + 0.5061, + "Time: 12:00 on Wednesday (afternoon) | Subject: pure_functions | Observation: The user said pure functions are yucky", ), ( - 0.4989, - "Time: 12:00 on Wednesday (afternoon) | Subject: refactoring | Observation: The user always refactors to pure functions | Confidence: 0.8", + 0.5036, + "Time: 12:00 on Wednesday (afternoon) | Subject: refactoring | Observation: The user always refactors to pure functions", ), ], }, @@ -163,194 +163,194 @@ EXPECTED_OBSERVATION_RESULTS = { "semantic": [ (0.6925, "The user believes code reviews are essential for quality"), ( - 0.68, + 0.6801, "The user believes functional programming leads to better code quality", ), ( - 0.6524, + 0.6525, "Subject: code_quality | Type: belief | Observation: The user believes code reviews are essential for quality | Quote: Code reviews catch bugs that automated testing misses", ), ( - 0.6466, + 0.6471, "Subject: programming_philosophy | Type: belief | Observation: The user believes functional programming leads to better code quality | Quote: Functional programming produces more maintainable code", ), ], "temporal": [ ( - 0.5544, - "Time: 12:00 on Wednesday (afternoon) | Subject: code_quality | Observation: The user believes code reviews are essential for quality | Confidence: 0.8", + 0.5269, + "Time: 12:00 on Wednesday (afternoon) | Subject: programming_philosophy | Observation: The user believes functional programming leads to better code quality", ), ( - 0.5397, - "Time: 12:00 on Wednesday (afternoon) | Subject: programming_philosophy | Observation: The user believes functional programming leads to better code quality | Confidence: 0.8", + 0.5193, + "Time: 12:00 on Wednesday (afternoon) | Subject: code_quality | Observation: The user believes code reviews are essential for quality", ), ( - 0.4931, - "Time: 12:00 on Wednesday (afternoon) | Subject: testing_philosophy | 
Observation: The user believes unit tests are a waste of time for prototypes | Confidence: 0.8", + 0.468, + "Time: 12:00 on Wednesday (afternoon) | Subject: testing_philosophy | Observation: The user believes unit tests are a waste of time for prototypes", ), ( - 0.4674, - "Time: 12:00 on Wednesday (afternoon) | Subject: pure_functions | Observation: The user said pure functions are yucky | Confidence: 0.8", + 0.4377, + "Time: 12:00 on Wednesday (afternoon) | Subject: pure_functions | Observation: The user said pure functions are yucky", ), ], }, "How does the user approach debugging code?": { "semantic": [ ( - 0.7011, + 0.7007, "Subject: debugging_approach | Type: behavior | Observation: The user debugs by adding print statements rather than using a debugger | Quote: When debugging, I just add console.log everywhere", ), ( - 0.6962, + 0.6956, "The user debugs by adding print statements rather than using a debugger", ), - (0.6788, "When debugging, I just add console.log everywhere"), + (0.6795, "When debugging, I just add console.log everywhere"), ( - 0.5357, + 0.5352, "Subject: code_quality | Type: belief | Observation: The user believes code reviews are essential for quality | Quote: Code reviews catch bugs that automated testing misses", ), ], "temporal": [ ( - 0.6252, - "Time: 12:00 on Wednesday (afternoon) | Subject: debugging_approach | Observation: The user debugs by adding print statements rather than using a debugger | Confidence: 0.8", + 0.6253, + "Time: 12:00 on Wednesday (afternoon) | Subject: debugging_approach | Observation: The user debugs by adding print statements rather than using a debugger", ), ( - 0.476, - "Time: 12:00 on Wednesday (afternoon) | Subject: indentation_preference | Observation: The user claims to prefer tabs but their code uses spaces | Confidence: 0.8", + 0.48, + "Time: 12:00 on Wednesday (afternoon) | Subject: indentation_preference | Observation: The user claims to prefer tabs but their code uses spaces", ), ( - 0.4424, - 
"Time: 12:00 on Wednesday (afternoon) | Subject: version_control_style | Observation: The user prefers small, focused commits over large feature branches | Confidence: 0.8", + 0.4589, + "Time: 12:00 on Wednesday (afternoon) | Subject: testing_philosophy | Observation: The user believes unit tests are a waste of time for prototypes", ), ( - 0.4402, - "Time: 12:00 on Wednesday (afternoon) | Subject: testing_philosophy | Observation: The user believes unit tests are a waste of time for prototypes | Confidence: 0.8", + 0.4502, + "Time: 12:00 on Wednesday (afternoon) | Subject: version_control_style | Observation: The user prefers small, focused commits over large feature branches", ), ], }, "What are the user's git and version control habits?": { "semantic": [ ( - 0.6474, + 0.6485, "Subject: version_control_style | Type: preference | Observation: The user prefers small, focused commits over large feature branches | Quote: I like to commit small, logical changes frequently", ), - (0.6424, "I like to commit small, logical changes frequently"), + (0.643, "I like to commit small, logical changes frequently"), ( - 0.5961, + 0.5968, "The user prefers small, focused commits over large feature branches", ), ( - 0.5806, + 0.5813, "Subject: git_habits | Type: behavior | Observation: The user writes commit messages in present tense | Quote: Fix bug in parser instead of Fixed bug in parser", ), ], "temporal": [ ( - 0.6174, - "Time: 12:00 on Wednesday (afternoon) | Subject: version_control_style | Observation: The user prefers small, focused commits over large feature branches | Confidence: 0.8", + 0.6063, + "Time: 12:00 on Wednesday (afternoon) | Subject: version_control_style | Observation: The user prefers small, focused commits over large feature branches", ), ( - 0.5733, - "Time: 12:00 on Wednesday (afternoon) | Subject: git_habits | Observation: The user writes commit messages in present tense | Confidence: 0.8", + 0.5569, + "Time: 12:00 on Wednesday (afternoon) | Subject: 
git_habits | Observation: The user writes commit messages in present tense", ), ( - 0.4848, - "Time: 12:00 on Wednesday (afternoon) | Subject: editor_preference | Observation: The user prefers Vim over VS Code for editing | Confidence: 0.8", + 0.4806, + "Time: 12:00 on Wednesday (afternoon) | Subject: editor_preference | Observation: The user prefers Vim over VS Code for editing", ), ( - 0.4604, - "Time: 12:00 on Wednesday (afternoon) | Subject: indentation_preference | Observation: The user claims to prefer tabs but their code uses spaces | Confidence: 0.8", + 0.4622, + "Time: 12:00 on Wednesday (afternoon) | Subject: code_quality | Observation: The user believes code reviews are essential for quality", ), ], }, "When does the user prefer to work?": { "semantic": [ - (0.6806, "The user prefers working late at night"), + (0.6805, "The user prefers working late at night"), ( - 0.6792, + 0.6794, "Subject: work_schedule | Type: behavior | Observation: The user prefers working late at night | Quote: I do my best coding between 10pm and 2am", ), - (0.6439, "I do my best coding between 10pm and 2am"), - (0.5528, "I use 25-minute work intervals with 5-minute breaks"), + (0.6432, "I do my best coding between 10pm and 2am"), + (0.5525, "I use 25-minute work intervals with 5-minute breaks"), ], "temporal": [ ( - 0.7023, - "Time: 12:00 on Wednesday (afternoon) | Subject: work_schedule | Observation: The user prefers working late at night | Confidence: 0.8", + 0.6896, + "Time: 12:00 on Wednesday (afternoon) | Subject: work_schedule | Observation: The user prefers working late at night", ), ( - 0.6395, - "Time: 12:00 on Wednesday (afternoon) | Subject: domain_preference | Observation: The user prefers working on backend systems over frontend UI | Confidence: 0.8", + 0.6327, + "Time: 12:00 on Wednesday (afternoon) | Subject: domain_preference | Observation: The user prefers working on backend systems over frontend UI", ), ( - 0.6375, - "Time: 12:00 on Wednesday (afternoon) | 
Subject: work_environment | Observation: The user thinks remote work is more productive than office work | Confidence: 0.8", + 0.6266, + "Time: 12:00 on Wednesday (afternoon) | Subject: work_environment | Observation: The user thinks remote work is more productive than office work", ), ( - 0.6254, - "Time: 12:00 on Wednesday (afternoon) | Subject: collaboration_preference | Observation: The user prefers pair programming for complex problems | Confidence: 0.8", + 0.6206, + "Time: 12:00 on Wednesday (afternoon) | Subject: collaboration_preference | Observation: The user prefers pair programming for complex problems", ), ], }, "How does the user handle productivity and time management?": { "semantic": [ ( - 0.579, + 0.5795, "Subject: productivity_methods | Type: behavior | Observation: The user takes breaks every 25 minutes using the Pomodoro technique | Quote: I use 25-minute work intervals with 5-minute breaks", ), - (0.5731, "I use 25-minute work intervals with 5-minute breaks"), + (0.5727, "I use 25-minute work intervals with 5-minute breaks"), ( - 0.5284, + 0.5282, "The user takes breaks every 25 minutes using the Pomodoro technique", ), - (0.5153, "I do my best coding between 10pm and 2am"), + (0.515, "I do my best coding between 10pm and 2am"), ], "temporal": [ ( - 0.5705, - "Time: 12:00 on Wednesday (afternoon) | Subject: productivity_methods | Observation: The user takes breaks every 25 minutes using the Pomodoro technique | Confidence: 0.8", + 0.5633, + "Time: 12:00 on Wednesday (afternoon) | Subject: productivity_methods | Observation: The user takes breaks every 25 minutes using the Pomodoro technique", ), ( - 0.5023, - "Time: 12:00 on Wednesday (afternoon) | Subject: work_environment | Observation: The user thinks remote work is more productive than office work | Confidence: 0.8", + 0.5105, + "Time: 12:00 on Wednesday (afternoon) | Subject: work_environment | Observation: The user thinks remote work is more productive than office work", ), ( - 0.4631, - 
"Time: 12:00 on Wednesday (afternoon) | Subject: work_schedule | Observation: The user prefers working late at night | Confidence: 0.8", + 0.4737, + "Time: 12:00 on Wednesday (afternoon) | Subject: documentation_habits | Observation: The user always writes documentation before implementing features", ), ( - 0.4626, - "Time: 12:00 on Wednesday (afternoon) | Subject: documentation_habits | Observation: The user always writes documentation before implementing features | Confidence: 0.8", + 0.4672, + "Time: 12:00 on Wednesday (afternoon) | Subject: work_schedule | Observation: The user prefers working late at night", ), ], }, "What editor does the user prefer?": { "semantic": [ ( - 0.6394, + 0.6398, "Subject: editor_preference | Type: preference | Observation: The user prefers Vim over VS Code for editing | Quote: Vim makes me more productive than any modern editor", ), - (0.6241, "The user prefers Vim over VS Code for editing"), - (0.5528, "Vim makes me more productive than any modern editor"), + (0.6242, "The user prefers Vim over VS Code for editing"), + (0.5524, "Vim makes me more productive than any modern editor"), (0.4887, "The user claims to prefer tabs but their code uses spaces"), ], "temporal": [ ( - 0.5701, - "Time: 12:00 on Wednesday (afternoon) | Subject: editor_preference | Observation: The user prefers Vim over VS Code for editing | Confidence: 0.8", + 0.5626, + "Time: 12:00 on Wednesday (afternoon) | Subject: editor_preference | Observation: The user prefers Vim over VS Code for editing", ), ( - 0.4557, - "Time: 12:00 on Wednesday (afternoon) | Subject: indentation_preference | Observation: The user claims to prefer tabs but their code uses spaces | Confidence: 0.8", + 0.4507, + "Time: 12:00 on Wednesday (afternoon) | Subject: indentation_preference | Observation: The user claims to prefer tabs but their code uses spaces", ), ( - 0.4322, - "Time: 12:00 on Wednesday (afternoon) | Subject: domain_preference | Observation: The user prefers working on 
backend systems over frontend UI | Confidence: 0.8", + 0.4333, + "Time: 12:00 on Wednesday (afternoon) | Subject: database_preference | Observation: The user prefers PostgreSQL over MongoDB for most applications", ), ( - 0.4283, - "Time: 12:00 on Wednesday (afternoon) | Subject: database_preference | Observation: The user prefers PostgreSQL over MongoDB for most applications | Confidence: 0.8", + 0.4307, + "Time: 12:00 on Wednesday (afternoon) | Subject: domain_preference | Observation: The user prefers working on backend systems over frontend UI", ), ], }, @@ -360,247 +360,247 @@ EXPECTED_OBSERVATION_RESULTS = { 0.6328, "Subject: database_preference | Type: preference | Observation: The user prefers PostgreSQL over MongoDB for most applications | Quote: Relational databases handle complex queries better than document stores", ), - (0.5992, "The user prefers PostgreSQL over MongoDB for most applications"), + (0.5991, "The user prefers PostgreSQL over MongoDB for most applications"), ( - 0.5352, + 0.5357, "Subject: domain_preference | Type: preference | Observation: The user prefers working on backend systems over frontend UI | Quote: I find backend logic more interesting than UI work", ), - (0.5186, "The user prefers working on backend systems over frontend UI"), + (0.5178, "The user prefers working on backend systems over frontend UI"), ], "temporal": [ ( - 0.5599, - "Time: 12:00 on Wednesday (afternoon) | Subject: database_preference | Observation: The user prefers PostgreSQL over MongoDB for most applications | Confidence: 0.8", + 0.5503, + "Time: 12:00 on Wednesday (afternoon) | Subject: database_preference | Observation: The user prefers PostgreSQL over MongoDB for most applications", ), ( - 0.4617, - "Time: 12:00 on Wednesday (afternoon) | Subject: domain_preference | Observation: The user prefers working on backend systems over frontend UI | Confidence: 0.8", + 0.4583, + "Time: 12:00 on Wednesday (afternoon) | Subject: domain_preference | Observation: The 
user prefers working on backend systems over frontend UI", ), ( 0.4445, - "Time: 12:00 on Wednesday (afternoon) | Subject: primary_languages | Observation: The user primarily works with Python and JavaScript | Confidence: 0.8", + "Time: 12:00 on Wednesday (afternoon) | Subject: primary_languages | Observation: The user primarily works with Python and JavaScript", ), ( - 0.4365, - "Time: 12:00 on Wednesday (afternoon) | Subject: editor_preference | Observation: The user prefers Vim over VS Code for editing | Confidence: 0.8", + 0.427, + "Time: 12:00 on Wednesday (afternoon) | Subject: editor_preference | Observation: The user prefers Vim over VS Code for editing", ), ], }, "What programming languages does the user work with?": { "semantic": [ - (0.7255, "The user primarily works with Python and JavaScript"), - (0.6954, "Most of my work is in Python backend and React frontend"), + (0.7264, "The user primarily works with Python and JavaScript"), + (0.6958, "Most of my work is in Python backend and React frontend"), ( - 0.6874, + 0.6875, "Subject: primary_languages | Type: general | Observation: The user primarily works with Python and JavaScript | Quote: Most of my work is in Python backend and React frontend", ), - (0.6098, "I'm picking up Rust on weekends"), + (0.6111, "I'm picking up Rust on weekends"), ], "temporal": [ ( - 0.5939, - "Time: 12:00 on Wednesday (afternoon) | Subject: primary_languages | Observation: The user primarily works with Python and JavaScript | Confidence: 0.8", + 0.5774, + "Time: 12:00 on Wednesday (afternoon) | Subject: primary_languages | Observation: The user primarily works with Python and JavaScript", ), ( - 0.4679, - "Time: 12:00 on Wednesday (afternoon) | Subject: experience_level | Observation: The user has 8 years of professional programming experience | Confidence: 0.8", + 0.4692, + "Time: 12:00 on Wednesday (afternoon) | Subject: experience_level | Observation: The user has 8 years of professional programming experience", ), ( - 
0.4623, - "Time: 12:00 on Wednesday (afternoon) | Subject: learning_activities | Observation: The user is currently learning Rust in their spare time | Confidence: 0.8", + 0.454, + "Time: 12:00 on Wednesday (afternoon) | Subject: programming_philosophy | Observation: The user believes functional programming leads to better code quality", ), ( - 0.4514, - "Time: 12:00 on Wednesday (afternoon) | Subject: programming_philosophy | Observation: The user believes functional programming leads to better code quality | Confidence: 0.8", + 0.4475, + "Time: 12:00 on Wednesday (afternoon) | Subject: learning_activities | Observation: The user is currently learning Rust in their spare time", ), ], }, "What is the user's programming experience level?": { "semantic": [ - (0.6664, "The user has 8 years of professional programming experience"), + (0.6663, "The user has 8 years of professional programming experience"), ( - 0.6565, + 0.6562, "Subject: experience_level | Type: general | Observation: The user has 8 years of professional programming experience | Quote: I've been coding professionally for 8 years", ), - (0.5949, "I've been coding professionally for 8 years"), - (0.5641, "The user is currently learning Rust in their spare time"), + (0.5952, "I've been coding professionally for 8 years"), + (0.5656, "The user is currently learning Rust in their spare time"), ], "temporal": [ ( - 0.5991, - "Time: 12:00 on Wednesday (afternoon) | Subject: experience_level | Observation: The user has 8 years of professional programming experience | Confidence: 0.8", + 0.5808, + "Time: 12:00 on Wednesday (afternoon) | Subject: experience_level | Observation: The user has 8 years of professional programming experience", ), ( - 0.5041, - "Time: 12:00 on Wednesday (afternoon) | Subject: primary_languages | Observation: The user primarily works with Python and JavaScript | Confidence: 0.8", + 0.4814, + "Time: 12:00 on Wednesday (afternoon) | Subject: primary_languages | Observation: The user 
primarily works with Python and JavaScript", ), ( - 0.4917, - "Time: 12:00 on Wednesday (afternoon) | Subject: programming_philosophy | Observation: The user believes functional programming leads to better code quality | Confidence: 0.8", + 0.4752, + "Time: 12:00 on Wednesday (afternoon) | Subject: programming_philosophy | Observation: The user believes functional programming leads to better code quality", ), ( - 0.4817, - "Time: 12:00 on Wednesday (afternoon) | Subject: programming_paradigms | Observation: The user prefers functional programming over OOP | Confidence: 0.8", + 0.4591, + "Time: 12:00 on Wednesday (afternoon) | Subject: programming_paradigms | Observation: The user prefers functional programming over OOP", ), ], }, "Where did the user study computer science?": { "semantic": [ - (0.6863, "I studied CS at Stanford"), - (0.649, "The user graduated with a Computer Science degree from Stanford"), + (0.686, "I studied CS at Stanford"), + (0.6484, "The user graduated with a Computer Science degree from Stanford"), ( - 0.6344, + 0.6346, "Subject: education_background | Type: general | Observation: The user graduated with a Computer Science degree from Stanford | Quote: I studied CS at Stanford", ), - (0.4592, "The user is currently learning Rust in their spare time"), + (0.4599, "The user is currently learning Rust in their spare time"), ], "temporal": [ ( - 0.5455, - "Time: 12:00 on Wednesday (afternoon) | Subject: education_background | Observation: The user graduated with a Computer Science degree from Stanford | Confidence: 0.8", + 0.5288, + "Time: 12:00 on Wednesday (afternoon) | Subject: education_background | Observation: The user graduated with a Computer Science degree from Stanford", ), ( - 0.3842, - "Time: 12:00 on Wednesday (afternoon) | Subject: experience_level | Observation: The user has 8 years of professional programming experience | Confidence: 0.8", + 0.3833, + "Time: 12:00 on Wednesday (afternoon) | Subject: experience_level | 
Observation: The user has 8 years of professional programming experience", ), ( - 0.3792, - "Time: 12:00 on Wednesday (afternoon) | Subject: primary_languages | Observation: The user primarily works with Python and JavaScript | Confidence: 0.8", + 0.3728, + "Time: 12:00 on Wednesday (afternoon) | Subject: primary_languages | Observation: The user primarily works with Python and JavaScript", ), ( - 0.3781, - "Time: 12:00 on Wednesday (afternoon) | Subject: learning_activities | Observation: The user is currently learning Rust in their spare time | Confidence: 0.8", + 0.3651, + "Time: 12:00 on Wednesday (afternoon) | Subject: learning_activities | Observation: The user is currently learning Rust in their spare time", ), ], }, "What kind of company does the user work at?": { "semantic": [ - (0.6308, "The user works at a mid-size startup with 50 employees"), + (0.6304, "The user works at a mid-size startup with 50 employees"), ( - 0.5371, + 0.5369, "Subject: company_size | Type: general | Observation: The user works at a mid-size startup with 50 employees | Quote: Our company has about 50 people", ), - (0.5253, "Most of my work is in Python backend and React frontend"), - (0.4902, "I've been coding professionally for 8 years"), + (0.5258, "Most of my work is in Python backend and React frontend"), + (0.4905, "I've been coding professionally for 8 years"), ], "temporal": [ ( - 0.5309, - "Time: 12:00 on Wednesday (afternoon) | Subject: company_size | Observation: The user works at a mid-size startup with 50 employees | Confidence: 0.8", + 0.5194, + "Time: 12:00 on Wednesday (afternoon) | Subject: company_size | Observation: The user works at a mid-size startup with 50 employees", ), ( - 0.4329, - "Time: 12:00 on Wednesday (afternoon) | Subject: work_environment | Observation: The user thinks remote work is more productive than office work | Confidence: 0.8", + 0.4149, + "Time: 12:00 on Wednesday (afternoon) | Subject: work_environment | Observation: The user thinks 
remote work is more productive than office work", ), ( - 0.4323, - "Time: 12:00 on Wednesday (afternoon) | Subject: education_background | Observation: The user graduated with a Computer Science degree from Stanford | Confidence: 0.8", + 0.4144, + "Time: 12:00 on Wednesday (afternoon) | Subject: education_background | Observation: The user graduated with a Computer Science degree from Stanford", ), ( - 0.419, - "Time: 12:00 on Wednesday (afternoon) | Subject: work_schedule | Observation: The user prefers working late at night | Confidence: 0.8", + 0.4053, + "Time: 12:00 on Wednesday (afternoon) | Subject: experience_level | Observation: The user has 8 years of professional programming experience", ), ], }, "What does the user think about AI replacing programmers?": { "semantic": [ ( - 0.5965, + 0.5955, "Subject: ai_future | Type: belief | Observation: The user thinks AI will replace most software developers within 10 years | Quote: AI will make most programmers obsolete by 2035", ), + (0.5725, "AI will make most programmers obsolete by 2035"), ( 0.572, "The user thinks AI will replace most software developers within 10 years", ), - (0.5715, "AI will make most programmers obsolete by 2035"), ( - 0.4344, + 0.4342, "The user believes functional programming leads to better code quality", ), ], "temporal": [ ( - 0.4629, - "Time: 12:00 on Wednesday (afternoon) | Subject: ai_future | Observation: The user thinks AI will replace most software developers within 10 years | Confidence: 0.8", + 0.4546, + "Time: 12:00 on Wednesday (afternoon) | Subject: ai_future | Observation: The user thinks AI will replace most software developers within 10 years", ), ( - 0.362, - "Time: 12:00 on Wednesday (afternoon) | Subject: programming_philosophy | Observation: The user believes functional programming leads to better code quality | Confidence: 0.8", + 0.3583, + "Time: 12:00 on Wednesday (afternoon) | Subject: programming_philosophy | Observation: The user believes functional programming 
leads to better code quality", ), ( - 0.3308, - "Time: 12:00 on Wednesday (afternoon) | Subject: testing_philosophy | Observation: The user believes unit tests are a waste of time for prototypes | Confidence: 0.8", + 0.3264, + "Time: 12:00 on Wednesday (afternoon) | Subject: typescript_opinion | Observation: The user now says they love TypeScript but previously called it verbose", ), ( - 0.328, - "Time: 12:00 on Wednesday (afternoon) | Subject: typescript_opinion | Observation: The user now says they love TypeScript but previously called it verbose | Confidence: 0.8", + 0.3257, + "Time: 12:00 on Wednesday (afternoon) | Subject: testing_philosophy | Observation: The user believes unit tests are a waste of time for prototypes", ), ], }, "What are the user's views on artificial intelligence?": { "semantic": [ ( - 0.5885, + 0.5884, "Subject: ai_future | Type: belief | Observation: The user thinks AI will replace most software developers within 10 years | Quote: AI will make most programmers obsolete by 2035", ), ( - 0.5661, + 0.5659, "The user thinks AI will replace most software developers within 10 years", ), - (0.5133, "AI will make most programmers obsolete by 2035"), + (0.5139, "AI will make most programmers obsolete by 2035"), (0.4927, "I find backend logic more interesting than UI work"), ], "temporal": [ ( - 0.5399, - "Time: 12:00 on Wednesday (afternoon) | Subject: ai_future | Observation: The user thinks AI will replace most software developers within 10 years | Confidence: 0.8", + 0.5205, + "Time: 12:00 on Wednesday (afternoon) | Subject: ai_future | Observation: The user thinks AI will replace most software developers within 10 years", ), ( - 0.4353, - "Time: 12:00 on Wednesday (afternoon) | Subject: programming_philosophy | Observation: The user believes functional programming leads to better code quality | Confidence: 0.8", + 0.4203, + "Time: 12:00 on Wednesday (afternoon) | Subject: programming_philosophy | Observation: The user believes functional 
programming leads to better code quality", ), ( - 0.4223, - "Time: 12:00 on Wednesday (afternoon) | Subject: humans | Observation: The user thinks that all men must die. | Confidence: 0.8", + 0.4007, + "Time: 12:00 on Wednesday (afternoon) | Subject: pure_functions | Observation: The user said pure functions are yucky", ), ( - 0.4219, - "Time: 12:00 on Wednesday (afternoon) | Subject: pure_functions | Observation: The user said pure functions are yucky | Confidence: 0.8", + 0.4001, + "Time: 12:00 on Wednesday (afternoon) | Subject: humans | Observation: The user thinks that all men must die.", ), ], }, "Has the user changed their mind about TypeScript?": { "semantic": [ ( - 0.6174, + 0.6166, "The user now says they love TypeScript but previously called it verbose", ), ( - 0.5757, + 0.5764, "Subject: typescript_opinion | Type: contradiction | Observation: The user now says they love TypeScript but previously called it verbose | Quote: TypeScript has too much boilerplate vs TypeScript makes my code so much cleaner", ), ( - 0.4924, + 0.4907, "TypeScript has too much boilerplate vs TypeScript makes my code so much cleaner", ), - (0.4157, "The user always refactors to pure functions"), + (0.4159, "The user always refactors to pure functions"), ], "temporal": [ ( - 0.5631, - "Time: 12:00 on Wednesday (afternoon) | Subject: typescript_opinion | Observation: The user now says they love TypeScript but previously called it verbose | Confidence: 0.8", + 0.5663, + "Time: 12:00 on Wednesday (afternoon) | Subject: typescript_opinion | Observation: The user now says they love TypeScript but previously called it verbose", ), ( - 0.4016, - "Time: 12:00 on Wednesday (afternoon) | Subject: indentation_preference | Observation: The user claims to prefer tabs but their code uses spaces | Confidence: 0.8", + 0.3897, + "Time: 12:00 on Wednesday (afternoon) | Subject: indentation_preference | Observation: The user claims to prefer tabs but their code uses spaces", ), ( - 0.3827, - "Time: 
12:00 on Wednesday (afternoon) | Subject: primary_languages | Observation: The user primarily works with Python and JavaScript | Confidence: 0.8", + 0.3833, + "Time: 12:00 on Wednesday (afternoon) | Subject: primary_languages | Observation: The user primarily works with Python and JavaScript", ), ( - 0.3825, - "Time: 12:00 on Wednesday (afternoon) | Subject: editor_preference | Observation: The user prefers Vim over VS Code for editing | Confidence: 0.8", + 0.3761, + "Time: 12:00 on Wednesday (afternoon) | Subject: editor_preference | Observation: The user prefers Vim over VS Code for editing", ), ], }, @@ -612,168 +612,168 @@ EXPECTED_OBSERVATION_RESULTS = { "Subject: indentation_preference | Type: contradiction | Observation: The user claims to prefer tabs but their code uses spaces | Quote: Tabs are better than spaces vs code consistently uses 2-space indentation", ), ( - 0.5321, + 0.5328, "Subject: pure_functions | Type: contradiction | Observation: The user said pure functions are yucky | Quote: Pure functions are yucky", ), ( - 0.5058, + 0.507, "Subject: typescript_opinion | Type: contradiction | Observation: The user now says they love TypeScript but previously called it verbose | Quote: TypeScript has too much boilerplate vs TypeScript makes my code so much cleaner", ), ], "temporal": [ ( - 0.4763, - "Time: 12:00 on Wednesday (afternoon) | Subject: indentation_preference | Observation: The user claims to prefer tabs but their code uses spaces | Confidence: 0.8", + 0.4671, + "Time: 12:00 on Wednesday (afternoon) | Subject: domain_preference | Observation: The user prefers working on backend systems over frontend UI", ), ( - 0.4693, - "Time: 12:00 on Wednesday (afternoon) | Subject: domain_preference | Observation: The user prefers working on backend systems over frontend UI | Confidence: 0.8", + 0.4661, + "Time: 12:00 on Wednesday (afternoon) | Subject: indentation_preference | Observation: The user claims to prefer tabs but their code uses spaces", ), ( - 
0.4681, - "Time: 12:00 on Wednesday (afternoon) | Subject: pure_functions | Observation: The user said pure functions are yucky | Confidence: 0.8", + 0.4566, + "Time: 12:00 on Wednesday (afternoon) | Subject: pure_functions | Observation: The user said pure functions are yucky", ), ( - 0.4586, - "Time: 12:00 on Wednesday (afternoon) | Subject: typescript_opinion | Observation: The user now says they love TypeScript but previously called it verbose | Confidence: 0.8", + 0.4553, + "Time: 12:00 on Wednesday (afternoon) | Subject: database_preference | Observation: The user prefers PostgreSQL over MongoDB for most applications", ), ], }, "What does the user think about software testing?": { "semantic": [ ( - 0.6386, + 0.6384, "Subject: testing_philosophy | Type: belief | Observation: The user believes unit tests are a waste of time for prototypes | Quote: Writing tests for throwaway code slows development", ), - (0.6222, "The user believes unit tests are a waste of time for prototypes"), + (0.6219, "The user believes unit tests are a waste of time for prototypes"), ( - 0.6152, + 0.6154, "Subject: code_quality | Type: belief | Observation: The user believes code reviews are essential for quality | Quote: Code reviews catch bugs that automated testing misses", ), - (0.6036, "The user believes code reviews are essential for quality"), + (0.6031, "The user believes code reviews are essential for quality"), ], "temporal": [ ( - 0.5881, - "Time: 12:00 on Wednesday (afternoon) | Subject: testing_philosophy | Observation: The user believes unit tests are a waste of time for prototypes | Confidence: 0.8", + 0.568, + "Time: 12:00 on Wednesday (afternoon) | Subject: testing_philosophy | Observation: The user believes unit tests are a waste of time for prototypes", ), ( - 0.5074, - "Time: 12:00 on Wednesday (afternoon) | Subject: code_quality | Observation: The user believes code reviews are essential for quality | Confidence: 0.8", + 0.4901, + "Time: 12:00 on Wednesday 
(afternoon) | Subject: code_quality | Observation: The user believes code reviews are essential for quality", ), ( - 0.4863, - "Time: 12:00 on Wednesday (afternoon) | Subject: programming_philosophy | Observation: The user believes functional programming leads to better code quality | Confidence: 0.8", + 0.4745, + "Time: 12:00 on Wednesday (afternoon) | Subject: programming_philosophy | Observation: The user believes functional programming leads to better code quality", ), ( - 0.4748, - "Time: 12:00 on Wednesday (afternoon) | Subject: debugging_approach | Observation: The user debugs by adding print statements rather than using a debugger | Confidence: 0.8", + 0.4524, + "Time: 12:00 on Wednesday (afternoon) | Subject: debugging_approach | Observation: The user debugs by adding print statements rather than using a debugger", ), ], }, "How does the user approach documentation?": { "semantic": [ ( - 0.5966, + 0.597, "Subject: documentation_habits | Type: behavior | Observation: The user always writes documentation before implementing features | Quote: I document the API design before writing any code", ), ( - 0.5473, + 0.5462, "The user always writes documentation before implementing features", ), - (0.5207, "I document the API design before writing any code"), + (0.5213, "I document the API design before writing any code"), ( - 0.4954, + 0.4949, "Subject: debugging_approach | Type: behavior | Observation: The user debugs by adding print statements rather than using a debugger | Quote: When debugging, I just add console.log everywhere", ), ], "temporal": [ ( - 0.4988, - "Time: 12:00 on Wednesday (afternoon) | Subject: documentation_habits | Observation: The user always writes documentation before implementing features | Confidence: 0.8", + 0.5001, + "Time: 12:00 on Wednesday (afternoon) | Subject: documentation_habits | Observation: The user always writes documentation before implementing features", ), ( - 0.4335, - "Time: 12:00 on Wednesday (afternoon) | Subject: 
indentation_preference | Observation: The user claims to prefer tabs but their code uses spaces | Confidence: 0.8", + 0.4371, + "Time: 12:00 on Wednesday (afternoon) | Subject: version_control_style | Observation: The user prefers small, focused commits over large feature branches", ), ( - 0.4316, - "Time: 12:00 on Wednesday (afternoon) | Subject: debugging_approach | Observation: The user debugs by adding print statements rather than using a debugger | Confidence: 0.8", + 0.4355, + "Time: 12:00 on Wednesday (afternoon) | Subject: indentation_preference | Observation: The user claims to prefer tabs but their code uses spaces", ), ( - 0.4307, - "Time: 12:00 on Wednesday (afternoon) | Subject: domain_preference | Observation: The user prefers working on backend systems over frontend UI | Confidence: 0.8", + 0.4347, + "Time: 12:00 on Wednesday (afternoon) | Subject: domain_preference | Observation: The user prefers working on backend systems over frontend UI", ), ], }, "What are the user's collaboration preferences?": { "semantic": [ ( - 0.651, + 0.6516, "Subject: collaboration_preference | Type: preference | Observation: The user prefers pair programming for complex problems | Quote: Two heads are better than one when solving hard problems", ), - (0.5848, "The user prefers pair programming for complex problems"), + (0.5855, "The user prefers pair programming for complex problems"), ( - 0.5355, + 0.5361, "Subject: version_control_style | Type: preference | Observation: The user prefers small, focused commits over large feature branches | Quote: I like to commit small, logical changes frequently", ), ( - 0.5216, + 0.522, "Subject: domain_preference | Type: preference | Observation: The user prefers working on backend systems over frontend UI | Quote: I find backend logic more interesting than UI work", ), ], "temporal": [ ( - 0.6027, - "Time: 12:00 on Wednesday (afternoon) | Subject: collaboration_preference | Observation: The user prefers pair programming for complex 
problems | Confidence: 0.8", + 0.5889, + "Time: 12:00 on Wednesday (afternoon) | Subject: collaboration_preference | Observation: The user prefers pair programming for complex problems", ), ( - 0.5101, - "Time: 12:00 on Wednesday (afternoon) | Subject: version_control_style | Observation: The user prefers small, focused commits over large feature branches | Confidence: 0.8", + 0.502, + "Time: 12:00 on Wednesday (afternoon) | Subject: version_control_style | Observation: The user prefers small, focused commits over large feature branches", ), ( - 0.482, - "Time: 12:00 on Wednesday (afternoon) | Subject: domain_preference | Observation: The user prefers working on backend systems over frontend UI | Confidence: 0.8", + 0.4754, + "Time: 12:00 on Wednesday (afternoon) | Subject: domain_preference | Observation: The user prefers working on backend systems over frontend UI", ), ( - 0.4782, - "Time: 12:00 on Wednesday (afternoon) | Subject: work_environment | Observation: The user thinks remote work is more productive than office work | Confidence: 0.8", + 0.4638, + "Time: 12:00 on Wednesday (afternoon) | Subject: work_environment | Observation: The user thinks remote work is more productive than office work", ), ], }, "What does the user think about remote work?": { "semantic": [ - (0.7063, "The user thinks remote work is more productive than office work"), + (0.7054, "The user thinks remote work is more productive than office work"), ( - 0.6583, + 0.6581, "Subject: work_environment | Type: belief | Observation: The user thinks remote work is more productive than office work | Quote: I get more done working from home", ), - (0.6032, "I get more done working from home"), - (0.4997, "The user prefers working on backend systems over frontend UI"), + (0.6026, "I get more done working from home"), + (0.4991, "The user prefers working on backend systems over frontend UI"), ], "temporal": [ ( - 0.5934, - "Time: 12:00 on Wednesday (afternoon) | Subject: work_environment | 
Observation: The user thinks remote work is more productive than office work | Confidence: 0.8", + 0.5832, + "Time: 12:00 on Wednesday (afternoon) | Subject: work_environment | Observation: The user thinks remote work is more productive than office work", ), ( - 0.4173, - "Time: 12:00 on Wednesday (afternoon) | Subject: work_schedule | Observation: The user prefers working late at night | Confidence: 0.8", + 0.4126, + "Time: 12:00 on Wednesday (afternoon) | Subject: testing_philosophy | Observation: The user believes unit tests are a waste of time for prototypes", ), ( - 0.4148, - "Time: 12:00 on Wednesday (afternoon) | Subject: collaboration_preference | Observation: The user prefers pair programming for complex problems | Confidence: 0.8", + 0.4122, + "Time: 12:00 on Wednesday (afternoon) | Subject: collaboration_preference | Observation: The user prefers pair programming for complex problems", ), ( - 0.4121, - "Time: 12:00 on Wednesday (afternoon) | Subject: testing_philosophy | Observation: The user believes unit tests are a waste of time for prototypes | Confidence: 0.8", + 0.4092, + "Time: 12:00 on Wednesday (afternoon) | Subject: domain_preference | Observation: The user prefers working on backend systems over frontend UI", ), ], }, "What are the user's productivity methods?": { "semantic": [ ( - 0.5723, + 0.5729, "Subject: productivity_methods | Type: behavior | Observation: The user takes breaks every 25 minutes using the Pomodoro technique | Quote: I use 25-minute work intervals with 5-minute breaks", ), ( @@ -781,140 +781,140 @@ EXPECTED_OBSERVATION_RESULTS = { "The user takes breaks every 25 minutes using the Pomodoro technique", ), (0.5205, "I use 25-minute work intervals with 5-minute breaks"), - (0.5107, "The user thinks remote work is more productive than office work"), + (0.512, "The user thinks remote work is more productive than office work"), ], "temporal": [ ( - 0.5427, - "Time: 12:00 on Wednesday (afternoon) | Subject: productivity_methods | 
Observation: The user takes breaks every 25 minutes using the Pomodoro technique | Confidence: 0.8", + 0.5312, + "Time: 12:00 on Wednesday (afternoon) | Subject: productivity_methods | Observation: The user takes breaks every 25 minutes using the Pomodoro technique", ), ( - 0.4743, - "Time: 12:00 on Wednesday (afternoon) | Subject: work_environment | Observation: The user thinks remote work is more productive than office work | Confidence: 0.8", + 0.4796, + "Time: 12:00 on Wednesday (afternoon) | Subject: work_environment | Observation: The user thinks remote work is more productive than office work", ), ( - 0.4299, - "Time: 12:00 on Wednesday (afternoon) | Subject: collaboration_preference | Observation: The user prefers pair programming for complex problems | Confidence: 0.8", + 0.4344, + "Time: 12:00 on Wednesday (afternoon) | Subject: collaboration_preference | Observation: The user prefers pair programming for complex problems", ), ( - 0.4227, - "Time: 12:00 on Wednesday (afternoon) | Subject: version_control_style | Observation: The user prefers small, focused commits over large feature branches | Confidence: 0.8", + 0.429, + "Time: 12:00 on Wednesday (afternoon) | Subject: refactoring | Observation: The user always refactors to pure functions", ), ], }, "What technical skills is the user learning?": { "semantic": [ - (0.5765, "The user is currently learning Rust in their spare time"), + (0.5766, "The user is currently learning Rust in their spare time"), ( - 0.5502, + 0.55, "Subject: learning_activities | Type: general | Observation: The user is currently learning Rust in their spare time | Quote: I'm picking up Rust on weekends", ), - (0.5411, "I'm picking up Rust on weekends"), - (0.5155, "The user primarily works with Python and JavaScript"), + (0.5415, "I'm picking up Rust on weekends"), + (0.5156, "The user primarily works with Python and JavaScript"), ], "temporal": [ ( - 0.5301, - "Time: 12:00 on Wednesday (afternoon) | Subject: learning_activities | 
Observation: The user is currently learning Rust in their spare time | Confidence: 0.8", + 0.5221, + "Time: 12:00 on Wednesday (afternoon) | Subject: learning_activities | Observation: The user is currently learning Rust in their spare time", ), ( - 0.4913, - "Time: 12:00 on Wednesday (afternoon) | Subject: primary_languages | Observation: The user primarily works with Python and JavaScript | Confidence: 0.8", + 0.492, + "Time: 12:00 on Wednesday (afternoon) | Subject: primary_languages | Observation: The user primarily works with Python and JavaScript", ), ( - 0.481, - "Time: 12:00 on Wednesday (afternoon) | Subject: experience_level | Observation: The user has 8 years of professional programming experience | Confidence: 0.8", + 0.4871, + "Time: 12:00 on Wednesday (afternoon) | Subject: experience_level | Observation: The user has 8 years of professional programming experience", ), ( - 0.4558, - "Time: 12:00 on Wednesday (afternoon) | Subject: education_background | Observation: The user graduated with a Computer Science degree from Stanford | Confidence: 0.8", + 0.4547, + "Time: 12:00 on Wednesday (afternoon) | Subject: education_background | Observation: The user graduated with a Computer Science degree from Stanford", ), ], }, "What does the user think about cooking?": { "semantic": [ - (0.4888, "I find backend logic more interesting than UI work"), - (0.4624, "The user prefers working on backend systems over frontend UI"), + (0.4893, "I find backend logic more interesting than UI work"), + (0.4621, "The user prefers working on backend systems over frontend UI"), ( 0.4551, "The user believes functional programming leads to better code quality", ), - (0.4547, "The user said pure functions are yucky"), + (0.4549, "The user said pure functions are yucky"), ], "temporal": [ ( - 0.3812, - "Time: 12:00 on Wednesday (afternoon) | Subject: pure_functions | Observation: The user said pure functions are yucky | Confidence: 0.8", + 0.3785, + "Time: 12:00 on Wednesday 
(afternoon) | Subject: pure_functions | Observation: The user said pure functions are yucky", ), ( - 0.3773, - "Time: 12:00 on Wednesday (afternoon) | Subject: programming_philosophy | Observation: The user believes functional programming leads to better code quality | Confidence: 0.8", + 0.3759, + "Time: 12:00 on Wednesday (afternoon) | Subject: programming_philosophy | Observation: The user believes functional programming leads to better code quality", ), ( - 0.3686, - "Time: 12:00 on Wednesday (afternoon) | Subject: typescript_opinion | Observation: The user now says they love TypeScript but previously called it verbose | Confidence: 0.8", + 0.375, + "Time: 12:00 on Wednesday (afternoon) | Subject: typescript_opinion | Observation: The user now says they love TypeScript but previously called it verbose", ), ( - 0.3649, - "Time: 12:00 on Wednesday (afternoon) | Subject: domain_preference | Observation: The user prefers working on backend systems over frontend UI | Confidence: 0.8", + 0.3594, + "Time: 12:00 on Wednesday (afternoon) | Subject: domain_preference | Observation: The user prefers working on backend systems over frontend UI", ), ], }, "What are the user's travel preferences?": { "semantic": [ ( - 0.522, + 0.523, "Subject: domain_preference | Type: preference | Observation: The user prefers working on backend systems over frontend UI | Quote: I find backend logic more interesting than UI work", ), - (0.5145, "The user prefers functional programming over OOP"), - (0.5079, "The user prefers working on backend systems over frontend UI"), - (0.5045, "The user prefers working late at night"), + (0.5143, "The user prefers functional programming over OOP"), + (0.5074, "The user prefers working on backend systems over frontend UI"), + (0.5049, "The user prefers working late at night"), ], "temporal": [ ( - 0.4849, - "Time: 12:00 on Wednesday (afternoon) | Subject: domain_preference | Observation: The user prefers working on backend systems over frontend UI | 
Confidence: 0.8", + 0.4767, + "Time: 12:00 on Wednesday (afternoon) | Subject: domain_preference | Observation: The user prefers working on backend systems over frontend UI", ), ( - 0.4779, - "Time: 12:00 on Wednesday (afternoon) | Subject: database_preference | Observation: The user prefers PostgreSQL over MongoDB for most applications | Confidence: 0.8", + 0.4748, + "Time: 12:00 on Wednesday (afternoon) | Subject: database_preference | Observation: The user prefers PostgreSQL over MongoDB for most applications", ), ( - 0.4659, - "Time: 12:00 on Wednesday (afternoon) | Subject: collaboration_preference | Observation: The user prefers pair programming for complex problems | Confidence: 0.8", + 0.4587, + "Time: 12:00 on Wednesday (afternoon) | Subject: programming_paradigms | Observation: The user prefers functional programming over OOP", ), ( - 0.4639, - "Time: 12:00 on Wednesday (afternoon) | Subject: programming_paradigms | Observation: The user prefers functional programming over OOP | Confidence: 0.8", + 0.4554, + "Time: 12:00 on Wednesday (afternoon) | Subject: collaboration_preference | Observation: The user prefers pair programming for complex problems", ), ], }, "What music does the user like?": { "semantic": [ ( - 0.4927, + 0.4933, "Subject: domain_preference | Type: preference | Observation: The user prefers working on backend systems over frontend UI | Quote: I find backend logic more interesting than UI work", ), (0.4906, "The user prefers working late at night"), - (0.4904, "The user prefers functional programming over OOP"), + (0.4902, "The user prefers functional programming over OOP"), (0.4894, "The user primarily works with Python and JavaScript"), ], "temporal": [ ( - 0.4674, - "Time: 12:00 on Wednesday (afternoon) | Subject: typescript_opinion | Observation: The user now says they love TypeScript but previously called it verbose | Confidence: 0.8", + 0.4676, + "Time: 12:00 on Wednesday (afternoon) | Subject: typescript_opinion | Observation: The 
user now says they love TypeScript but previously called it verbose", ), ( - 0.4548, - "Time: 12:00 on Wednesday (afternoon) | Subject: primary_languages | Observation: The user primarily works with Python and JavaScript | Confidence: 0.8", + 0.4561, + "Time: 12:00 on Wednesday (afternoon) | Subject: primary_languages | Observation: The user primarily works with Python and JavaScript", ), ( - 0.4518, - "Time: 12:00 on Wednesday (afternoon) | Subject: programming_paradigms | Observation: The user prefers functional programming over OOP | Confidence: 0.8", + 0.4471, + "Time: 12:00 on Wednesday (afternoon) | Subject: programming_paradigms | Observation: The user prefers functional programming over OOP", ), ( - 0.4496, - "Time: 12:00 on Wednesday (afternoon) | Subject: editor_preference | Observation: The user prefers Vim over VS Code for editing | Confidence: 0.8", + 0.4432, + "Time: 12:00 on Wednesday (afternoon) | Subject: editor_preference | Observation: The user prefers Vim over VS Code for editing", ), ], }, @@ -1076,7 +1076,6 @@ def test_real_observation_embeddings(real_voyage_client, qdrant): tags=["bla"], observation_type=observation_type, subject=subject, - confidence=0.8, evidence={ "quote": quote, "source": "https://en.wikipedia.org/wiki/Human", @@ -1096,6 +1095,7 @@ def test_real_observation_embeddings(real_voyage_client, qdrant): ] for item in items: + item.update_confidences({"observation_accuracy": 0.8}) embed_source_item(item) push_to_qdrant(items) diff --git a/tests/memory/common/db/models/test_source_item_embeddings.py b/tests/memory/common/db/models/test_source_item_embeddings.py index 13a2257..579c7eb 100644 --- a/tests/memory/common/db/models/test_source_item_embeddings.py +++ b/tests/memory/common/db/models/test_source_item_embeddings.py @@ -583,7 +583,6 @@ def test_agent_observation_embeddings(mock_voyage_client): tags=["bla"], observation_type="belief", subject="humans", - confidence=0.8, evidence={ "quote": "All humans are mortal.", "source": 
"https://en.wikipedia.org/wiki/Human", @@ -591,6 +590,7 @@ def test_agent_observation_embeddings(mock_voyage_client): agent_model="gpt-4o", inserted_at=datetime(2025, 1, 1, 12, 0, 0), ) + item.update_confidences({"observation_accuracy": 0.8}) metadata = item.as_payload() metadata["tags"] = {"bla"} expected = [ @@ -600,7 +600,7 @@ def test_agent_observation_embeddings(mock_voyage_client): metadata | {"embedding_type": "semantic"}, ), ( - "Time: 12:00 on Wednesday (afternoon) | Subject: humans | Observation: The user thinks that all men must die. | Confidence: 0.8", + "Time: 12:00 on Wednesday (afternoon) | Subject: humans | Observation: The user thinks that all men must die.", [], metadata | {"embedding_type": "temporal"}, ), @@ -625,7 +625,7 @@ def test_agent_observation_embeddings(mock_voyage_client): assert mock_voyage_client.embed.call_args == call( [ "Subject: humans | Type: belief | Observation: The user thinks that all men must die. | Quote: All humans are mortal.", - "Time: 12:00 on Wednesday (afternoon) | Subject: humans | Observation: The user thinks that all men must die. 
| Confidence: 0.8", + "Time: 12:00 on Wednesday (afternoon) | Subject: humans | Observation: The user thinks that all men must die.", "The user thinks that all men must die.", "All humans are mortal.", ], diff --git a/tests/memory/common/db/models/test_source_items.py b/tests/memory/common/db/models/test_source_items.py index 1770026..8d838e4 100644 --- a/tests/memory/common/db/models/test_source_items.py +++ b/tests/memory/common/db/models/test_source_items.py @@ -499,7 +499,7 @@ def test_blog_post_chunk_contents_with_image_long_content(tmp_path, default_chun "size": None, "observation_type": "preference", "subject": "programming preferences", - "confidence": 0.9, + "confidence": {"observation_accuracy": 0.9}, "evidence": { "quote": "I really like Python", "context": "discussion about languages", @@ -513,7 +513,7 @@ def test_blog_post_chunk_contents_with_image_long_content(tmp_path, default_chun "size": None, "observation_type": "preference", "subject": "programming preferences", - "confidence": 0.9, + "confidence": {"observation_accuracy": 0.9}, "evidence": { "quote": "I really like Python", "context": "discussion about languages", @@ -531,7 +531,7 @@ def test_blog_post_chunk_contents_with_image_long_content(tmp_path, default_chun "size": None, "observation_type": "preference", "subject": "programming preferences", - "confidence": 0.9, + "confidence": {"observation_accuracy": 0.9}, "evidence": { "quote": "I really like Python", "context": "discussion about languages", @@ -546,7 +546,7 @@ def test_blog_post_chunk_contents_with_image_long_content(tmp_path, default_chun "size": None, "observation_type": "preference", "subject": "programming preferences", - "confidence": 0.9, + "confidence": {"observation_accuracy": 0.9}, "evidence": { "quote": "I really like Python", "context": "discussion about languages", @@ -565,7 +565,7 @@ def test_blog_post_chunk_contents_with_image_long_content(tmp_path, default_chun "size": None, "observation_type": "preference", "subject": 
"programming preferences", - "confidence": 0.9, + "confidence": {"observation_accuracy": 0.9}, "evidence": { "quote": "I really like Python", "context": "discussion about languages", @@ -580,7 +580,7 @@ def test_blog_post_chunk_contents_with_image_long_content(tmp_path, default_chun "size": None, "observation_type": "preference", "subject": "programming preferences", - "confidence": 0.9, + "confidence": {"observation_accuracy": 0.9}, "evidence": { "quote": "I really like Python", "context": "discussion about languages", @@ -603,7 +603,6 @@ def test_agent_observation_data_chunks( content="User prefers Python over JavaScript", subject="programming preferences", observation_type="preference", - confidence=0.9, evidence={ "quote": "I really like Python", "context": "discussion about languages", @@ -612,6 +611,7 @@ def test_agent_observation_data_chunks( session_id=session_id, tags=observation_tags, ) + observation.update_confidences({"observation_accuracy": 0.9}) # Set inserted_at using object.__setattr__ to bypass SQLAlchemy restrictions object.__setattr__(observation, "inserted_at", datetime(2023, 1, 1, 12, 0, 0)) @@ -634,7 +634,7 @@ def test_agent_observation_data_chunks( assert cast(str, semantic_chunk.collection_name) == "semantic" temporal_chunk = result[1] - expected_temporal_text = "Time: 12:00 on Sunday (afternoon) | Subject: programming preferences | Observation: User prefers Python over JavaScript | Confidence: 0.9" + expected_temporal_text = "Time: 12:00 on Sunday (afternoon) | Subject: programming preferences | Observation: User prefers Python over JavaScript" assert temporal_chunk.data == [expected_temporal_text] # Add session_id to expected metadata and remove tags if empty @@ -654,11 +654,11 @@ def test_agent_observation_data_chunks_with_none_values(): content="Content", subject="subject", observation_type="belief", - confidence=0.7, evidence=None, agent_model="gpt-4", session_id=None, ) + observation.update_confidences({"observation_accuracy": 0.7}) 
object.__setattr__(observation, "inserted_at", datetime(2023, 2, 15, 9, 30, 0)) result = observation.data_chunks() @@ -671,7 +671,7 @@ def test_agent_observation_data_chunks_with_none_values(): assert [i.data for i in result] == [ ["Subject: subject | Type: belief | Observation: Content"], [ - "Time: 09:30 on Wednesday (morning) | Subject: subject | Observation: Content | Confidence: 0.7" + "Time: 09:30 on Wednesday (morning) | Subject: subject | Observation: Content" ], ["Content"], ] @@ -684,11 +684,11 @@ def test_agent_observation_data_chunks_merge_metadata_behavior(): content="test", subject="test", observation_type="test", - confidence=0.8, evidence={}, agent_model="test", tags=["base_tag"], # Set base tags so they appear in both chunks ) + observation.update_confidences({"observation_accuracy": 0.9}) object.__setattr__(observation, "inserted_at", datetime.now()) # Test that metadata merging preserves original values and adds new ones @@ -723,11 +723,10 @@ def test_note_data_chunks(subject, content, expected): content=content, subject=subject, note_type="quicky", - confidence=0.9, size=123, tags=["bla"], ) - + note.update_confidences({"observation_accuracy": 0.9}) chunks = note.data_chunks() assert [chunk.content for chunk in chunks] == expected for chunk in chunks: @@ -736,7 +735,7 @@ def test_note_data_chunks(subject, content, expected): if cast(str, chunk.content) == "test summary": tags |= {"tag1", "tag2"} assert chunk.item_metadata == { - "confidence": 0.9, + "confidence": {"observation_accuracy": 0.9}, "note_type": "quicky", "size": 123, "source_id": None, diff --git a/tests/memory/common/formatters/test_observation.py b/tests/memory/common/formatters/test_observation.py index 044aa5a..e15634f 100644 --- a/tests/memory/common/formatters/test_observation.py +++ b/tests/memory/common/formatters/test_observation.py @@ -123,11 +123,10 @@ def test_generate_temporal_text_time_periods(hour: int, expected_period: str): result = generate_temporal_text( 
subject="test_subject", content="test_content", - confidence=0.8, created_at=test_date, ) time_str = test_date.strftime("%H:%M") - expected = f"Time: {time_str} on Monday ({expected_period}) | Subject: test_subject | Observation: test_content | Confidence: 0.8" + expected = f"Time: {time_str} on Monday ({expected_period}) | Subject: test_subject | Observation: test_content" assert result == expected @@ -146,7 +145,7 @@ def test_generate_temporal_text_time_periods(hour: int, expected_period: str): def test_generate_temporal_text_days_of_week(weekday: int, day_name: str): test_date = datetime(2024, 1, 15 + weekday, 10, 30) result = generate_temporal_text( - subject="subject", content="content", confidence=0.5, created_at=test_date + subject="subject", content="content", created_at=test_date ) assert f"on {day_name}" in result @@ -157,10 +156,8 @@ def test_generate_temporal_text_confidence_values(confidence: float): result = generate_temporal_text( subject="subject", content="content", - confidence=confidence, created_at=test_date, ) - assert f"Confidence: {confidence}" in result @pytest.mark.parametrize( @@ -180,7 +177,7 @@ def test_generate_temporal_text_boundary_cases( test_date: datetime, expected_period: str ): result = generate_temporal_text( - subject="subject", content="content", confidence=0.8, created_at=test_date + subject="subject", content="content", created_at=test_date ) assert f"({expected_period})" in result @@ -190,22 +187,16 @@ def test_generate_temporal_text_complete_format(): result = generate_temporal_text( subject="Important observation", content="User showed strong preference for X", - confidence=0.95, created_at=test_date, ) - expected = "Time: 14:45 on Friday (afternoon) | Subject: Important observation | Observation: User showed strong preference for X | Confidence: 0.95" + expected = "Time: 14:45 on Friday (afternoon) | Subject: Important observation | Observation: User showed strong preference for X" assert result == expected def 
test_generate_temporal_text_empty_strings(): test_date = datetime(2024, 1, 15, 10, 30) - result = generate_temporal_text( - subject="", content="", confidence=0.0, created_at=test_date - ) - assert ( - result - == "Time: 10:30 on Monday (morning) | Subject: | Observation: | Confidence: 0.0" - ) + result = generate_temporal_text(subject="", content="", created_at=test_date) + assert result == "Time: 10:30 on Monday (morning) | Subject: | Observation:" def test_generate_temporal_text_special_characters(): @@ -213,8 +204,7 @@ def test_generate_temporal_text_special_characters(): result = generate_temporal_text( subject="Subject with | pipe", content="Content with | pipe and @#$ symbols", - confidence=0.75, created_at=test_date, ) - expected = "Time: 15:20 on Monday (afternoon) | Subject: Subject with | pipe | Observation: Content with | pipe and @#$ symbols | Confidence: 0.75" + expected = "Time: 15:20 on Monday (afternoon) | Subject: Subject with | pipe | Observation: Content with | pipe and @#$ symbols" assert result == expected diff --git a/tests/memory/workers/tasks/test_notes_tasks.py b/tests/memory/workers/tasks/test_notes_tasks.py index 7a15f2c..92a0407 100644 --- a/tests/memory/workers/tasks/test_notes_tasks.py +++ b/tests/memory/workers/tasks/test_notes_tasks.py @@ -16,7 +16,7 @@ def mock_note_data(): "content": "This is test note content with enough text to be processed and embedded.", "filename": "test_note.md", "note_type": "observation", - "confidence": 0.8, + "confidences": {"observation_accuracy": 0.8}, "tags": ["test", "note"], } @@ -90,7 +90,7 @@ def test_sync_note_success(mock_note_data, db_session, qdrant): assert note.modality == "note" assert note.mime_type == "text/markdown" assert note.note_type == "observation" - assert float(note.confidence) == 0.8 # Convert Decimal to float for comparison + assert note.confidence_dict == {"observation_accuracy": 0.8} assert note.filename is not None assert note.tags == ["test", "note"] @@ -114,7 +114,7 @@ def 
test_sync_note_minimal_data(mock_minimal_note, db_session, qdrant): assert note.subject == "Minimal Note" assert note.content == "Minimal content" assert note.note_type is None - assert float(note.confidence) == 0.5 # Default value, convert Decimal to float + assert note.confidence_dict == {} assert note.tags == [] # Default empty list assert note.filename is not None and "Minimal Note.md" in note.filename @@ -205,6 +205,9 @@ def test_sync_note_edit(mock_note_data, db_session): embed_status="RAW", filename="test_note.md", ) + existing_note.update_confidences( + {"observation_accuracy": 0.2, "predictive_value": 0.3} + ) db_session.add(existing_note) db_session.commit() @@ -225,6 +228,10 @@ def test_sync_note_edit(mock_note_data, db_session): assert len(db_session.query(Note).all()) == 1 db_session.refresh(existing_note) assert existing_note.content == "bla bla bla" # type: ignore + assert existing_note.confidence_dict == { + "observation_accuracy": 0.8, + "predictive_value": 0.3, + } @pytest.mark.parametrize( @@ -242,14 +249,14 @@ def test_sync_note_parameters(note_type, confidence, tags, db_session, qdrant): subject=f"Test Note {note_type}", content="Test content for parameter testing", note_type=note_type, - confidence=confidence, + confidences={"observation_accuracy": confidence}, tags=tags, ) note = db_session.query(Note).filter_by(subject=f"Test Note {note_type}").first() assert note is not None assert note.note_type == note_type - assert float(note.confidence) == confidence # Convert Decimal to float + assert note.confidence_dict == {"observation_accuracy": confidence} assert note.tags == tags # Updated to match actual return format