From c6cd809eb7c7ec9458251c9a385f385556603f72 Mon Sep 17 00:00:00 2001 From: Daniel O'Connell Date: Sat, 3 May 2025 16:21:07 +0200 Subject: [PATCH] proper chunks path --- .gitignore | 3 +- docker-compose.yaml | 99 ++++++++++++++------------- docker/ingest_hub/Dockerfile | 5 +- docker/workers/Dockerfile | 2 + src/memory/common/embedding.py | 10 +-- src/memory/common/settings.py | 20 ++++-- tests/conftest.py | 13 ++-- tests/memory/common/test_embedding.py | 36 +++++----- 8 files changed, 106 insertions(+), 82 deletions(-) diff --git a/.gitignore b/.gitignore index 4af5d5d..0331bf8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +memory_files .env .DS_Store secrets/ @@ -13,4 +14,4 @@ __pycache__/ *.pyzw docker-compose.override.yml -docker/pgadmin \ No newline at end of file +docker/pgadmin diff --git a/docker-compose.yaml b/docker-compose.yaml index dd1f07a..25a16e4 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -2,21 +2,21 @@ version: "3.9" # --------------------------------------------------------------------- networks networks: - kbnet: # internal overlay – NOT exposed + kbnet: + # internal overlay – NOT exposed driver: bridge # --------------------------------------------------------------------- secrets secrets: - postgres_password: {file: ./secrets/postgres_password.txt} - jwt_secret: {file: ./secrets/jwt_secret.txt} - openai_key: {file: ./secrets/openai_key.txt} + postgres_password: { file: ./secrets/postgres_password.txt } + jwt_secret: { file: ./secrets/jwt_secret.txt } + openai_key: { file: ./secrets/openai_key.txt } # --------------------------------------------------------------------- volumes volumes: - db_data: {} # Postgres - qdrant_data: {} # Qdrant - rabbitmq_data: {} # RabbitMQ - file_storage: {} # File storage + db_data: {} # Postgres + qdrant_data: {} # Qdrant + rabbitmq_data: {} # RabbitMQ # ------------------------------ X-templates ---------------------------- x-common-env: &env @@ -26,60 +26,60 @@ x-common-env: &env FILE_STORAGE_DIR: /app/memory_files TZ: "Etc/UTC" - x-worker-base: &worker-base - build: + build: context: . 
dockerfile: docker/workers/Dockerfile restart: unless-stopped - networks: [kbnet] - security_opt: ["no-new-privileges=true"] - depends_on: [postgres, rabbitmq, qdrant] - env_file: [.env] + networks: [ kbnet ] + security_opt: [ "no-new-privileges=true" ] + depends_on: [ postgres, rabbitmq, qdrant ] + env_file: [ .env ] environment: &worker-env <<: *env POSTGRES_PASSWORD_FILE: /run/secrets/postgres_password # DSNs are built in worker entrypoint from user + pw files QDRANT_URL: http://qdrant:6333 OPENAI_API_KEY_FILE: /run/secrets/openai_key - secrets: [postgres_password, openai_key] + secrets: [ postgres_password, openai_key ] read_only: true - tmpfs: [/tmp,/var/tmp] - cap_drop: [ALL] + tmpfs: [ /tmp, /var/tmp ] + cap_drop: [ ALL ] volumes: - - file_storage:/app/memory_files:rw + - ./memory_files:/app/memory_files:rw logging: - options: {max-size: "10m", max-file: "3"} + options: { max-size: "10m", max-file: "3" } + user: kb # ================================ SERVICES ============================ services: -# ----------------------------------------------------------------- data layer + # ----------------------------------------------------------------- data layer postgres: image: postgres:15 restart: unless-stopped - networks: [kbnet] + networks: [ kbnet ] environment: <<: *env POSTGRES_USER: kb POSTGRES_PASSWORD_FILE: /run/secrets/postgres_password POSTGRES_DB: kb - secrets: [postgres_password] + secrets: [ postgres_password ] volumes: - db_data:/var/lib/postgresql/data:rw healthcheck: - test: ["CMD-SHELL", "pg_isready -U kb"] + test: [ "CMD-SHELL", "pg_isready -U kb" ] interval: 10s timeout: 5s retries: 5 mem_limit: 4g cpus: "1.5" - security_opt: ["no-new-privileges=true"] + security_opt: [ "no-new-privileges=true" ] rabbitmq: image: rabbitmq:3.13-management restart: unless-stopped - networks: [kbnet] + networks: [ kbnet ] environment: <<: *env RABBITMQ_DEFAULT_USER: "kb" @@ -87,20 +87,21 @@ services: volumes: - rabbitmq_data:/var/lib/rabbitmq:rw healthcheck: - test: ["CMD", "rabbitmq-diagnostics", "ping"] + test: [ "CMD", "rabbitmq-diagnostics", "ping" ] interval: 15s timeout: 5s retries: 5 mem_limit: 512m cpus: "0.5" - security_opt: ["no-new-privileges=true"] - ports: # UI only on localhost + security_opt: [ "no-new-privileges=true" ] + ports: + # UI only on localhost - "127.0.0.1:15672:15672" qdrant: image: qdrant/qdrant:v1.14.0 restart: unless-stopped - networks: [kbnet] + networks: [ kbnet ] volumes: - qdrant_data:/qdrant/storage:rw tmpfs: @@ -108,16 +109,16 @@ services: - /var/tmp - /qdrant/snapshots:rw healthcheck: - test: ["CMD", "wget", "-q", "-T", "2", "-O", "-", "localhost:6333/ready"] + test: [ "CMD", "wget", "-q", "-T", "2", "-O", "-", "localhost:6333/ready" ] interval: 15s timeout: 5s retries: 5 mem_limit: 4g cpus: "2" - security_opt: ["no-new-privileges=true"] - cap_drop: [ALL] + security_opt: [ "no-new-privileges=true" ] + cap_drop: [ ALL ] -# ------------------------------------------------------------ API / gateway + # ------------------------------------------------------------ API / gateway # api: # build: # context: . 
@@ -148,7 +149,7 @@ services: traefik: image: traefik:v3.0 restart: unless-stopped - networks: [kbnet] + networks: [ kbnet ] command: - "--providers.docker=true" - "--providers.docker.network=kbnet" @@ -166,79 +167,79 @@ services: - /var/run/docker.sock:/var/run/docker.sock:ro # - ./acme.json:/acme.json:rw -# ------------------------------------------------------------ Celery workers + # ------------------------------------------------------------ Celery workers worker-email: <<: *worker-base environment: <<: *worker-env QUEUES: "email" - deploy: {resources: {limits: {cpus: "2", memory: 3g}}} + deploy: { resources: { limits: { cpus: "2", memory: 3g } } } worker-text: <<: *worker-base environment: <<: *worker-env QUEUES: "medium_embed" - deploy: {resources: {limits: {cpus: "2", memory: 3g}}} + deploy: { resources: { limits: { cpus: "2", memory: 3g } } } worker-photo: <<: *worker-base environment: <<: *worker-env QUEUES: "photo_embed" - deploy: {resources: {limits: {cpus: "4", memory: 4g}}} + deploy: { resources: { limits: { cpus: "4", memory: 4g } } } worker-ocr: <<: *worker-base environment: <<: *worker-env QUEUES: "low_ocr" - deploy: {resources: {limits: {cpus: "4", memory: 4g}}} + deploy: { resources: { limits: { cpus: "4", memory: 4g } } } worker-git: <<: *worker-base environment: <<: *worker-env QUEUES: "git_summary" - deploy: {resources: {limits: {cpus: "1", memory: 1g}}} + deploy: { resources: { limits: { cpus: "1", memory: 1g } } } worker-rss: <<: *worker-base environment: <<: *worker-env QUEUES: "rss" - deploy: {resources: {limits: {cpus: "0.5", memory: 512m}}} + deploy: { resources: { limits: { cpus: "0.5", memory: 512m } } } worker-docs: <<: *worker-base environment: <<: *worker-env QUEUES: "docs" - deploy: {resources: {limits: {cpus: "1", memory: 1g}}} + deploy: { resources: { limits: { cpus: "1", memory: 1g } } } ingest-hub: <<: *worker-base - build: + build: context: . 
dockerfile: docker/ingest_hub/Dockerfile environment: <<: *worker-env volumes: - - file_storage:/app/memory_files:rw + - ./memory_files:/app/memory_files:rw tmpfs: - /tmp - /var/tmp - /var/log/supervisor - /var/run/supervisor - deploy: {resources: {limits: {cpus: "0.5", memory: 512m}}} + deploy: { resources: { limits: { cpus: "0.5", memory: 512m } } } -# ------------------------------------------------------------ watchtower (auto-update) + # ------------------------------------------------------------ watchtower (auto-update) watchtower: image: containrrr/watchtower restart: unless-stopped - command: ["--schedule", "0 0 4 * * *", "--cleanup"] - volumes: ["/var/run/docker.sock:/var/run/docker.sock:ro"] - networks: [kbnet] + command: [ "--schedule", "0 0 4 * * *", "--cleanup" ] + volumes: [ "/var/run/docker.sock:/var/run/docker.sock:ro" ] + networks: [ kbnet ] # ------------------------------------------------------------------- profiles: observability (opt-in) # services: diff --git a/docker/ingest_hub/Dockerfile b/docker/ingest_hub/Dockerfile index f2ef5e6..88d8981 100644 --- a/docker/ingest_hub/Dockerfile +++ b/docker/ingest_hub/Dockerfile @@ -21,8 +21,11 @@ RUN chmod +x entry.sh # Create required tmpfs directories for supervisor RUN mkdir -p /var/log/supervisor /var/run/supervisor +# Create storage directory +RUN mkdir -p /app/memory_files + # Create user and set permissions -RUN useradd -m kb && chown -R kb /app /var/log/supervisor /var/run/supervisor +RUN useradd -m kb && chown -R kb /app /var/log/supervisor /var/run/supervisor /app/memory_files USER kb # Default queues to process diff --git a/docker/workers/Dockerfile b/docker/workers/Dockerfile index c88d6eb..644eda7 100644 --- a/docker/workers/Dockerfile +++ b/docker/workers/Dockerfile @@ -17,6 +17,8 @@ RUN apt-get update && apt-get install -y \ COPY docker/workers/entry.sh ./entry.sh RUN chmod +x entry.sh +RUN mkdir -p /app/memory_files + # Create user and set permissions RUN useradd -m kb && chown -R kb /app USER kb diff --git a/src/memory/common/embedding.py b/src/memory/common/embedding.py index 5aed808..97f5fe7 100644 --- a/src/memory/common/embedding.py +++ b/src/memory/common/embedding.py @@ -128,13 +128,13 @@ def embed_page(page: dict[str, Any]) -> list[Vector]: def write_to_file(chunk_id: str, item: extract.MulitmodalChunk) -> pathlib.Path: if isinstance(item, str): - filename = settings.FILE_STORAGE_DIR / f"{chunk_id}.txt" + filename = settings.CHUNK_STORAGE_DIR / f"{chunk_id}.txt" filename.write_text(item) elif isinstance(item, bytes): - filename = settings.FILE_STORAGE_DIR / f"{chunk_id}.bin" + filename = settings.CHUNK_STORAGE_DIR / f"{chunk_id}.bin" filename.write_bytes(item) elif isinstance(item, Image.Image): - filename = settings.FILE_STORAGE_DIR / f"{chunk_id}.png" + filename = settings.CHUNK_STORAGE_DIR / f"{chunk_id}.png" item.save(filename) else: raise ValueError(f"Unsupported content type: {type(item)}") @@ -156,13 +156,13 @@ def make_chunk( content = "\n\n".join(contents) model = settings.TEXT_EMBEDDING_MODEL elif len(contents) == 1: - filename = (write_to_file(chunk_id, contents[0]),) + filename = write_to_file(chunk_id, contents[0]).absolute().as_posix() model = settings.MIXED_EMBEDDING_MODEL else: for i, item in enumerate(contents): write_to_file(f"{chunk_id}_{i}", item) model = settings.MIXED_EMBEDDING_MODEL - filename = (settings.FILE_STORAGE_DIR / f"{chunk_id}_*",) + filename = (settings.CHUNK_STORAGE_DIR / f"{chunk_id}_*").absolute().as_posix() return Chunk( id=chunk_id, diff --git 
a/src/memory/common/settings.py b/src/memory/common/settings.py index 9c5d8a9..0fa634c 100644 --- a/src/memory/common/settings.py +++ b/src/memory/common/settings.py @@ -4,9 +4,11 @@ from dotenv import load_dotenv load_dotenv() + def boolean_env(key: str, default: bool = False) -> bool: return os.getenv(key, "0").lower() in ("1", "true", "yes") + # Database settings DB_USER = os.getenv("DB_USER", "kb") if password_file := os.getenv("POSTGRES_PASSWORD_FILE"): @@ -18,23 +20,33 @@ DB_HOST = os.getenv("DB_HOST", "postgres") DB_PORT = os.getenv("DB_PORT", "5432") DB_NAME = os.getenv("DB_NAME", "kb") -def make_db_url(user=DB_USER, password=DB_PASSWORD, host=DB_HOST, port=DB_PORT, db=DB_NAME): + +def make_db_url( + user=DB_USER, password=DB_PASSWORD, host=DB_HOST, port=DB_PORT, db=DB_NAME +): return f"postgresql://{user}:{password}@{host}:{port}/{db}" + + DB_URL = os.getenv("DATABASE_URL", make_db_url()) FILE_STORAGE_DIR = pathlib.Path(os.getenv("FILE_STORAGE_DIR", "/tmp/memory_files")) FILE_STORAGE_DIR.mkdir(parents=True, exist_ok=True) - +CHUNK_STORAGE_DIR = pathlib.Path( + os.getenv("CHUNK_STORAGE_DIR", FILE_STORAGE_DIR / "chunks") +) +CHUNK_STORAGE_DIR.mkdir(parents=True, exist_ok=True) # Maximum attachment size to store directly in the database (10MB) -MAX_INLINE_ATTACHMENT_SIZE = int(os.getenv("MAX_INLINE_ATTACHMENT_SIZE", 1 * 1024 * 1024)) +MAX_INLINE_ATTACHMENT_SIZE = int( + os.getenv("MAX_INLINE_ATTACHMENT_SIZE", 1 * 1024 * 1024) +) # Qdrant settings QDRANT_HOST = os.getenv("QDRANT_HOST", "qdrant") QDRANT_PORT = int(os.getenv("QDRANT_PORT", "6333")) QDRANT_GRPC_PORT = int(os.getenv("QDRANT_GRPC_PORT", "6334")) QDRANT_API_KEY = os.getenv("QDRANT_API_KEY", None) -QDRANT_PREFER_GRPC = boolean_env("QDRANT_PREFER_GRPC", False) +QDRANT_PREFER_GRPC = boolean_env("QDRANT_PREFER_GRPC", False) QDRANT_TIMEOUT = int(os.getenv("QDRANT_TIMEOUT", "60")) diff --git a/tests/conftest.py b/tests/conftest.py index e59e5b5..cb93b27 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -55,7 +55,7 @@ def drop_test_database(test_db_name: str) -> None: with admin_engine.connect() as conn: conn.execute(text("COMMIT")) # Close any open transaction - + # Terminate all connections to the database conn.execute( text( @@ -67,10 +67,10 @@ def drop_test_database(test_db_name: str) -> None: """ ) ) - + # Drop the database conn.execute(text(f"DROP DATABASE IF EXISTS {test_db_name}")) - + admin_engine.dispose() @@ -199,8 +199,11 @@ def email_provider(): @pytest.fixture(autouse=True) def mock_file_storage(tmp_path: Path): + chunk_storage_dir = tmp_path / "chunks" + chunk_storage_dir.mkdir(parents=True, exist_ok=True) with patch("memory.common.settings.FILE_STORAGE_DIR", tmp_path): - yield + with patch("memory.common.settings.CHUNK_STORAGE_DIR", chunk_storage_dir): + yield @pytest.fixture @@ -215,4 +218,4 @@ def qdrant(): @pytest.fixture(autouse=True) def mock_voyage_client(): with patch.object(voyageai, "Client", autospec=True) as mock_client: - yield mock_client() \ No newline at end of file + yield mock_client() diff --git a/tests/memory/common/test_embedding.py b/tests/memory/common/test_embedding.py index a619806..f177ad9 100644 --- a/tests/memory/common/test_embedding.py +++ b/tests/memory/common/test_embedding.py @@ -1,18 +1,20 @@ -import uuid -import pytest -from unittest.mock import Mock, patch -from PIL import Image import pathlib +import uuid +from unittest.mock import Mock, patch + +import pytest +from PIL import Image + from memory.common import settings from memory.common.embedding import ( - get_modality, 
- embed_text, + embed, embed_file, embed_mixed, embed_page, - embed, - write_to_file, + embed_text, + get_modality, make_chunk, + write_to_file, ) @@ -116,7 +118,7 @@ def test_write_to_file_text(mock_file_storage): file_path = write_to_file(chunk_id, content) - assert file_path == settings.FILE_STORAGE_DIR / f"{chunk_id}.txt" + assert file_path == settings.CHUNK_STORAGE_DIR / f"{chunk_id}.txt" assert file_path.exists() assert file_path.read_text() == content @@ -128,7 +130,7 @@ def test_write_to_file_bytes(mock_file_storage): file_path = write_to_file(chunk_id, content) - assert file_path == settings.FILE_STORAGE_DIR / f"{chunk_id}.bin" + assert file_path == settings.CHUNK_STORAGE_DIR / f"{chunk_id}.bin" assert file_path.exists() assert file_path.read_bytes() == content @@ -140,7 +142,7 @@ def test_write_to_file_image(mock_file_storage): file_path = write_to_file(chunk_id, img) - assert file_path == settings.FILE_STORAGE_DIR / f"{chunk_id}.png" + assert file_path == settings.CHUNK_STORAGE_DIR / f"{chunk_id}.png" assert file_path.exists() # Verify it's a valid image file by opening it image = Image.open(file_path) @@ -192,8 +194,8 @@ def test_make_chunk_single_image(mock_file_storage, db_session): assert chunk.id == "00000000-0000-0000-0000-000000000002" assert chunk.content is None - assert chunk.file_path == ( - settings.FILE_STORAGE_DIR / "00000000-0000-0000-0000-000000000002.png", + assert chunk.file_path == str( + settings.CHUNK_STORAGE_DIR / "00000000-0000-0000-0000-000000000002.png", ) assert chunk.embedding_model == settings.MIXED_EMBEDDING_MODEL assert chunk.vector == vector @@ -217,8 +219,8 @@ def test_make_chunk_mixed_content(mock_file_storage, db_session): assert chunk.id == "00000000-0000-0000-0000-000000000003" assert chunk.content is None - assert chunk.file_path == ( - settings.FILE_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_*", + assert chunk.file_path == str( + settings.CHUNK_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_*", ) assert chunk.embedding_model == settings.MIXED_EMBEDDING_MODEL assert chunk.vector == vector @@ -226,8 +228,8 @@ def test_make_chunk_mixed_content(mock_file_storage, db_session): # Verify the files exist assert ( - settings.FILE_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_0.txt" + settings.CHUNK_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_0.txt" ).exists() assert ( - settings.FILE_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_1.png" + settings.CHUNK_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_1.png" ).exists()
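
A minimal sketch of the chunk layout this change produces, assuming the defaults from src/memory/common/settings.py (FILE_STORAGE_DIR mapped to ./memory_files on the host and /app/memory_files in the containers, CHUNK_STORAGE_DIR defaulting to FILE_STORAGE_DIR / "chunks"). The standalone write_to_file below mirrors embedding.write_to_file for illustration; the "demo-chunk" id and the host-side path are purely illustrative.

    import pathlib
    from PIL import Image

    # Host-side view of the bind mount; inside the workers this is /app/memory_files.
    FILE_STORAGE_DIR = pathlib.Path("./memory_files")
    # Chunks now live in a dedicated subdirectory instead of the storage root.
    CHUNK_STORAGE_DIR = FILE_STORAGE_DIR / "chunks"
    CHUNK_STORAGE_DIR.mkdir(parents=True, exist_ok=True)

    def write_to_file(chunk_id: str, item) -> pathlib.Path:
        # Same type dispatch as embedding.write_to_file: extension chosen by content type.
        if isinstance(item, str):
            path = CHUNK_STORAGE_DIR / f"{chunk_id}.txt"
            path.write_text(item)
        elif isinstance(item, bytes):
            path = CHUNK_STORAGE_DIR / f"{chunk_id}.bin"
            path.write_bytes(item)
        elif isinstance(item, Image.Image):
            path = CHUNK_STORAGE_DIR / f"{chunk_id}.png"
            item.save(path)
        else:
            raise ValueError(f"Unsupported content type: {type(item)}")
        return path

    # Single non-text item: Chunk.file_path stores the absolute POSIX path as a string.
    # Multiple items: each is written as <chunk_id>_<i>.<ext> and file_path stores the
    # "<chunk_id>_*" glob under CHUNK_STORAGE_DIR.
    print(write_to_file("demo-chunk", "some text").absolute().as_posix())

With this patch, Chunk.file_path is a plain string (an absolute path or a <chunk_id>_* glob) rather than a one-element tuple, which is what the updated assertions in tests/memory/common/test_embedding.py check.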