proper chunks path

This commit is contained in:
Daniel O'Connell 2025-05-03 16:21:07 +02:00
parent 44de394eb1
commit c6cd809eb7
8 changed files with 106 additions and 82 deletions

1
.gitignore vendored
View File

@ -1,3 +1,4 @@
memory_files
.env .env
.DS_Store .DS_Store
secrets/ secrets/

View File

@ -2,21 +2,21 @@ version: "3.9"
# --------------------------------------------------------------------- networks # --------------------------------------------------------------------- networks
networks: networks:
kbnet: # internal overlay NOT exposed kbnet:
# internal overlay NOT exposed
driver: bridge driver: bridge
# --------------------------------------------------------------------- secrets # --------------------------------------------------------------------- secrets
secrets: secrets:
postgres_password: {file: ./secrets/postgres_password.txt} postgres_password: { file: ./secrets/postgres_password.txt }
jwt_secret: {file: ./secrets/jwt_secret.txt} jwt_secret: { file: ./secrets/jwt_secret.txt }
openai_key: {file: ./secrets/openai_key.txt} openai_key: { file: ./secrets/openai_key.txt }
# --------------------------------------------------------------------- volumes # --------------------------------------------------------------------- volumes
volumes: volumes:
db_data: {} # Postgres db_data: {} # Postgres
qdrant_data: {} # Qdrant qdrant_data: {} # Qdrant
rabbitmq_data: {} # RabbitMQ rabbitmq_data: {} # RabbitMQ
file_storage: {} # File storage
# ------------------------------ X-templates ---------------------------- # ------------------------------ X-templates ----------------------------
x-common-env: &env x-common-env: &env
@ -26,60 +26,60 @@ x-common-env: &env
FILE_STORAGE_DIR: /app/memory_files FILE_STORAGE_DIR: /app/memory_files
TZ: "Etc/UTC" TZ: "Etc/UTC"
x-worker-base: &worker-base x-worker-base: &worker-base
build: build:
context: . context: .
dockerfile: docker/workers/Dockerfile dockerfile: docker/workers/Dockerfile
restart: unless-stopped restart: unless-stopped
networks: [kbnet] networks: [ kbnet ]
security_opt: ["no-new-privileges=true"] security_opt: [ "no-new-privileges=true" ]
depends_on: [postgres, rabbitmq, qdrant] depends_on: [ postgres, rabbitmq, qdrant ]
env_file: [.env] env_file: [ .env ]
environment: &worker-env environment: &worker-env
<<: *env <<: *env
POSTGRES_PASSWORD_FILE: /run/secrets/postgres_password POSTGRES_PASSWORD_FILE: /run/secrets/postgres_password
# DSNs are built in worker entrypoint from user + pw files # DSNs are built in worker entrypoint from user + pw files
QDRANT_URL: http://qdrant:6333 QDRANT_URL: http://qdrant:6333
OPENAI_API_KEY_FILE: /run/secrets/openai_key OPENAI_API_KEY_FILE: /run/secrets/openai_key
secrets: [postgres_password, openai_key] secrets: [ postgres_password, openai_key ]
read_only: true read_only: true
tmpfs: [/tmp,/var/tmp] tmpfs: [ /tmp, /var/tmp ]
cap_drop: [ALL] cap_drop: [ ALL ]
volumes: volumes:
- file_storage:/app/memory_files:rw - ./memory_files:/app/memory_files:rw
logging: logging:
options: {max-size: "10m", max-file: "3"} options: { max-size: "10m", max-file: "3" }
user: kb
# ================================ SERVICES ============================ # ================================ SERVICES ============================
services: services:
# ----------------------------------------------------------------- data layer # ----------------------------------------------------------------- data layer
postgres: postgres:
image: postgres:15 image: postgres:15
restart: unless-stopped restart: unless-stopped
networks: [kbnet] networks: [ kbnet ]
environment: environment:
<<: *env <<: *env
POSTGRES_USER: kb POSTGRES_USER: kb
POSTGRES_PASSWORD_FILE: /run/secrets/postgres_password POSTGRES_PASSWORD_FILE: /run/secrets/postgres_password
POSTGRES_DB: kb POSTGRES_DB: kb
secrets: [postgres_password] secrets: [ postgres_password ]
volumes: volumes:
- db_data:/var/lib/postgresql/data:rw - db_data:/var/lib/postgresql/data:rw
healthcheck: healthcheck:
test: ["CMD-SHELL", "pg_isready -U kb"] test: [ "CMD-SHELL", "pg_isready -U kb" ]
interval: 10s interval: 10s
timeout: 5s timeout: 5s
retries: 5 retries: 5
mem_limit: 4g mem_limit: 4g
cpus: "1.5" cpus: "1.5"
security_opt: ["no-new-privileges=true"] security_opt: [ "no-new-privileges=true" ]
rabbitmq: rabbitmq:
image: rabbitmq:3.13-management image: rabbitmq:3.13-management
restart: unless-stopped restart: unless-stopped
networks: [kbnet] networks: [ kbnet ]
environment: environment:
<<: *env <<: *env
RABBITMQ_DEFAULT_USER: "kb" RABBITMQ_DEFAULT_USER: "kb"
@ -87,20 +87,21 @@ services:
volumes: volumes:
- rabbitmq_data:/var/lib/rabbitmq:rw - rabbitmq_data:/var/lib/rabbitmq:rw
healthcheck: healthcheck:
test: ["CMD", "rabbitmq-diagnostics", "ping"] test: [ "CMD", "rabbitmq-diagnostics", "ping" ]
interval: 15s interval: 15s
timeout: 5s timeout: 5s
retries: 5 retries: 5
mem_limit: 512m mem_limit: 512m
cpus: "0.5" cpus: "0.5"
security_opt: ["no-new-privileges=true"] security_opt: [ "no-new-privileges=true" ]
ports: # UI only on localhost ports:
# UI only on localhost
- "127.0.0.1:15672:15672" - "127.0.0.1:15672:15672"
qdrant: qdrant:
image: qdrant/qdrant:v1.14.0 image: qdrant/qdrant:v1.14.0
restart: unless-stopped restart: unless-stopped
networks: [kbnet] networks: [ kbnet ]
volumes: volumes:
- qdrant_data:/qdrant/storage:rw - qdrant_data:/qdrant/storage:rw
tmpfs: tmpfs:
@ -108,16 +109,16 @@ services:
- /var/tmp - /var/tmp
- /qdrant/snapshots:rw - /qdrant/snapshots:rw
healthcheck: healthcheck:
test: ["CMD", "wget", "-q", "-T", "2", "-O", "-", "localhost:6333/ready"] test: [ "CMD", "wget", "-q", "-T", "2", "-O", "-", "localhost:6333/ready" ]
interval: 15s interval: 15s
timeout: 5s timeout: 5s
retries: 5 retries: 5
mem_limit: 4g mem_limit: 4g
cpus: "2" cpus: "2"
security_opt: ["no-new-privileges=true"] security_opt: [ "no-new-privileges=true" ]
cap_drop: [ALL] cap_drop: [ ALL ]
# ------------------------------------------------------------ API / gateway # ------------------------------------------------------------ API / gateway
# api: # api:
# build: # build:
# context: . # context: .
@ -148,7 +149,7 @@ services:
traefik: traefik:
image: traefik:v3.0 image: traefik:v3.0
restart: unless-stopped restart: unless-stopped
networks: [kbnet] networks: [ kbnet ]
command: command:
- "--providers.docker=true" - "--providers.docker=true"
- "--providers.docker.network=kbnet" - "--providers.docker.network=kbnet"
@ -166,55 +167,55 @@ services:
- /var/run/docker.sock:/var/run/docker.sock:ro - /var/run/docker.sock:/var/run/docker.sock:ro
# - ./acme.json:/acme.json:rw # - ./acme.json:/acme.json:rw
# ------------------------------------------------------------ Celery workers # ------------------------------------------------------------ Celery workers
worker-email: worker-email:
<<: *worker-base <<: *worker-base
environment: environment:
<<: *worker-env <<: *worker-env
QUEUES: "email" QUEUES: "email"
deploy: {resources: {limits: {cpus: "2", memory: 3g}}} deploy: { resources: { limits: { cpus: "2", memory: 3g } } }
worker-text: worker-text:
<<: *worker-base <<: *worker-base
environment: environment:
<<: *worker-env <<: *worker-env
QUEUES: "medium_embed" QUEUES: "medium_embed"
deploy: {resources: {limits: {cpus: "2", memory: 3g}}} deploy: { resources: { limits: { cpus: "2", memory: 3g } } }
worker-photo: worker-photo:
<<: *worker-base <<: *worker-base
environment: environment:
<<: *worker-env <<: *worker-env
QUEUES: "photo_embed" QUEUES: "photo_embed"
deploy: {resources: {limits: {cpus: "4", memory: 4g}}} deploy: { resources: { limits: { cpus: "4", memory: 4g } } }
worker-ocr: worker-ocr:
<<: *worker-base <<: *worker-base
environment: environment:
<<: *worker-env <<: *worker-env
QUEUES: "low_ocr" QUEUES: "low_ocr"
deploy: {resources: {limits: {cpus: "4", memory: 4g}}} deploy: { resources: { limits: { cpus: "4", memory: 4g } } }
worker-git: worker-git:
<<: *worker-base <<: *worker-base
environment: environment:
<<: *worker-env <<: *worker-env
QUEUES: "git_summary" QUEUES: "git_summary"
deploy: {resources: {limits: {cpus: "1", memory: 1g}}} deploy: { resources: { limits: { cpus: "1", memory: 1g } } }
worker-rss: worker-rss:
<<: *worker-base <<: *worker-base
environment: environment:
<<: *worker-env <<: *worker-env
QUEUES: "rss" QUEUES: "rss"
deploy: {resources: {limits: {cpus: "0.5", memory: 512m}}} deploy: { resources: { limits: { cpus: "0.5", memory: 512m } } }
worker-docs: worker-docs:
<<: *worker-base <<: *worker-base
environment: environment:
<<: *worker-env <<: *worker-env
QUEUES: "docs" QUEUES: "docs"
deploy: {resources: {limits: {cpus: "1", memory: 1g}}} deploy: { resources: { limits: { cpus: "1", memory: 1g } } }
ingest-hub: ingest-hub:
<<: *worker-base <<: *worker-base
@ -224,21 +225,21 @@ services:
environment: environment:
<<: *worker-env <<: *worker-env
volumes: volumes:
- file_storage:/app/memory_files:rw - ./memory_files:/app/memory_files:rw
tmpfs: tmpfs:
- /tmp - /tmp
- /var/tmp - /var/tmp
- /var/log/supervisor - /var/log/supervisor
- /var/run/supervisor - /var/run/supervisor
deploy: {resources: {limits: {cpus: "0.5", memory: 512m}}} deploy: { resources: { limits: { cpus: "0.5", memory: 512m } } }
# ------------------------------------------------------------ watchtower (auto-update) # ------------------------------------------------------------ watchtower (auto-update)
watchtower: watchtower:
image: containrrr/watchtower image: containrrr/watchtower
restart: unless-stopped restart: unless-stopped
command: ["--schedule", "0 0 4 * * *", "--cleanup"] command: [ "--schedule", "0 0 4 * * *", "--cleanup" ]
volumes: ["/var/run/docker.sock:/var/run/docker.sock:ro"] volumes: [ "/var/run/docker.sock:/var/run/docker.sock:ro" ]
networks: [kbnet] networks: [ kbnet ]
# ------------------------------------------------------------------- profiles: observability (opt-in) # ------------------------------------------------------------------- profiles: observability (opt-in)
# services: # services:

View File

@ -21,8 +21,11 @@ RUN chmod +x entry.sh
# Create required tmpfs directories for supervisor # Create required tmpfs directories for supervisor
RUN mkdir -p /var/log/supervisor /var/run/supervisor RUN mkdir -p /var/log/supervisor /var/run/supervisor
# Create storage directory
RUN mkdir -p /app/memory_files
# Create user and set permissions # Create user and set permissions
RUN useradd -m kb && chown -R kb /app /var/log/supervisor /var/run/supervisor RUN useradd -m kb && chown -R kb /app /var/log/supervisor /var/run/supervisor /app/memory_files
USER kb USER kb
# Default queues to process # Default queues to process

View File

@ -17,6 +17,8 @@ RUN apt-get update && apt-get install -y \
COPY docker/workers/entry.sh ./entry.sh COPY docker/workers/entry.sh ./entry.sh
RUN chmod +x entry.sh RUN chmod +x entry.sh
RUN mkdir -p /app/memory_files
# Create user and set permissions # Create user and set permissions
RUN useradd -m kb && chown -R kb /app RUN useradd -m kb && chown -R kb /app
USER kb USER kb

View File

@ -128,13 +128,13 @@ def embed_page(page: dict[str, Any]) -> list[Vector]:
def write_to_file(chunk_id: str, item: extract.MulitmodalChunk) -> pathlib.Path: def write_to_file(chunk_id: str, item: extract.MulitmodalChunk) -> pathlib.Path:
if isinstance(item, str): if isinstance(item, str):
filename = settings.FILE_STORAGE_DIR / f"{chunk_id}.txt" filename = settings.CHUNK_STORAGE_DIR / f"{chunk_id}.txt"
filename.write_text(item) filename.write_text(item)
elif isinstance(item, bytes): elif isinstance(item, bytes):
filename = settings.FILE_STORAGE_DIR / f"{chunk_id}.bin" filename = settings.CHUNK_STORAGE_DIR / f"{chunk_id}.bin"
filename.write_bytes(item) filename.write_bytes(item)
elif isinstance(item, Image.Image): elif isinstance(item, Image.Image):
filename = settings.FILE_STORAGE_DIR / f"{chunk_id}.png" filename = settings.CHUNK_STORAGE_DIR / f"{chunk_id}.png"
item.save(filename) item.save(filename)
else: else:
raise ValueError(f"Unsupported content type: {type(item)}") raise ValueError(f"Unsupported content type: {type(item)}")
@ -156,13 +156,13 @@ def make_chunk(
content = "\n\n".join(contents) content = "\n\n".join(contents)
model = settings.TEXT_EMBEDDING_MODEL model = settings.TEXT_EMBEDDING_MODEL
elif len(contents) == 1: elif len(contents) == 1:
filename = (write_to_file(chunk_id, contents[0]),) filename = write_to_file(chunk_id, contents[0]).absolute().as_posix()
model = settings.MIXED_EMBEDDING_MODEL model = settings.MIXED_EMBEDDING_MODEL
else: else:
for i, item in enumerate(contents): for i, item in enumerate(contents):
write_to_file(f"{chunk_id}_{i}", item) write_to_file(f"{chunk_id}_{i}", item)
model = settings.MIXED_EMBEDDING_MODEL model = settings.MIXED_EMBEDDING_MODEL
filename = (settings.FILE_STORAGE_DIR / f"{chunk_id}_*",) filename = (settings.CHUNK_STORAGE_DIR / f"{chunk_id}_*").absolute().as_posix()
return Chunk( return Chunk(
id=chunk_id, id=chunk_id,

View File

@ -4,9 +4,11 @@ from dotenv import load_dotenv
load_dotenv() load_dotenv()
def boolean_env(key: str, default: bool = False) -> bool: def boolean_env(key: str, default: bool = False) -> bool:
return os.getenv(key, "0").lower() in ("1", "true", "yes") return os.getenv(key, "0").lower() in ("1", "true", "yes")
# Database settings # Database settings
DB_USER = os.getenv("DB_USER", "kb") DB_USER = os.getenv("DB_USER", "kb")
if password_file := os.getenv("POSTGRES_PASSWORD_FILE"): if password_file := os.getenv("POSTGRES_PASSWORD_FILE"):
@ -18,16 +20,26 @@ DB_HOST = os.getenv("DB_HOST", "postgres")
DB_PORT = os.getenv("DB_PORT", "5432") DB_PORT = os.getenv("DB_PORT", "5432")
DB_NAME = os.getenv("DB_NAME", "kb") DB_NAME = os.getenv("DB_NAME", "kb")
def make_db_url(user=DB_USER, password=DB_PASSWORD, host=DB_HOST, port=DB_PORT, db=DB_NAME):
def make_db_url(
user=DB_USER, password=DB_PASSWORD, host=DB_HOST, port=DB_PORT, db=DB_NAME
):
return f"postgresql://{user}:{password}@{host}:{port}/{db}" return f"postgresql://{user}:{password}@{host}:{port}/{db}"
DB_URL = os.getenv("DATABASE_URL", make_db_url()) DB_URL = os.getenv("DATABASE_URL", make_db_url())
FILE_STORAGE_DIR = pathlib.Path(os.getenv("FILE_STORAGE_DIR", "/tmp/memory_files")) FILE_STORAGE_DIR = pathlib.Path(os.getenv("FILE_STORAGE_DIR", "/tmp/memory_files"))
FILE_STORAGE_DIR.mkdir(parents=True, exist_ok=True) FILE_STORAGE_DIR.mkdir(parents=True, exist_ok=True)
CHUNK_STORAGE_DIR = pathlib.Path(
os.getenv("CHUNK_STORAGE_DIR", FILE_STORAGE_DIR / "chunks")
)
CHUNK_STORAGE_DIR.mkdir(parents=True, exist_ok=True)
# Maximum attachment size to store directly in the database (default: 1 MiB) # Maximum attachment size to store directly in the database (default: 1 MiB)
MAX_INLINE_ATTACHMENT_SIZE = int(os.getenv("MAX_INLINE_ATTACHMENT_SIZE", 1 * 1024 * 1024)) MAX_INLINE_ATTACHMENT_SIZE = int(
os.getenv("MAX_INLINE_ATTACHMENT_SIZE", 1 * 1024 * 1024)
)
# Qdrant settings # Qdrant settings
QDRANT_HOST = os.getenv("QDRANT_HOST", "qdrant") QDRANT_HOST = os.getenv("QDRANT_HOST", "qdrant")

View File

@ -199,8 +199,11 @@ def email_provider():
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
def mock_file_storage(tmp_path: Path): def mock_file_storage(tmp_path: Path):
chunk_storage_dir = tmp_path / "chunks"
chunk_storage_dir.mkdir(parents=True, exist_ok=True)
with patch("memory.common.settings.FILE_STORAGE_DIR", tmp_path): with patch("memory.common.settings.FILE_STORAGE_DIR", tmp_path):
yield with patch("memory.common.settings.CHUNK_STORAGE_DIR", chunk_storage_dir):
yield
@pytest.fixture @pytest.fixture

View File

@ -1,18 +1,20 @@
import uuid
import pytest
from unittest.mock import Mock, patch
from PIL import Image
import pathlib import pathlib
import uuid
from unittest.mock import Mock, patch
import pytest
from PIL import Image
from memory.common import settings from memory.common import settings
from memory.common.embedding import ( from memory.common.embedding import (
get_modality, embed,
embed_text,
embed_file, embed_file,
embed_mixed, embed_mixed,
embed_page, embed_page,
embed, embed_text,
write_to_file, get_modality,
make_chunk, make_chunk,
write_to_file,
) )
@ -116,7 +118,7 @@ def test_write_to_file_text(mock_file_storage):
file_path = write_to_file(chunk_id, content) file_path = write_to_file(chunk_id, content)
assert file_path == settings.FILE_STORAGE_DIR / f"{chunk_id}.txt" assert file_path == settings.CHUNK_STORAGE_DIR / f"{chunk_id}.txt"
assert file_path.exists() assert file_path.exists()
assert file_path.read_text() == content assert file_path.read_text() == content
@ -128,7 +130,7 @@ def test_write_to_file_bytes(mock_file_storage):
file_path = write_to_file(chunk_id, content) file_path = write_to_file(chunk_id, content)
assert file_path == settings.FILE_STORAGE_DIR / f"{chunk_id}.bin" assert file_path == settings.CHUNK_STORAGE_DIR / f"{chunk_id}.bin"
assert file_path.exists() assert file_path.exists()
assert file_path.read_bytes() == content assert file_path.read_bytes() == content
@ -140,7 +142,7 @@ def test_write_to_file_image(mock_file_storage):
file_path = write_to_file(chunk_id, img) file_path = write_to_file(chunk_id, img)
assert file_path == settings.FILE_STORAGE_DIR / f"{chunk_id}.png" assert file_path == settings.CHUNK_STORAGE_DIR / f"{chunk_id}.png"
assert file_path.exists() assert file_path.exists()
# Verify it's a valid image file by opening it # Verify it's a valid image file by opening it
image = Image.open(file_path) image = Image.open(file_path)
@ -192,8 +194,8 @@ def test_make_chunk_single_image(mock_file_storage, db_session):
assert chunk.id == "00000000-0000-0000-0000-000000000002" assert chunk.id == "00000000-0000-0000-0000-000000000002"
assert chunk.content is None assert chunk.content is None
assert chunk.file_path == ( assert chunk.file_path == str(
settings.FILE_STORAGE_DIR / "00000000-0000-0000-0000-000000000002.png", settings.CHUNK_STORAGE_DIR / "00000000-0000-0000-0000-000000000002.png",
) )
assert chunk.embedding_model == settings.MIXED_EMBEDDING_MODEL assert chunk.embedding_model == settings.MIXED_EMBEDDING_MODEL
assert chunk.vector == vector assert chunk.vector == vector
@ -217,8 +219,8 @@ def test_make_chunk_mixed_content(mock_file_storage, db_session):
assert chunk.id == "00000000-0000-0000-0000-000000000003" assert chunk.id == "00000000-0000-0000-0000-000000000003"
assert chunk.content is None assert chunk.content is None
assert chunk.file_path == ( assert chunk.file_path == str(
settings.FILE_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_*", settings.CHUNK_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_*",
) )
assert chunk.embedding_model == settings.MIXED_EMBEDDING_MODEL assert chunk.embedding_model == settings.MIXED_EMBEDDING_MODEL
assert chunk.vector == vector assert chunk.vector == vector
@ -226,8 +228,8 @@ def test_make_chunk_mixed_content(mock_file_storage, db_session):
# Verify the files exist # Verify the files exist
assert ( assert (
settings.FILE_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_0.txt" settings.CHUNK_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_0.txt"
).exists() ).exists()
assert ( assert (
settings.FILE_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_1.png" settings.CHUNK_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_1.png"
).exists() ).exists()