proper chunks path

This commit is contained in:
Daniel O'Connell 2025-05-03 16:21:07 +02:00
parent 44de394eb1
commit c6cd809eb7
8 changed files with 106 additions and 82 deletions

3
.gitignore vendored
View File

@@ -1,3 +1,4 @@
memory_files
.env
.DS_Store
secrets/
@@ -13,4 +14,4 @@ __pycache__/
*.pyzw
docker-compose.override.yml
docker/pgadmin
docker/pgadmin

View File

@@ -2,21 +2,21 @@ version: "3.9"
# --------------------------------------------------------------------- networks
networks:
kbnet: # internal overlay NOT exposed
kbnet:
# internal overlay NOT exposed
driver: bridge
# --------------------------------------------------------------------- secrets
secrets:
postgres_password: {file: ./secrets/postgres_password.txt}
jwt_secret: {file: ./secrets/jwt_secret.txt}
openai_key: {file: ./secrets/openai_key.txt}
postgres_password: { file: ./secrets/postgres_password.txt }
jwt_secret: { file: ./secrets/jwt_secret.txt }
openai_key: { file: ./secrets/openai_key.txt }
# --------------------------------------------------------------------- volumes
volumes:
db_data: {} # Postgres
qdrant_data: {} # Qdrant
rabbitmq_data: {} # RabbitMQ
file_storage: {} # File storage
db_data: {} # Postgres
qdrant_data: {} # Qdrant
rabbitmq_data: {} # RabbitMQ
# ------------------------------ X-templates ----------------------------
x-common-env: &env
@@ -26,60 +26,60 @@ x-common-env: &env
FILE_STORAGE_DIR: /app/memory_files
TZ: "Etc/UTC"
x-worker-base: &worker-base
build:
build:
context: .
dockerfile: docker/workers/Dockerfile
restart: unless-stopped
networks: [kbnet]
security_opt: ["no-new-privileges=true"]
depends_on: [postgres, rabbitmq, qdrant]
env_file: [.env]
networks: [ kbnet ]
security_opt: [ "no-new-privileges=true" ]
depends_on: [ postgres, rabbitmq, qdrant ]
env_file: [ .env ]
environment: &worker-env
<<: *env
POSTGRES_PASSWORD_FILE: /run/secrets/postgres_password
# DSNs are built in worker entrypoint from user + pw files
QDRANT_URL: http://qdrant:6333
OPENAI_API_KEY_FILE: /run/secrets/openai_key
secrets: [postgres_password, openai_key]
secrets: [ postgres_password, openai_key ]
read_only: true
tmpfs: [/tmp,/var/tmp]
cap_drop: [ALL]
tmpfs: [ /tmp, /var/tmp ]
cap_drop: [ ALL ]
volumes:
- file_storage:/app/memory_files:rw
- ./memory_files:/app/memory_files:rw
logging:
options: {max-size: "10m", max-file: "3"}
options: { max-size: "10m", max-file: "3" }
user: kb
# ================================ SERVICES ============================
services:
# ----------------------------------------------------------------- data layer
# ----------------------------------------------------------------- data layer
postgres:
image: postgres:15
restart: unless-stopped
networks: [kbnet]
networks: [ kbnet ]
environment:
<<: *env
POSTGRES_USER: kb
POSTGRES_PASSWORD_FILE: /run/secrets/postgres_password
POSTGRES_DB: kb
secrets: [postgres_password]
secrets: [ postgres_password ]
volumes:
- db_data:/var/lib/postgresql/data:rw
healthcheck:
test: ["CMD-SHELL", "pg_isready -U kb"]
test: [ "CMD-SHELL", "pg_isready -U kb" ]
interval: 10s
timeout: 5s
retries: 5
mem_limit: 4g
cpus: "1.5"
security_opt: ["no-new-privileges=true"]
security_opt: [ "no-new-privileges=true" ]
rabbitmq:
image: rabbitmq:3.13-management
restart: unless-stopped
networks: [kbnet]
networks: [ kbnet ]
environment:
<<: *env
RABBITMQ_DEFAULT_USER: "kb"
@@ -87,20 +87,21 @@ services:
volumes:
- rabbitmq_data:/var/lib/rabbitmq:rw
healthcheck:
test: ["CMD", "rabbitmq-diagnostics", "ping"]
test: [ "CMD", "rabbitmq-diagnostics", "ping" ]
interval: 15s
timeout: 5s
retries: 5
mem_limit: 512m
cpus: "0.5"
security_opt: ["no-new-privileges=true"]
ports: # UI only on localhost
security_opt: [ "no-new-privileges=true" ]
ports:
# UI only on localhost
- "127.0.0.1:15672:15672"
qdrant:
image: qdrant/qdrant:v1.14.0
restart: unless-stopped
networks: [kbnet]
networks: [ kbnet ]
volumes:
- qdrant_data:/qdrant/storage:rw
tmpfs:
@@ -108,16 +109,16 @@ services:
- /var/tmp
- /qdrant/snapshots:rw
healthcheck:
test: ["CMD", "wget", "-q", "-T", "2", "-O", "-", "localhost:6333/ready"]
test: [ "CMD", "wget", "-q", "-T", "2", "-O", "-", "localhost:6333/ready" ]
interval: 15s
timeout: 5s
retries: 5
mem_limit: 4g
cpus: "2"
security_opt: ["no-new-privileges=true"]
cap_drop: [ALL]
security_opt: [ "no-new-privileges=true" ]
cap_drop: [ ALL ]
# ------------------------------------------------------------ API / gateway
# ------------------------------------------------------------ API / gateway
# api:
# build:
# context: .
@@ -148,7 +149,7 @@ services:
traefik:
image: traefik:v3.0
restart: unless-stopped
networks: [kbnet]
networks: [ kbnet ]
command:
- "--providers.docker=true"
- "--providers.docker.network=kbnet"
@@ -166,79 +167,79 @@ services:
- /var/run/docker.sock:/var/run/docker.sock:ro
# - ./acme.json:/acme.json:rw
# ------------------------------------------------------------ Celery workers
# ------------------------------------------------------------ Celery workers
worker-email:
<<: *worker-base
environment:
<<: *worker-env
QUEUES: "email"
deploy: {resources: {limits: {cpus: "2", memory: 3g}}}
deploy: { resources: { limits: { cpus: "2", memory: 3g } } }
worker-text:
<<: *worker-base
environment:
<<: *worker-env
QUEUES: "medium_embed"
deploy: {resources: {limits: {cpus: "2", memory: 3g}}}
deploy: { resources: { limits: { cpus: "2", memory: 3g } } }
worker-photo:
<<: *worker-base
environment:
<<: *worker-env
QUEUES: "photo_embed"
deploy: {resources: {limits: {cpus: "4", memory: 4g}}}
deploy: { resources: { limits: { cpus: "4", memory: 4g } } }
worker-ocr:
<<: *worker-base
environment:
<<: *worker-env
QUEUES: "low_ocr"
deploy: {resources: {limits: {cpus: "4", memory: 4g}}}
deploy: { resources: { limits: { cpus: "4", memory: 4g } } }
worker-git:
<<: *worker-base
environment:
<<: *worker-env
QUEUES: "git_summary"
deploy: {resources: {limits: {cpus: "1", memory: 1g}}}
deploy: { resources: { limits: { cpus: "1", memory: 1g } } }
worker-rss:
<<: *worker-base
environment:
<<: *worker-env
QUEUES: "rss"
deploy: {resources: {limits: {cpus: "0.5", memory: 512m}}}
deploy: { resources: { limits: { cpus: "0.5", memory: 512m } } }
worker-docs:
<<: *worker-base
environment:
<<: *worker-env
QUEUES: "docs"
deploy: {resources: {limits: {cpus: "1", memory: 1g}}}
deploy: { resources: { limits: { cpus: "1", memory: 1g } } }
ingest-hub:
<<: *worker-base
build:
build:
context: .
dockerfile: docker/ingest_hub/Dockerfile
environment:
<<: *worker-env
volumes:
- file_storage:/app/memory_files:rw
- ./memory_files:/app/memory_files:rw
tmpfs:
- /tmp
- /var/tmp
- /var/log/supervisor
- /var/run/supervisor
deploy: {resources: {limits: {cpus: "0.5", memory: 512m}}}
deploy: { resources: { limits: { cpus: "0.5", memory: 512m } } }
# ------------------------------------------------------------ watchtower (auto-update)
# ------------------------------------------------------------ watchtower (auto-update)
watchtower:
image: containrrr/watchtower
restart: unless-stopped
command: ["--schedule", "0 0 4 * * *", "--cleanup"]
volumes: ["/var/run/docker.sock:/var/run/docker.sock:ro"]
networks: [kbnet]
command: [ "--schedule", "0 0 4 * * *", "--cleanup" ]
volumes: [ "/var/run/docker.sock:/var/run/docker.sock:ro" ]
networks: [ kbnet ]
# ------------------------------------------------------------------- profiles: observability (opt-in)
# services:

View File

@@ -21,8 +21,11 @@ RUN chmod +x entry.sh
# Create required tmpfs directories for supervisor
RUN mkdir -p /var/log/supervisor /var/run/supervisor
# Create storage directory
RUN mkdir -p /app/memory_files
# Create user and set permissions
RUN useradd -m kb && chown -R kb /app /var/log/supervisor /var/run/supervisor
RUN useradd -m kb && chown -R kb /app /var/log/supervisor /var/run/supervisor /app/memory_files
USER kb
# Default queues to process

View File

@@ -17,6 +17,8 @@ RUN apt-get update && apt-get install -y \
COPY docker/workers/entry.sh ./entry.sh
RUN chmod +x entry.sh
RUN mkdir -p /app/memory_files
# Create user and set permissions
RUN useradd -m kb && chown -R kb /app
USER kb

View File

@@ -128,13 +128,13 @@ def embed_page(page: dict[str, Any]) -> list[Vector]:
def write_to_file(chunk_id: str, item: extract.MulitmodalChunk) -> pathlib.Path:
if isinstance(item, str):
filename = settings.FILE_STORAGE_DIR / f"{chunk_id}.txt"
filename = settings.CHUNK_STORAGE_DIR / f"{chunk_id}.txt"
filename.write_text(item)
elif isinstance(item, bytes):
filename = settings.FILE_STORAGE_DIR / f"{chunk_id}.bin"
filename = settings.CHUNK_STORAGE_DIR / f"{chunk_id}.bin"
filename.write_bytes(item)
elif isinstance(item, Image.Image):
filename = settings.FILE_STORAGE_DIR / f"{chunk_id}.png"
filename = settings.CHUNK_STORAGE_DIR / f"{chunk_id}.png"
item.save(filename)
else:
raise ValueError(f"Unsupported content type: {type(item)}")
@@ -156,13 +156,13 @@ def make_chunk(
content = "\n\n".join(contents)
model = settings.TEXT_EMBEDDING_MODEL
elif len(contents) == 1:
filename = (write_to_file(chunk_id, contents[0]),)
filename = write_to_file(chunk_id, contents[0]).absolute().as_posix()
model = settings.MIXED_EMBEDDING_MODEL
else:
for i, item in enumerate(contents):
write_to_file(f"{chunk_id}_{i}", item)
model = settings.MIXED_EMBEDDING_MODEL
filename = (settings.FILE_STORAGE_DIR / f"{chunk_id}_*",)
filename = (settings.CHUNK_STORAGE_DIR / f"{chunk_id}_*").absolute().as_posix()
return Chunk(
id=chunk_id,

View File

@@ -4,9 +4,11 @@ from dotenv import load_dotenv
load_dotenv()
def boolean_env(key: str, default: bool = False) -> bool:
return os.getenv(key, "0").lower() in ("1", "true", "yes")
# Database settings
DB_USER = os.getenv("DB_USER", "kb")
if password_file := os.getenv("POSTGRES_PASSWORD_FILE"):
@@ -18,23 +20,33 @@ DB_HOST = os.getenv("DB_HOST", "postgres")
DB_PORT = os.getenv("DB_PORT", "5432")
DB_NAME = os.getenv("DB_NAME", "kb")
def make_db_url(user=DB_USER, password=DB_PASSWORD, host=DB_HOST, port=DB_PORT, db=DB_NAME):
def make_db_url(
user=DB_USER, password=DB_PASSWORD, host=DB_HOST, port=DB_PORT, db=DB_NAME
):
return f"postgresql://{user}:{password}@{host}:{port}/{db}"
DB_URL = os.getenv("DATABASE_URL", make_db_url())
FILE_STORAGE_DIR = pathlib.Path(os.getenv("FILE_STORAGE_DIR", "/tmp/memory_files"))
FILE_STORAGE_DIR.mkdir(parents=True, exist_ok=True)
CHUNK_STORAGE_DIR = pathlib.Path(
os.getenv("CHUNK_STORAGE_DIR", FILE_STORAGE_DIR / "chunks")
)
CHUNK_STORAGE_DIR.mkdir(parents=True, exist_ok=True)
# Maximum attachment size to store directly in the database (10MB)
MAX_INLINE_ATTACHMENT_SIZE = int(os.getenv("MAX_INLINE_ATTACHMENT_SIZE", 1 * 1024 * 1024))
MAX_INLINE_ATTACHMENT_SIZE = int(
os.getenv("MAX_INLINE_ATTACHMENT_SIZE", 1 * 1024 * 1024)
)
# Qdrant settings
QDRANT_HOST = os.getenv("QDRANT_HOST", "qdrant")
QDRANT_PORT = int(os.getenv("QDRANT_PORT", "6333"))
QDRANT_GRPC_PORT = int(os.getenv("QDRANT_GRPC_PORT", "6334"))
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY", None)
QDRANT_PREFER_GRPC = boolean_env("QDRANT_PREFER_GRPC", False)
QDRANT_PREFER_GRPC = boolean_env("QDRANT_PREFER_GRPC", False)
QDRANT_TIMEOUT = int(os.getenv("QDRANT_TIMEOUT", "60"))

View File

@@ -55,7 +55,7 @@ def drop_test_database(test_db_name: str) -> None:
with admin_engine.connect() as conn:
conn.execute(text("COMMIT")) # Close any open transaction
# Terminate all connections to the database
conn.execute(
text(
@@ -67,10 +67,10 @@ def drop_test_database(test_db_name: str) -> None:
"""
)
)
# Drop the database
conn.execute(text(f"DROP DATABASE IF EXISTS {test_db_name}"))
admin_engine.dispose()
@@ -199,8 +199,11 @@ def email_provider():
@pytest.fixture(autouse=True)
def mock_file_storage(tmp_path: Path):
chunk_storage_dir = tmp_path / "chunks"
chunk_storage_dir.mkdir(parents=True, exist_ok=True)
with patch("memory.common.settings.FILE_STORAGE_DIR", tmp_path):
yield
with patch("memory.common.settings.CHUNK_STORAGE_DIR", chunk_storage_dir):
yield
@pytest.fixture
@@ -215,4 +218,4 @@ def qdrant():
@pytest.fixture(autouse=True)
def mock_voyage_client():
with patch.object(voyageai, "Client", autospec=True) as mock_client:
yield mock_client()
yield mock_client()

View File

@@ -1,18 +1,20 @@
import uuid
import pytest
from unittest.mock import Mock, patch
from PIL import Image
import pathlib
import uuid
from unittest.mock import Mock, patch
import pytest
from PIL import Image
from memory.common import settings
from memory.common.embedding import (
get_modality,
embed_text,
embed,
embed_file,
embed_mixed,
embed_page,
embed,
write_to_file,
embed_text,
get_modality,
make_chunk,
write_to_file,
)
@@ -116,7 +118,7 @@ def test_write_to_file_text(mock_file_storage):
file_path = write_to_file(chunk_id, content)
assert file_path == settings.FILE_STORAGE_DIR / f"{chunk_id}.txt"
assert file_path == settings.CHUNK_STORAGE_DIR / f"{chunk_id}.txt"
assert file_path.exists()
assert file_path.read_text() == content
@@ -128,7 +130,7 @@ def test_write_to_file_bytes(mock_file_storage):
file_path = write_to_file(chunk_id, content)
assert file_path == settings.FILE_STORAGE_DIR / f"{chunk_id}.bin"
assert file_path == settings.CHUNK_STORAGE_DIR / f"{chunk_id}.bin"
assert file_path.exists()
assert file_path.read_bytes() == content
@@ -140,7 +142,7 @@ def test_write_to_file_image(mock_file_storage):
file_path = write_to_file(chunk_id, img)
assert file_path == settings.FILE_STORAGE_DIR / f"{chunk_id}.png"
assert file_path == settings.CHUNK_STORAGE_DIR / f"{chunk_id}.png"
assert file_path.exists()
# Verify it's a valid image file by opening it
image = Image.open(file_path)
@@ -192,8 +194,8 @@ def test_make_chunk_single_image(mock_file_storage, db_session):
assert chunk.id == "00000000-0000-0000-0000-000000000002"
assert chunk.content is None
assert chunk.file_path == (
settings.FILE_STORAGE_DIR / "00000000-0000-0000-0000-000000000002.png",
assert chunk.file_path == str(
settings.CHUNK_STORAGE_DIR / "00000000-0000-0000-0000-000000000002.png",
)
assert chunk.embedding_model == settings.MIXED_EMBEDDING_MODEL
assert chunk.vector == vector
@@ -217,8 +219,8 @@ def test_make_chunk_mixed_content(mock_file_storage, db_session):
assert chunk.id == "00000000-0000-0000-0000-000000000003"
assert chunk.content is None
assert chunk.file_path == (
settings.FILE_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_*",
assert chunk.file_path == str(
settings.CHUNK_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_*",
)
assert chunk.embedding_model == settings.MIXED_EMBEDDING_MODEL
assert chunk.vector == vector
@@ -226,8 +228,8 @@ def test_make_chunk_mixed_content(mock_file_storage, db_session):
# Verify the files exist
assert (
settings.FILE_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_0.txt"
settings.CHUNK_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_0.txt"
).exists()
assert (
settings.FILE_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_1.png"
settings.CHUNK_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_1.png"
).exists()