proper chunks path

This commit is contained in:
Daniel O'Connell 2025-05-03 16:21:07 +02:00
parent 44de394eb1
commit c6cd809eb7
8 changed files with 106 additions and 82 deletions

1
.gitignore vendored
View File

@ -1,3 +1,4 @@
memory_files
.env .env
.DS_Store .DS_Store
secrets/ secrets/

View File

@ -2,7 +2,8 @@ version: "3.9"
# --------------------------------------------------------------------- networks # --------------------------------------------------------------------- networks
networks: networks:
kbnet: # internal overlay NOT exposed kbnet:
# internal overlay NOT exposed
driver: bridge driver: bridge
# --------------------------------------------------------------------- secrets # --------------------------------------------------------------------- secrets
@ -16,7 +17,6 @@ volumes:
db_data: {} # Postgres db_data: {} # Postgres
qdrant_data: {} # Qdrant qdrant_data: {} # Qdrant
rabbitmq_data: {} # RabbitMQ rabbitmq_data: {} # RabbitMQ
file_storage: {} # File storage
# ------------------------------ X-templates ---------------------------- # ------------------------------ X-templates ----------------------------
x-common-env: &env x-common-env: &env
@ -26,7 +26,6 @@ x-common-env: &env
FILE_STORAGE_DIR: /app/memory_files FILE_STORAGE_DIR: /app/memory_files
TZ: "Etc/UTC" TZ: "Etc/UTC"
x-worker-base: &worker-base x-worker-base: &worker-base
build: build:
context: . context: .
@ -47,9 +46,10 @@ x-worker-base: &worker-base
tmpfs: [ /tmp, /var/tmp ] tmpfs: [ /tmp, /var/tmp ]
cap_drop: [ ALL ] cap_drop: [ ALL ]
volumes: volumes:
- file_storage:/app/memory_files:rw - ./memory_files:/app/memory_files:rw
logging: logging:
options: { max-size: "10m", max-file: "3" } options: { max-size: "10m", max-file: "3" }
user: kb
# ================================ SERVICES ============================ # ================================ SERVICES ============================
@ -94,7 +94,8 @@ services:
mem_limit: 512m mem_limit: 512m
cpus: "0.5" cpus: "0.5"
security_opt: [ "no-new-privileges=true" ] security_opt: [ "no-new-privileges=true" ]
ports: # UI only on localhost ports:
# UI only on localhost
- "127.0.0.1:15672:15672" - "127.0.0.1:15672:15672"
qdrant: qdrant:
@ -224,7 +225,7 @@ services:
environment: environment:
<<: *worker-env <<: *worker-env
volumes: volumes:
- file_storage:/app/memory_files:rw - ./memory_files:/app/memory_files:rw
tmpfs: tmpfs:
- /tmp - /tmp
- /var/tmp - /var/tmp

View File

@ -21,8 +21,11 @@ RUN chmod +x entry.sh
# Create required tmpfs directories for supervisor # Create required tmpfs directories for supervisor
RUN mkdir -p /var/log/supervisor /var/run/supervisor RUN mkdir -p /var/log/supervisor /var/run/supervisor
# Create storage directory
RUN mkdir -p /app/memory_files
# Create user and set permissions # Create user and set permissions
RUN useradd -m kb && chown -R kb /app /var/log/supervisor /var/run/supervisor RUN useradd -m kb && chown -R kb /app /var/log/supervisor /var/run/supervisor /app/memory_files
USER kb USER kb
# Default queues to process # Default queues to process

View File

@ -17,6 +17,8 @@ RUN apt-get update && apt-get install -y \
COPY docker/workers/entry.sh ./entry.sh COPY docker/workers/entry.sh ./entry.sh
RUN chmod +x entry.sh RUN chmod +x entry.sh
RUN mkdir -p /app/memory_files
# Create user and set permissions # Create user and set permissions
RUN useradd -m kb && chown -R kb /app RUN useradd -m kb && chown -R kb /app
USER kb USER kb

View File

@ -128,13 +128,13 @@ def embed_page(page: dict[str, Any]) -> list[Vector]:
def write_to_file(chunk_id: str, item: extract.MulitmodalChunk) -> pathlib.Path: def write_to_file(chunk_id: str, item: extract.MulitmodalChunk) -> pathlib.Path:
if isinstance(item, str): if isinstance(item, str):
filename = settings.FILE_STORAGE_DIR / f"{chunk_id}.txt" filename = settings.CHUNK_STORAGE_DIR / f"{chunk_id}.txt"
filename.write_text(item) filename.write_text(item)
elif isinstance(item, bytes): elif isinstance(item, bytes):
filename = settings.FILE_STORAGE_DIR / f"{chunk_id}.bin" filename = settings.CHUNK_STORAGE_DIR / f"{chunk_id}.bin"
filename.write_bytes(item) filename.write_bytes(item)
elif isinstance(item, Image.Image): elif isinstance(item, Image.Image):
filename = settings.FILE_STORAGE_DIR / f"{chunk_id}.png" filename = settings.CHUNK_STORAGE_DIR / f"{chunk_id}.png"
item.save(filename) item.save(filename)
else: else:
raise ValueError(f"Unsupported content type: {type(item)}") raise ValueError(f"Unsupported content type: {type(item)}")
@ -156,13 +156,13 @@ def make_chunk(
content = "\n\n".join(contents) content = "\n\n".join(contents)
model = settings.TEXT_EMBEDDING_MODEL model = settings.TEXT_EMBEDDING_MODEL
elif len(contents) == 1: elif len(contents) == 1:
filename = (write_to_file(chunk_id, contents[0]),) filename = write_to_file(chunk_id, contents[0]).absolute().as_posix()
model = settings.MIXED_EMBEDDING_MODEL model = settings.MIXED_EMBEDDING_MODEL
else: else:
for i, item in enumerate(contents): for i, item in enumerate(contents):
write_to_file(f"{chunk_id}_{i}", item) write_to_file(f"{chunk_id}_{i}", item)
model = settings.MIXED_EMBEDDING_MODEL model = settings.MIXED_EMBEDDING_MODEL
filename = (settings.FILE_STORAGE_DIR / f"{chunk_id}_*",) filename = (settings.CHUNK_STORAGE_DIR / f"{chunk_id}_*").absolute().as_posix()
return Chunk( return Chunk(
id=chunk_id, id=chunk_id,

View File

@ -4,9 +4,11 @@ from dotenv import load_dotenv
load_dotenv() load_dotenv()
def boolean_env(key: str, default: bool = False) -> bool: def boolean_env(key: str, default: bool = False) -> bool:
return os.getenv(key, "0").lower() in ("1", "true", "yes") return os.getenv(key, "0").lower() in ("1", "true", "yes")
# Database settings # Database settings
DB_USER = os.getenv("DB_USER", "kb") DB_USER = os.getenv("DB_USER", "kb")
if password_file := os.getenv("POSTGRES_PASSWORD_FILE"): if password_file := os.getenv("POSTGRES_PASSWORD_FILE"):
@ -18,16 +20,26 @@ DB_HOST = os.getenv("DB_HOST", "postgres")
DB_PORT = os.getenv("DB_PORT", "5432") DB_PORT = os.getenv("DB_PORT", "5432")
DB_NAME = os.getenv("DB_NAME", "kb") DB_NAME = os.getenv("DB_NAME", "kb")
def make_db_url(user=DB_USER, password=DB_PASSWORD, host=DB_HOST, port=DB_PORT, db=DB_NAME):
def make_db_url(
user=DB_USER, password=DB_PASSWORD, host=DB_HOST, port=DB_PORT, db=DB_NAME
):
return f"postgresql://{user}:{password}@{host}:{port}/{db}" return f"postgresql://{user}:{password}@{host}:{port}/{db}"
DB_URL = os.getenv("DATABASE_URL", make_db_url()) DB_URL = os.getenv("DATABASE_URL", make_db_url())
FILE_STORAGE_DIR = pathlib.Path(os.getenv("FILE_STORAGE_DIR", "/tmp/memory_files")) FILE_STORAGE_DIR = pathlib.Path(os.getenv("FILE_STORAGE_DIR", "/tmp/memory_files"))
FILE_STORAGE_DIR.mkdir(parents=True, exist_ok=True) FILE_STORAGE_DIR.mkdir(parents=True, exist_ok=True)
CHUNK_STORAGE_DIR = pathlib.Path(
os.getenv("CHUNK_STORAGE_DIR", FILE_STORAGE_DIR / "chunks")
)
CHUNK_STORAGE_DIR.mkdir(parents=True, exist_ok=True)
# Maximum attachment size to store directly in the database (1MB) # Maximum attachment size to store directly in the database (1MB)
MAX_INLINE_ATTACHMENT_SIZE = int(os.getenv("MAX_INLINE_ATTACHMENT_SIZE", 1 * 1024 * 1024)) MAX_INLINE_ATTACHMENT_SIZE = int(
os.getenv("MAX_INLINE_ATTACHMENT_SIZE", 1 * 1024 * 1024)
)
# Qdrant settings # Qdrant settings
QDRANT_HOST = os.getenv("QDRANT_HOST", "qdrant") QDRANT_HOST = os.getenv("QDRANT_HOST", "qdrant")

View File

@ -199,7 +199,10 @@ def email_provider():
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
def mock_file_storage(tmp_path: Path): def mock_file_storage(tmp_path: Path):
chunk_storage_dir = tmp_path / "chunks"
chunk_storage_dir.mkdir(parents=True, exist_ok=True)
with patch("memory.common.settings.FILE_STORAGE_DIR", tmp_path): with patch("memory.common.settings.FILE_STORAGE_DIR", tmp_path):
with patch("memory.common.settings.CHUNK_STORAGE_DIR", chunk_storage_dir):
yield yield

View File

@ -1,18 +1,20 @@
import uuid
import pytest
from unittest.mock import Mock, patch
from PIL import Image
import pathlib import pathlib
import uuid
from unittest.mock import Mock, patch
import pytest
from PIL import Image
from memory.common import settings from memory.common import settings
from memory.common.embedding import ( from memory.common.embedding import (
get_modality, embed,
embed_text,
embed_file, embed_file,
embed_mixed, embed_mixed,
embed_page, embed_page,
embed, embed_text,
write_to_file, get_modality,
make_chunk, make_chunk,
write_to_file,
) )
@ -116,7 +118,7 @@ def test_write_to_file_text(mock_file_storage):
file_path = write_to_file(chunk_id, content) file_path = write_to_file(chunk_id, content)
assert file_path == settings.FILE_STORAGE_DIR / f"{chunk_id}.txt" assert file_path == settings.CHUNK_STORAGE_DIR / f"{chunk_id}.txt"
assert file_path.exists() assert file_path.exists()
assert file_path.read_text() == content assert file_path.read_text() == content
@ -128,7 +130,7 @@ def test_write_to_file_bytes(mock_file_storage):
file_path = write_to_file(chunk_id, content) file_path = write_to_file(chunk_id, content)
assert file_path == settings.FILE_STORAGE_DIR / f"{chunk_id}.bin" assert file_path == settings.CHUNK_STORAGE_DIR / f"{chunk_id}.bin"
assert file_path.exists() assert file_path.exists()
assert file_path.read_bytes() == content assert file_path.read_bytes() == content
@ -140,7 +142,7 @@ def test_write_to_file_image(mock_file_storage):
file_path = write_to_file(chunk_id, img) file_path = write_to_file(chunk_id, img)
assert file_path == settings.FILE_STORAGE_DIR / f"{chunk_id}.png" assert file_path == settings.CHUNK_STORAGE_DIR / f"{chunk_id}.png"
assert file_path.exists() assert file_path.exists()
# Verify it's a valid image file by opening it # Verify it's a valid image file by opening it
image = Image.open(file_path) image = Image.open(file_path)
@ -192,8 +194,8 @@ def test_make_chunk_single_image(mock_file_storage, db_session):
assert chunk.id == "00000000-0000-0000-0000-000000000002" assert chunk.id == "00000000-0000-0000-0000-000000000002"
assert chunk.content is None assert chunk.content is None
assert chunk.file_path == ( assert chunk.file_path == str(
settings.FILE_STORAGE_DIR / "00000000-0000-0000-0000-000000000002.png", settings.CHUNK_STORAGE_DIR / "00000000-0000-0000-0000-000000000002.png",
) )
assert chunk.embedding_model == settings.MIXED_EMBEDDING_MODEL assert chunk.embedding_model == settings.MIXED_EMBEDDING_MODEL
assert chunk.vector == vector assert chunk.vector == vector
@ -217,8 +219,8 @@ def test_make_chunk_mixed_content(mock_file_storage, db_session):
assert chunk.id == "00000000-0000-0000-0000-000000000003" assert chunk.id == "00000000-0000-0000-0000-000000000003"
assert chunk.content is None assert chunk.content is None
assert chunk.file_path == ( assert chunk.file_path == str(
settings.FILE_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_*", settings.CHUNK_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_*",
) )
assert chunk.embedding_model == settings.MIXED_EMBEDDING_MODEL assert chunk.embedding_model == settings.MIXED_EMBEDDING_MODEL
assert chunk.vector == vector assert chunk.vector == vector
@ -226,8 +228,8 @@ def test_make_chunk_mixed_content(mock_file_storage, db_session):
# Verify the files exist # Verify the files exist
assert ( assert (
settings.FILE_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_0.txt" settings.CHUNK_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_0.txt"
).exists() ).exists()
assert ( assert (
settings.FILE_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_1.png" settings.CHUNK_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_1.png"
).exists() ).exists()