mirror of
https://github.com/mruwnik/memory.git
synced 2025-07-30 22:56:08 +02:00
proper chunks path
This commit is contained in:
parent
44de394eb1
commit
c6cd809eb7
1
.gitignore
vendored
1
.gitignore
vendored
@ -1,3 +1,4 @@
|
||||
memory_files
|
||||
.env
|
||||
.DS_Store
|
||||
secrets/
|
||||
|
@ -2,7 +2,8 @@ version: "3.9"
|
||||
|
||||
# --------------------------------------------------------------------- networks
|
||||
networks:
|
||||
kbnet: # internal overlay – NOT exposed
|
||||
kbnet:
|
||||
# internal overlay – NOT exposed
|
||||
driver: bridge
|
||||
|
||||
# --------------------------------------------------------------------- secrets
|
||||
@ -16,7 +17,6 @@ volumes:
|
||||
db_data: {} # Postgres
|
||||
qdrant_data: {} # Qdrant
|
||||
rabbitmq_data: {} # RabbitMQ
|
||||
file_storage: {} # File storage
|
||||
|
||||
# ------------------------------ X-templates ----------------------------
|
||||
x-common-env: &env
|
||||
@ -26,7 +26,6 @@ x-common-env: &env
|
||||
FILE_STORAGE_DIR: /app/memory_files
|
||||
TZ: "Etc/UTC"
|
||||
|
||||
|
||||
x-worker-base: &worker-base
|
||||
build:
|
||||
context: .
|
||||
@ -47,9 +46,10 @@ x-worker-base: &worker-base
|
||||
tmpfs: [ /tmp, /var/tmp ]
|
||||
cap_drop: [ ALL ]
|
||||
volumes:
|
||||
- file_storage:/app/memory_files:rw
|
||||
- ./memory_files:/app/memory_files:rw
|
||||
logging:
|
||||
options: { max-size: "10m", max-file: "3" }
|
||||
user: kb
|
||||
|
||||
# ================================ SERVICES ============================
|
||||
|
||||
@ -94,7 +94,8 @@ services:
|
||||
mem_limit: 512m
|
||||
cpus: "0.5"
|
||||
security_opt: [ "no-new-privileges=true" ]
|
||||
ports: # UI only on localhost
|
||||
ports:
|
||||
# UI only on localhost
|
||||
- "127.0.0.1:15672:15672"
|
||||
|
||||
qdrant:
|
||||
@ -224,7 +225,7 @@ services:
|
||||
environment:
|
||||
<<: *worker-env
|
||||
volumes:
|
||||
- file_storage:/app/memory_files:rw
|
||||
- ./memory_files:/app/memory_files:rw
|
||||
tmpfs:
|
||||
- /tmp
|
||||
- /var/tmp
|
||||
|
@ -21,8 +21,11 @@ RUN chmod +x entry.sh
|
||||
# Create required tmpfs directories for supervisor
|
||||
RUN mkdir -p /var/log/supervisor /var/run/supervisor
|
||||
|
||||
# Create storage directory
|
||||
RUN mkdir -p /app/memory_files
|
||||
|
||||
# Create user and set permissions
|
||||
RUN useradd -m kb && chown -R kb /app /var/log/supervisor /var/run/supervisor
|
||||
RUN useradd -m kb && chown -R kb /app /var/log/supervisor /var/run/supervisor /app/memory_files
|
||||
USER kb
|
||||
|
||||
# Default queues to process
|
||||
|
@ -17,6 +17,8 @@ RUN apt-get update && apt-get install -y \
|
||||
COPY docker/workers/entry.sh ./entry.sh
|
||||
RUN chmod +x entry.sh
|
||||
|
||||
RUN mkdir -p /app/memory_files
|
||||
|
||||
# Create user and set permissions
|
||||
RUN useradd -m kb && chown -R kb /app
|
||||
USER kb
|
||||
|
@ -128,13 +128,13 @@ def embed_page(page: dict[str, Any]) -> list[Vector]:
|
||||
|
||||
def write_to_file(chunk_id: str, item: extract.MulitmodalChunk) -> pathlib.Path:
|
||||
if isinstance(item, str):
|
||||
filename = settings.FILE_STORAGE_DIR / f"{chunk_id}.txt"
|
||||
filename = settings.CHUNK_STORAGE_DIR / f"{chunk_id}.txt"
|
||||
filename.write_text(item)
|
||||
elif isinstance(item, bytes):
|
||||
filename = settings.FILE_STORAGE_DIR / f"{chunk_id}.bin"
|
||||
filename = settings.CHUNK_STORAGE_DIR / f"{chunk_id}.bin"
|
||||
filename.write_bytes(item)
|
||||
elif isinstance(item, Image.Image):
|
||||
filename = settings.FILE_STORAGE_DIR / f"{chunk_id}.png"
|
||||
filename = settings.CHUNK_STORAGE_DIR / f"{chunk_id}.png"
|
||||
item.save(filename)
|
||||
else:
|
||||
raise ValueError(f"Unsupported content type: {type(item)}")
|
||||
@ -156,13 +156,13 @@ def make_chunk(
|
||||
content = "\n\n".join(contents)
|
||||
model = settings.TEXT_EMBEDDING_MODEL
|
||||
elif len(contents) == 1:
|
||||
filename = (write_to_file(chunk_id, contents[0]),)
|
||||
filename = write_to_file(chunk_id, contents[0]).absolute().as_posix()
|
||||
model = settings.MIXED_EMBEDDING_MODEL
|
||||
else:
|
||||
for i, item in enumerate(contents):
|
||||
write_to_file(f"{chunk_id}_{i}", item)
|
||||
model = settings.MIXED_EMBEDDING_MODEL
|
||||
filename = (settings.FILE_STORAGE_DIR / f"{chunk_id}_*",)
|
||||
filename = (settings.CHUNK_STORAGE_DIR / f"{chunk_id}_*").absolute().as_posix()
|
||||
|
||||
return Chunk(
|
||||
id=chunk_id,
|
||||
|
@ -4,9 +4,11 @@ from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
|
||||
def boolean_env(key: str, default: bool = False) -> bool:
    """Read an environment variable and interpret it as a boolean flag.

    Args:
        key: Name of the environment variable to look up.
        default: Value returned when the variable is not set at all.

    Returns:
        ``True`` when the variable's value, compared case-insensitively, is
        one of ``"1"``, ``"true"`` or ``"yes"``; ``False`` for any other
        value that is set; ``default`` when the variable is unset.
    """
    raw = os.getenv(key)
    if raw is None:
        # Previously an unset variable always fell back to "0", which
        # silently ignored the caller-supplied default.
        return default
    return raw.lower() in ("1", "true", "yes")
|
||||
|
||||
|
||||
# Database settings
|
||||
DB_USER = os.getenv("DB_USER", "kb")
|
||||
if password_file := os.getenv("POSTGRES_PASSWORD_FILE"):
|
||||
@ -18,16 +20,26 @@ DB_HOST = os.getenv("DB_HOST", "postgres")
|
||||
DB_PORT = os.getenv("DB_PORT", "5432")
|
||||
DB_NAME = os.getenv("DB_NAME", "kb")
|
||||
|
||||
def make_db_url(user=DB_USER, password=DB_PASSWORD, host=DB_HOST, port=DB_PORT, db=DB_NAME):
|
||||
|
||||
def make_db_url(
    user=DB_USER, password=DB_PASSWORD, host=DB_HOST, port=DB_PORT, db=DB_NAME
):
    """Assemble a ``postgresql://`` connection URL from its components.

    Every argument defaults to the matching module-level ``DB_*`` setting.
    The values are interpolated as-is; no URL escaping is applied.
    """
    credentials = f"{user}:{password}"
    location = f"{host}:{port}/{db}"
    return f"postgresql://{credentials}@{location}"
|
||||
|
||||
|
||||
DB_URL = os.getenv("DATABASE_URL", make_db_url())
|
||||
|
||||
|
||||
FILE_STORAGE_DIR = pathlib.Path(os.getenv("FILE_STORAGE_DIR", "/tmp/memory_files"))
|
||||
FILE_STORAGE_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
CHUNK_STORAGE_DIR = pathlib.Path(
|
||||
os.getenv("CHUNK_STORAGE_DIR", FILE_STORAGE_DIR / "chunks")
|
||||
)
|
||||
CHUNK_STORAGE_DIR.mkdir(parents=True, exist_ok=True)
|
||||
# Maximum attachment size to store directly in the database (default: 1 MiB)
|
||||
MAX_INLINE_ATTACHMENT_SIZE = int(os.getenv("MAX_INLINE_ATTACHMENT_SIZE", 1 * 1024 * 1024))
|
||||
MAX_INLINE_ATTACHMENT_SIZE = int(
|
||||
os.getenv("MAX_INLINE_ATTACHMENT_SIZE", 1 * 1024 * 1024)
|
||||
)
|
||||
|
||||
# Qdrant settings
|
||||
QDRANT_HOST = os.getenv("QDRANT_HOST", "qdrant")
|
||||
|
@ -199,7 +199,10 @@ def email_provider():
|
||||
|
||||
@pytest.fixture(autouse=True)
def mock_file_storage(tmp_path: Path):
    """Redirect the storage settings to a per-test temporary directory.

    ``FILE_STORAGE_DIR`` is patched to ``tmp_path`` and ``CHUNK_STORAGE_DIR``
    to a freshly created ``tmp_path / "chunks"`` for the duration of the test.
    """
    chunks_root = tmp_path / "chunks"
    chunks_root.mkdir(parents=True, exist_ok=True)

    # Build both patchers up front; they are entered in the same order as the
    # original nested ``with`` statements (files first, then chunks).
    files_patch = patch("memory.common.settings.FILE_STORAGE_DIR", tmp_path)
    chunks_patch = patch("memory.common.settings.CHUNK_STORAGE_DIR", chunks_root)
    with files_patch, chunks_patch:
        yield
|
||||
|
||||
|
||||
|
@ -1,18 +1,20 @@
|
||||
import uuid
|
||||
import pytest
|
||||
from unittest.mock import Mock, patch
|
||||
from PIL import Image
|
||||
import pathlib
|
||||
import uuid
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
import pytest
|
||||
from PIL import Image
|
||||
|
||||
from memory.common import settings
|
||||
from memory.common.embedding import (
|
||||
get_modality,
|
||||
embed_text,
|
||||
embed,
|
||||
embed_file,
|
||||
embed_mixed,
|
||||
embed_page,
|
||||
embed,
|
||||
write_to_file,
|
||||
embed_text,
|
||||
get_modality,
|
||||
make_chunk,
|
||||
write_to_file,
|
||||
)
|
||||
|
||||
|
||||
@ -116,7 +118,7 @@ def test_write_to_file_text(mock_file_storage):
|
||||
|
||||
file_path = write_to_file(chunk_id, content)
|
||||
|
||||
assert file_path == settings.FILE_STORAGE_DIR / f"{chunk_id}.txt"
|
||||
assert file_path == settings.CHUNK_STORAGE_DIR / f"{chunk_id}.txt"
|
||||
assert file_path.exists()
|
||||
assert file_path.read_text() == content
|
||||
|
||||
@ -128,7 +130,7 @@ def test_write_to_file_bytes(mock_file_storage):
|
||||
|
||||
file_path = write_to_file(chunk_id, content)
|
||||
|
||||
assert file_path == settings.FILE_STORAGE_DIR / f"{chunk_id}.bin"
|
||||
assert file_path == settings.CHUNK_STORAGE_DIR / f"{chunk_id}.bin"
|
||||
assert file_path.exists()
|
||||
assert file_path.read_bytes() == content
|
||||
|
||||
@ -140,7 +142,7 @@ def test_write_to_file_image(mock_file_storage):
|
||||
|
||||
file_path = write_to_file(chunk_id, img)
|
||||
|
||||
assert file_path == settings.FILE_STORAGE_DIR / f"{chunk_id}.png"
|
||||
assert file_path == settings.CHUNK_STORAGE_DIR / f"{chunk_id}.png"
|
||||
assert file_path.exists()
|
||||
# Verify it's a valid image file by opening it
|
||||
image = Image.open(file_path)
|
||||
@ -192,8 +194,8 @@ def test_make_chunk_single_image(mock_file_storage, db_session):
|
||||
|
||||
assert chunk.id == "00000000-0000-0000-0000-000000000002"
|
||||
assert chunk.content is None
|
||||
assert chunk.file_path == (
|
||||
settings.FILE_STORAGE_DIR / "00000000-0000-0000-0000-000000000002.png",
|
||||
assert chunk.file_path == str(
|
||||
settings.CHUNK_STORAGE_DIR / "00000000-0000-0000-0000-000000000002.png",
|
||||
)
|
||||
assert chunk.embedding_model == settings.MIXED_EMBEDDING_MODEL
|
||||
assert chunk.vector == vector
|
||||
@ -217,8 +219,8 @@ def test_make_chunk_mixed_content(mock_file_storage, db_session):
|
||||
|
||||
assert chunk.id == "00000000-0000-0000-0000-000000000003"
|
||||
assert chunk.content is None
|
||||
assert chunk.file_path == (
|
||||
settings.FILE_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_*",
|
||||
assert chunk.file_path == str(
|
||||
settings.CHUNK_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_*",
|
||||
)
|
||||
assert chunk.embedding_model == settings.MIXED_EMBEDDING_MODEL
|
||||
assert chunk.vector == vector
|
||||
@ -226,8 +228,8 @@ def test_make_chunk_mixed_content(mock_file_storage, db_session):
|
||||
|
||||
# Verify the files exist
|
||||
assert (
|
||||
settings.FILE_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_0.txt"
|
||||
settings.CHUNK_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_0.txt"
|
||||
).exists()
|
||||
assert (
|
||||
settings.FILE_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_1.png"
|
||||
settings.CHUNK_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_1.png"
|
||||
).exists()
|
||||
|
Loading…
x
Reference in New Issue
Block a user