mirror of
https://github.com/mruwnik/memory.git
synced 2025-06-08 13:24:41 +02:00
proper chunks path
This commit is contained in:
parent
44de394eb1
commit
c6cd809eb7
1
.gitignore
vendored
1
.gitignore
vendored
@ -1,3 +1,4 @@
|
|||||||
|
memory_files
|
||||||
.env
|
.env
|
||||||
.DS_Store
|
.DS_Store
|
||||||
secrets/
|
secrets/
|
||||||
|
@ -2,7 +2,8 @@ version: "3.9"
|
|||||||
|
|
||||||
# --------------------------------------------------------------------- networks
|
# --------------------------------------------------------------------- networks
|
||||||
networks:
|
networks:
|
||||||
kbnet: # internal overlay – NOT exposed
|
kbnet:
|
||||||
|
# internal overlay – NOT exposed
|
||||||
driver: bridge
|
driver: bridge
|
||||||
|
|
||||||
# --------------------------------------------------------------------- secrets
|
# --------------------------------------------------------------------- secrets
|
||||||
@ -16,7 +17,6 @@ volumes:
|
|||||||
db_data: {} # Postgres
|
db_data: {} # Postgres
|
||||||
qdrant_data: {} # Qdrant
|
qdrant_data: {} # Qdrant
|
||||||
rabbitmq_data: {} # RabbitMQ
|
rabbitmq_data: {} # RabbitMQ
|
||||||
file_storage: {} # File storage
|
|
||||||
|
|
||||||
# ------------------------------ X-templates ----------------------------
|
# ------------------------------ X-templates ----------------------------
|
||||||
x-common-env: &env
|
x-common-env: &env
|
||||||
@ -26,7 +26,6 @@ x-common-env: &env
|
|||||||
FILE_STORAGE_DIR: /app/memory_files
|
FILE_STORAGE_DIR: /app/memory_files
|
||||||
TZ: "Etc/UTC"
|
TZ: "Etc/UTC"
|
||||||
|
|
||||||
|
|
||||||
x-worker-base: &worker-base
|
x-worker-base: &worker-base
|
||||||
build:
|
build:
|
||||||
context: .
|
context: .
|
||||||
@ -47,9 +46,10 @@ x-worker-base: &worker-base
|
|||||||
tmpfs: [ /tmp, /var/tmp ]
|
tmpfs: [ /tmp, /var/tmp ]
|
||||||
cap_drop: [ ALL ]
|
cap_drop: [ ALL ]
|
||||||
volumes:
|
volumes:
|
||||||
- file_storage:/app/memory_files:rw
|
- ./memory_files:/app/memory_files:rw
|
||||||
logging:
|
logging:
|
||||||
options: { max-size: "10m", max-file: "3" }
|
options: { max-size: "10m", max-file: "3" }
|
||||||
|
user: kb
|
||||||
|
|
||||||
# ================================ SERVICES ============================
|
# ================================ SERVICES ============================
|
||||||
|
|
||||||
@ -94,7 +94,8 @@ services:
|
|||||||
mem_limit: 512m
|
mem_limit: 512m
|
||||||
cpus: "0.5"
|
cpus: "0.5"
|
||||||
security_opt: [ "no-new-privileges=true" ]
|
security_opt: [ "no-new-privileges=true" ]
|
||||||
ports: # UI only on localhost
|
ports:
|
||||||
|
# UI only on localhost
|
||||||
- "127.0.0.1:15672:15672"
|
- "127.0.0.1:15672:15672"
|
||||||
|
|
||||||
qdrant:
|
qdrant:
|
||||||
@ -224,7 +225,7 @@ services:
|
|||||||
environment:
|
environment:
|
||||||
<<: *worker-env
|
<<: *worker-env
|
||||||
volumes:
|
volumes:
|
||||||
- file_storage:/app/memory_files:rw
|
- ./memory_files:/app/memory_files:rw
|
||||||
tmpfs:
|
tmpfs:
|
||||||
- /tmp
|
- /tmp
|
||||||
- /var/tmp
|
- /var/tmp
|
||||||
|
@ -21,8 +21,11 @@ RUN chmod +x entry.sh
|
|||||||
# Create required tmpfs directories for supervisor
|
# Create required tmpfs directories for supervisor
|
||||||
RUN mkdir -p /var/log/supervisor /var/run/supervisor
|
RUN mkdir -p /var/log/supervisor /var/run/supervisor
|
||||||
|
|
||||||
|
# Create storage directory
|
||||||
|
RUN mkdir -p /app/memory_files
|
||||||
|
|
||||||
# Create user and set permissions
|
# Create user and set permissions
|
||||||
RUN useradd -m kb && chown -R kb /app /var/log/supervisor /var/run/supervisor
|
RUN useradd -m kb && chown -R kb /app /var/log/supervisor /var/run/supervisor /app/memory_files
|
||||||
USER kb
|
USER kb
|
||||||
|
|
||||||
# Default queues to process
|
# Default queues to process
|
||||||
|
@ -17,6 +17,8 @@ RUN apt-get update && apt-get install -y \
|
|||||||
COPY docker/workers/entry.sh ./entry.sh
|
COPY docker/workers/entry.sh ./entry.sh
|
||||||
RUN chmod +x entry.sh
|
RUN chmod +x entry.sh
|
||||||
|
|
||||||
|
RUN mkdir -p /app/memory_files
|
||||||
|
|
||||||
# Create user and set permissions
|
# Create user and set permissions
|
||||||
RUN useradd -m kb && chown -R kb /app
|
RUN useradd -m kb && chown -R kb /app
|
||||||
USER kb
|
USER kb
|
||||||
|
@ -128,13 +128,13 @@ def embed_page(page: dict[str, Any]) -> list[Vector]:
|
|||||||
|
|
||||||
def write_to_file(chunk_id: str, item: extract.MulitmodalChunk) -> pathlib.Path:
|
def write_to_file(chunk_id: str, item: extract.MulitmodalChunk) -> pathlib.Path:
|
||||||
if isinstance(item, str):
|
if isinstance(item, str):
|
||||||
filename = settings.FILE_STORAGE_DIR / f"{chunk_id}.txt"
|
filename = settings.CHUNK_STORAGE_DIR / f"{chunk_id}.txt"
|
||||||
filename.write_text(item)
|
filename.write_text(item)
|
||||||
elif isinstance(item, bytes):
|
elif isinstance(item, bytes):
|
||||||
filename = settings.FILE_STORAGE_DIR / f"{chunk_id}.bin"
|
filename = settings.CHUNK_STORAGE_DIR / f"{chunk_id}.bin"
|
||||||
filename.write_bytes(item)
|
filename.write_bytes(item)
|
||||||
elif isinstance(item, Image.Image):
|
elif isinstance(item, Image.Image):
|
||||||
filename = settings.FILE_STORAGE_DIR / f"{chunk_id}.png"
|
filename = settings.CHUNK_STORAGE_DIR / f"{chunk_id}.png"
|
||||||
item.save(filename)
|
item.save(filename)
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Unsupported content type: {type(item)}")
|
raise ValueError(f"Unsupported content type: {type(item)}")
|
||||||
@ -156,13 +156,13 @@ def make_chunk(
|
|||||||
content = "\n\n".join(contents)
|
content = "\n\n".join(contents)
|
||||||
model = settings.TEXT_EMBEDDING_MODEL
|
model = settings.TEXT_EMBEDDING_MODEL
|
||||||
elif len(contents) == 1:
|
elif len(contents) == 1:
|
||||||
filename = (write_to_file(chunk_id, contents[0]),)
|
filename = write_to_file(chunk_id, contents[0]).absolute().as_posix()
|
||||||
model = settings.MIXED_EMBEDDING_MODEL
|
model = settings.MIXED_EMBEDDING_MODEL
|
||||||
else:
|
else:
|
||||||
for i, item in enumerate(contents):
|
for i, item in enumerate(contents):
|
||||||
write_to_file(f"{chunk_id}_{i}", item)
|
write_to_file(f"{chunk_id}_{i}", item)
|
||||||
model = settings.MIXED_EMBEDDING_MODEL
|
model = settings.MIXED_EMBEDDING_MODEL
|
||||||
filename = (settings.FILE_STORAGE_DIR / f"{chunk_id}_*",)
|
filename = (settings.CHUNK_STORAGE_DIR / f"{chunk_id}_*").absolute().as_posix()
|
||||||
|
|
||||||
return Chunk(
|
return Chunk(
|
||||||
id=chunk_id,
|
id=chunk_id,
|
||||||
|
@ -4,9 +4,11 @@ from dotenv import load_dotenv
|
|||||||
|
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
|
||||||
|
|
||||||
def boolean_env(key: str, default: bool = False) -> bool:
|
def boolean_env(key: str, default: bool = False) -> bool:
|
||||||
return os.getenv(key, "0").lower() in ("1", "true", "yes")
|
return os.getenv(key, "0").lower() in ("1", "true", "yes")
|
||||||
|
|
||||||
|
|
||||||
# Database settings
|
# Database settings
|
||||||
DB_USER = os.getenv("DB_USER", "kb")
|
DB_USER = os.getenv("DB_USER", "kb")
|
||||||
if password_file := os.getenv("POSTGRES_PASSWORD_FILE"):
|
if password_file := os.getenv("POSTGRES_PASSWORD_FILE"):
|
||||||
@ -18,16 +20,26 @@ DB_HOST = os.getenv("DB_HOST", "postgres")
|
|||||||
DB_PORT = os.getenv("DB_PORT", "5432")
|
DB_PORT = os.getenv("DB_PORT", "5432")
|
||||||
DB_NAME = os.getenv("DB_NAME", "kb")
|
DB_NAME = os.getenv("DB_NAME", "kb")
|
||||||
|
|
||||||
def make_db_url(user=DB_USER, password=DB_PASSWORD, host=DB_HOST, port=DB_PORT, db=DB_NAME):
|
|
||||||
|
def make_db_url(
|
||||||
|
user=DB_USER, password=DB_PASSWORD, host=DB_HOST, port=DB_PORT, db=DB_NAME
|
||||||
|
):
|
||||||
return f"postgresql://{user}:{password}@{host}:{port}/{db}"
|
return f"postgresql://{user}:{password}@{host}:{port}/{db}"
|
||||||
|
|
||||||
|
|
||||||
DB_URL = os.getenv("DATABASE_URL", make_db_url())
|
DB_URL = os.getenv("DATABASE_URL", make_db_url())
|
||||||
|
|
||||||
|
|
||||||
FILE_STORAGE_DIR = pathlib.Path(os.getenv("FILE_STORAGE_DIR", "/tmp/memory_files"))
|
FILE_STORAGE_DIR = pathlib.Path(os.getenv("FILE_STORAGE_DIR", "/tmp/memory_files"))
|
||||||
FILE_STORAGE_DIR.mkdir(parents=True, exist_ok=True)
|
FILE_STORAGE_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
|
CHUNK_STORAGE_DIR = pathlib.Path(
|
||||||
|
os.getenv("CHUNK_STORAGE_DIR", FILE_STORAGE_DIR / "chunks")
|
||||||
|
)
|
||||||
|
CHUNK_STORAGE_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
# Maximum attachment size to store directly in the database (default: 1MB)
|
# Maximum attachment size to store directly in the database (default: 1MB)
|
||||||
MAX_INLINE_ATTACHMENT_SIZE = int(os.getenv("MAX_INLINE_ATTACHMENT_SIZE", 1 * 1024 * 1024))
|
MAX_INLINE_ATTACHMENT_SIZE = int(
|
||||||
|
os.getenv("MAX_INLINE_ATTACHMENT_SIZE", 1 * 1024 * 1024)
|
||||||
|
)
|
||||||
|
|
||||||
# Qdrant settings
|
# Qdrant settings
|
||||||
QDRANT_HOST = os.getenv("QDRANT_HOST", "qdrant")
|
QDRANT_HOST = os.getenv("QDRANT_HOST", "qdrant")
|
||||||
|
@ -199,7 +199,10 @@ def email_provider():
|
|||||||
|
|
||||||
@pytest.fixture(autouse=True)
|
@pytest.fixture(autouse=True)
|
||||||
def mock_file_storage(tmp_path: Path):
|
def mock_file_storage(tmp_path: Path):
|
||||||
|
chunk_storage_dir = tmp_path / "chunks"
|
||||||
|
chunk_storage_dir.mkdir(parents=True, exist_ok=True)
|
||||||
with patch("memory.common.settings.FILE_STORAGE_DIR", tmp_path):
|
with patch("memory.common.settings.FILE_STORAGE_DIR", tmp_path):
|
||||||
|
with patch("memory.common.settings.CHUNK_STORAGE_DIR", chunk_storage_dir):
|
||||||
yield
|
yield
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,18 +1,20 @@
|
|||||||
import uuid
|
|
||||||
import pytest
|
|
||||||
from unittest.mock import Mock, patch
|
|
||||||
from PIL import Image
|
|
||||||
import pathlib
|
import pathlib
|
||||||
|
import uuid
|
||||||
|
from unittest.mock import Mock, patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
from memory.common import settings
|
from memory.common import settings
|
||||||
from memory.common.embedding import (
|
from memory.common.embedding import (
|
||||||
get_modality,
|
embed,
|
||||||
embed_text,
|
|
||||||
embed_file,
|
embed_file,
|
||||||
embed_mixed,
|
embed_mixed,
|
||||||
embed_page,
|
embed_page,
|
||||||
embed,
|
embed_text,
|
||||||
write_to_file,
|
get_modality,
|
||||||
make_chunk,
|
make_chunk,
|
||||||
|
write_to_file,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -116,7 +118,7 @@ def test_write_to_file_text(mock_file_storage):
|
|||||||
|
|
||||||
file_path = write_to_file(chunk_id, content)
|
file_path = write_to_file(chunk_id, content)
|
||||||
|
|
||||||
assert file_path == settings.FILE_STORAGE_DIR / f"{chunk_id}.txt"
|
assert file_path == settings.CHUNK_STORAGE_DIR / f"{chunk_id}.txt"
|
||||||
assert file_path.exists()
|
assert file_path.exists()
|
||||||
assert file_path.read_text() == content
|
assert file_path.read_text() == content
|
||||||
|
|
||||||
@ -128,7 +130,7 @@ def test_write_to_file_bytes(mock_file_storage):
|
|||||||
|
|
||||||
file_path = write_to_file(chunk_id, content)
|
file_path = write_to_file(chunk_id, content)
|
||||||
|
|
||||||
assert file_path == settings.FILE_STORAGE_DIR / f"{chunk_id}.bin"
|
assert file_path == settings.CHUNK_STORAGE_DIR / f"{chunk_id}.bin"
|
||||||
assert file_path.exists()
|
assert file_path.exists()
|
||||||
assert file_path.read_bytes() == content
|
assert file_path.read_bytes() == content
|
||||||
|
|
||||||
@ -140,7 +142,7 @@ def test_write_to_file_image(mock_file_storage):
|
|||||||
|
|
||||||
file_path = write_to_file(chunk_id, img)
|
file_path = write_to_file(chunk_id, img)
|
||||||
|
|
||||||
assert file_path == settings.FILE_STORAGE_DIR / f"{chunk_id}.png"
|
assert file_path == settings.CHUNK_STORAGE_DIR / f"{chunk_id}.png"
|
||||||
assert file_path.exists()
|
assert file_path.exists()
|
||||||
# Verify it's a valid image file by opening it
|
# Verify it's a valid image file by opening it
|
||||||
image = Image.open(file_path)
|
image = Image.open(file_path)
|
||||||
@ -192,8 +194,8 @@ def test_make_chunk_single_image(mock_file_storage, db_session):
|
|||||||
|
|
||||||
assert chunk.id == "00000000-0000-0000-0000-000000000002"
|
assert chunk.id == "00000000-0000-0000-0000-000000000002"
|
||||||
assert chunk.content is None
|
assert chunk.content is None
|
||||||
assert chunk.file_path == (
|
assert chunk.file_path == str(
|
||||||
settings.FILE_STORAGE_DIR / "00000000-0000-0000-0000-000000000002.png",
|
settings.CHUNK_STORAGE_DIR / "00000000-0000-0000-0000-000000000002.png",
|
||||||
)
|
)
|
||||||
assert chunk.embedding_model == settings.MIXED_EMBEDDING_MODEL
|
assert chunk.embedding_model == settings.MIXED_EMBEDDING_MODEL
|
||||||
assert chunk.vector == vector
|
assert chunk.vector == vector
|
||||||
@ -217,8 +219,8 @@ def test_make_chunk_mixed_content(mock_file_storage, db_session):
|
|||||||
|
|
||||||
assert chunk.id == "00000000-0000-0000-0000-000000000003"
|
assert chunk.id == "00000000-0000-0000-0000-000000000003"
|
||||||
assert chunk.content is None
|
assert chunk.content is None
|
||||||
assert chunk.file_path == (
|
assert chunk.file_path == str(
|
||||||
settings.FILE_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_*",
|
settings.CHUNK_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_*",
|
||||||
)
|
)
|
||||||
assert chunk.embedding_model == settings.MIXED_EMBEDDING_MODEL
|
assert chunk.embedding_model == settings.MIXED_EMBEDDING_MODEL
|
||||||
assert chunk.vector == vector
|
assert chunk.vector == vector
|
||||||
@ -226,8 +228,8 @@ def test_make_chunk_mixed_content(mock_file_storage, db_session):
|
|||||||
|
|
||||||
# Verify the files exist
|
# Verify the files exist
|
||||||
assert (
|
assert (
|
||||||
settings.FILE_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_0.txt"
|
settings.CHUNK_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_0.txt"
|
||||||
).exists()
|
).exists()
|
||||||
assert (
|
assert (
|
||||||
settings.FILE_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_1.png"
|
settings.CHUNK_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_1.png"
|
||||||
).exists()
|
).exists()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user