proper chunks path
This commit is contained in:
parent 44de394eb1
commit c6cd809eb7
.gitignore (vendored)
@@ -1,3 +1,4 @@
+memory_files
 .env
 .DS_Store
 secrets/
@@ -13,4 +14,4 @@ __pycache__/
 *.pyzw

 docker-compose.override.yml
-docker/pgadmin
+docker/pgadmin

@@ -2,21 +2,21 @@ version: "3.9"

 # --------------------------------------------------------------------- networks
 networks:
-  kbnet: # internal overlay – NOT exposed
+  kbnet:
+    # internal overlay – NOT exposed
     driver: bridge

 # --------------------------------------------------------------------- secrets
 secrets:
-  postgres_password: {file: ./secrets/postgres_password.txt}
-  jwt_secret: {file: ./secrets/jwt_secret.txt}
-  openai_key: {file: ./secrets/openai_key.txt}
+  postgres_password: { file: ./secrets/postgres_password.txt }
+  jwt_secret: { file: ./secrets/jwt_secret.txt }
+  openai_key: { file: ./secrets/openai_key.txt }

 # --------------------------------------------------------------------- volumes
 volumes:
-  db_data: {} # Postgres
-  qdrant_data: {} # Qdrant
-  rabbitmq_data: {} # RabbitMQ
-  file_storage: {} # File storage
+  db_data: {} # Postgres
+  qdrant_data: {} # Qdrant
+  rabbitmq_data: {} # RabbitMQ

 # ------------------------------ X-templates ----------------------------
 x-common-env: &env
@@ -26,60 +26,60 @@ x-common-env: &env
   FILE_STORAGE_DIR: /app/memory_files
   TZ: "Etc/UTC"


 x-worker-base: &worker-base
   build:
     context: .
     dockerfile: docker/workers/Dockerfile
   restart: unless-stopped
-  networks: [kbnet]
-  security_opt: ["no-new-privileges=true"]
-  depends_on: [postgres, rabbitmq, qdrant]
-  env_file: [.env]
+  networks: [ kbnet ]
+  security_opt: [ "no-new-privileges=true" ]
+  depends_on: [ postgres, rabbitmq, qdrant ]
+  env_file: [ .env ]
   environment: &worker-env
     <<: *env
     POSTGRES_PASSWORD_FILE: /run/secrets/postgres_password
     # DSNs are built in worker entrypoint from user + pw files
     QDRANT_URL: http://qdrant:6333
     OPENAI_API_KEY_FILE: /run/secrets/openai_key
-  secrets: [postgres_password, openai_key]
+  secrets: [ postgres_password, openai_key ]
   read_only: true
-  tmpfs: [/tmp,/var/tmp]
-  cap_drop: [ALL]
+  tmpfs: [ /tmp, /var/tmp ]
+  cap_drop: [ ALL ]
   volumes:
-    - file_storage:/app/memory_files:rw
+    - ./memory_files:/app/memory_files:rw
   logging:
-    options: {max-size: "10m", max-file: "3"}
+    options: { max-size: "10m", max-file: "3" }
   user: kb

 # ================================ SERVICES ============================

 services:
   # ----------------------------------------------------------------- data layer
   postgres:
     image: postgres:15
     restart: unless-stopped
-    networks: [kbnet]
+    networks: [ kbnet ]
     environment:
       <<: *env
       POSTGRES_USER: kb
       POSTGRES_PASSWORD_FILE: /run/secrets/postgres_password
       POSTGRES_DB: kb
-    secrets: [postgres_password]
+    secrets: [ postgres_password ]
     volumes:
       - db_data:/var/lib/postgresql/data:rw
     healthcheck:
-      test: ["CMD-SHELL", "pg_isready -U kb"]
+      test: [ "CMD-SHELL", "pg_isready -U kb" ]
       interval: 10s
       timeout: 5s
       retries: 5
     mem_limit: 4g
     cpus: "1.5"
-    security_opt: ["no-new-privileges=true"]
+    security_opt: [ "no-new-privileges=true" ]

   rabbitmq:
     image: rabbitmq:3.13-management
     restart: unless-stopped
-    networks: [kbnet]
+    networks: [ kbnet ]
     environment:
       <<: *env
       RABBITMQ_DEFAULT_USER: "kb"
@@ -87,20 +87,21 @@ services:
     volumes:
       - rabbitmq_data:/var/lib/rabbitmq:rw
     healthcheck:
-      test: ["CMD", "rabbitmq-diagnostics", "ping"]
+      test: [ "CMD", "rabbitmq-diagnostics", "ping" ]
       interval: 15s
       timeout: 5s
       retries: 5
     mem_limit: 512m
     cpus: "0.5"
-    security_opt: ["no-new-privileges=true"]
-    ports: # UI only on localhost
+    security_opt: [ "no-new-privileges=true" ]
+    ports:
+      # UI only on localhost
       - "127.0.0.1:15672:15672"

   qdrant:
     image: qdrant/qdrant:v1.14.0
     restart: unless-stopped
-    networks: [kbnet]
+    networks: [ kbnet ]
     volumes:
       - qdrant_data:/qdrant/storage:rw
     tmpfs:
@@ -108,16 +109,16 @@ services:
       - /var/tmp
       - /qdrant/snapshots:rw
     healthcheck:
-      test: ["CMD", "wget", "-q", "-T", "2", "-O", "-", "localhost:6333/ready"]
+      test: [ "CMD", "wget", "-q", "-T", "2", "-O", "-", "localhost:6333/ready" ]
       interval: 15s
       timeout: 5s
       retries: 5
     mem_limit: 4g
     cpus: "2"
-    security_opt: ["no-new-privileges=true"]
-    cap_drop: [ALL]
+    security_opt: [ "no-new-privileges=true" ]
+    cap_drop: [ ALL ]

   # ------------------------------------------------------------ API / gateway
   # api:
   #   build:
   #     context: .
@@ -148,7 +149,7 @@ services:
   traefik:
     image: traefik:v3.0
     restart: unless-stopped
-    networks: [kbnet]
+    networks: [ kbnet ]
     command:
       - "--providers.docker=true"
       - "--providers.docker.network=kbnet"
@@ -166,79 +167,79 @@ services:
       - /var/run/docker.sock:/var/run/docker.sock:ro
       # - ./acme.json:/acme.json:rw

   # ------------------------------------------------------------ Celery workers
   worker-email:
     <<: *worker-base
     environment:
       <<: *worker-env
       QUEUES: "email"
-    deploy: {resources: {limits: {cpus: "2", memory: 3g}}}
+    deploy: { resources: { limits: { cpus: "2", memory: 3g } } }

   worker-text:
     <<: *worker-base
     environment:
       <<: *worker-env
       QUEUES: "medium_embed"
-    deploy: {resources: {limits: {cpus: "2", memory: 3g}}}
+    deploy: { resources: { limits: { cpus: "2", memory: 3g } } }

   worker-photo:
     <<: *worker-base
     environment:
       <<: *worker-env
       QUEUES: "photo_embed"
-    deploy: {resources: {limits: {cpus: "4", memory: 4g}}}
+    deploy: { resources: { limits: { cpus: "4", memory: 4g } } }

   worker-ocr:
     <<: *worker-base
     environment:
       <<: *worker-env
       QUEUES: "low_ocr"
-    deploy: {resources: {limits: {cpus: "4", memory: 4g}}}
+    deploy: { resources: { limits: { cpus: "4", memory: 4g } } }

   worker-git:
     <<: *worker-base
     environment:
       <<: *worker-env
       QUEUES: "git_summary"
-    deploy: {resources: {limits: {cpus: "1", memory: 1g}}}
+    deploy: { resources: { limits: { cpus: "1", memory: 1g } } }

   worker-rss:
     <<: *worker-base
     environment:
       <<: *worker-env
       QUEUES: "rss"
-    deploy: {resources: {limits: {cpus: "0.5", memory: 512m}}}
+    deploy: { resources: { limits: { cpus: "0.5", memory: 512m } } }

   worker-docs:
     <<: *worker-base
     environment:
       <<: *worker-env
       QUEUES: "docs"
-    deploy: {resources: {limits: {cpus: "1", memory: 1g}}}
+    deploy: { resources: { limits: { cpus: "1", memory: 1g } } }

   ingest-hub:
     <<: *worker-base
     build:
       context: .
       dockerfile: docker/ingest_hub/Dockerfile
     environment:
       <<: *worker-env
     volumes:
-      - file_storage:/app/memory_files:rw
+      - ./memory_files:/app/memory_files:rw
     tmpfs:
       - /tmp
       - /var/tmp
       - /var/log/supervisor
       - /var/run/supervisor
-    deploy: {resources: {limits: {cpus: "0.5", memory: 512m}}}
+    deploy: { resources: { limits: { cpus: "0.5", memory: 512m } } }

   # ------------------------------------------------------------ watchtower (auto-update)
   watchtower:
     image: containrrr/watchtower
     restart: unless-stopped
-    command: ["--schedule", "0 0 4 * * *", "--cleanup"]
-    volumes: ["/var/run/docker.sock:/var/run/docker.sock:ro"]
-    networks: [kbnet]
+    command: [ "--schedule", "0 0 4 * * *", "--cleanup" ]
+    volumes: [ "/var/run/docker.sock:/var/run/docker.sock:ro" ]
+    networks: [ kbnet ]

 # ------------------------------------------------------------------- profiles: observability (opt-in)
 # services:

@@ -21,8 +21,11 @@ RUN chmod +x entry.sh
 # Create required tmpfs directories for supervisor
 RUN mkdir -p /var/log/supervisor /var/run/supervisor

+# Create storage directory
+RUN mkdir -p /app/memory_files
+
 # Create user and set permissions
-RUN useradd -m kb && chown -R kb /app /var/log/supervisor /var/run/supervisor
+RUN useradd -m kb && chown -R kb /app /var/log/supervisor /var/run/supervisor /app/memory_files
 USER kb

 # Default queues to process

@@ -17,6 +17,8 @@ RUN apt-get update && apt-get install -y \
 COPY docker/workers/entry.sh ./entry.sh
 RUN chmod +x entry.sh

+RUN mkdir -p /app/memory_files
+
 # Create user and set permissions
 RUN useradd -m kb && chown -R kb /app
 USER kb

@@ -128,13 +128,13 @@ def embed_page(page: dict[str, Any]) -> list[Vector]:

 def write_to_file(chunk_id: str, item: extract.MulitmodalChunk) -> pathlib.Path:
     if isinstance(item, str):
-        filename = settings.FILE_STORAGE_DIR / f"{chunk_id}.txt"
+        filename = settings.CHUNK_STORAGE_DIR / f"{chunk_id}.txt"
         filename.write_text(item)
     elif isinstance(item, bytes):
-        filename = settings.FILE_STORAGE_DIR / f"{chunk_id}.bin"
+        filename = settings.CHUNK_STORAGE_DIR / f"{chunk_id}.bin"
         filename.write_bytes(item)
     elif isinstance(item, Image.Image):
-        filename = settings.FILE_STORAGE_DIR / f"{chunk_id}.png"
+        filename = settings.CHUNK_STORAGE_DIR / f"{chunk_id}.png"
         item.save(filename)
     else:
         raise ValueError(f"Unsupported content type: {type(item)}")
@@ -156,13 +156,13 @@ def make_chunk(
         content = "\n\n".join(contents)
         model = settings.TEXT_EMBEDDING_MODEL
     elif len(contents) == 1:
-        filename = (write_to_file(chunk_id, contents[0]),)
+        filename = write_to_file(chunk_id, contents[0]).absolute().as_posix()
         model = settings.MIXED_EMBEDDING_MODEL
     else:
         for i, item in enumerate(contents):
             write_to_file(f"{chunk_id}_{i}", item)
         model = settings.MIXED_EMBEDDING_MODEL
-        filename = (settings.FILE_STORAGE_DIR / f"{chunk_id}_*",)
+        filename = (settings.CHUNK_STORAGE_DIR / f"{chunk_id}_*").absolute().as_posix()

     return Chunk(
         id=chunk_id,

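After this change, Chunk.file_path is either an absolute POSIX path to a single chunk file or a glob pattern like .../<chunk_id>_* for multi-part chunks. A minimal consumer-side sketch of resolving such a value back to files (the helper name and behavior are assumptions, not repo code):

import pathlib


def chunk_files(file_path: str) -> list[pathlib.Path]:
    # Hypothetical helper: single-item chunks store a concrete path,
    # multi-item chunks store a "<chunk_id>_*" glob in the chunk directory.
    path = pathlib.Path(file_path)
    if "*" in path.name:
        return sorted(path.parent.glob(path.name))
    return [path]
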
@@ -4,9 +4,11 @@ from dotenv import load_dotenv

 load_dotenv()


 def boolean_env(key: str, default: bool = False) -> bool:
     return os.getenv(key, "0").lower() in ("1", "true", "yes")


 # Database settings
 DB_USER = os.getenv("DB_USER", "kb")
 if password_file := os.getenv("POSTGRES_PASSWORD_FILE"):
@@ -18,23 +20,33 @@ DB_HOST = os.getenv("DB_HOST", "postgres")
 DB_PORT = os.getenv("DB_PORT", "5432")
 DB_NAME = os.getenv("DB_NAME", "kb")

-def make_db_url(user=DB_USER, password=DB_PASSWORD, host=DB_HOST, port=DB_PORT, db=DB_NAME):
+
+def make_db_url(
+    user=DB_USER, password=DB_PASSWORD, host=DB_HOST, port=DB_PORT, db=DB_NAME
+):
     return f"postgresql://{user}:{password}@{host}:{port}/{db}"


 DB_URL = os.getenv("DATABASE_URL", make_db_url())


 FILE_STORAGE_DIR = pathlib.Path(os.getenv("FILE_STORAGE_DIR", "/tmp/memory_files"))
 FILE_STORAGE_DIR.mkdir(parents=True, exist_ok=True)

+CHUNK_STORAGE_DIR = pathlib.Path(
+    os.getenv("CHUNK_STORAGE_DIR", FILE_STORAGE_DIR / "chunks")
+)
+CHUNK_STORAGE_DIR.mkdir(parents=True, exist_ok=True)
+
 # Maximum attachment size to store directly in the database (10MB)
-MAX_INLINE_ATTACHMENT_SIZE = int(os.getenv("MAX_INLINE_ATTACHMENT_SIZE", 1 * 1024 * 1024))
+MAX_INLINE_ATTACHMENT_SIZE = int(
+    os.getenv("MAX_INLINE_ATTACHMENT_SIZE", 1 * 1024 * 1024)
+)

 # Qdrant settings
 QDRANT_HOST = os.getenv("QDRANT_HOST", "qdrant")
 QDRANT_PORT = int(os.getenv("QDRANT_PORT", "6333"))
 QDRANT_GRPC_PORT = int(os.getenv("QDRANT_GRPC_PORT", "6334"))
 QDRANT_API_KEY = os.getenv("QDRANT_API_KEY", None)
 QDRANT_PREFER_GRPC = boolean_env("QDRANT_PREFER_GRPC", False)
 QDRANT_TIMEOUT = int(os.getenv("QDRANT_TIMEOUT", "60"))

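With these defaults, the chunk directory resolves to a chunks/ subdirectory of FILE_STORAGE_DIR unless CHUNK_STORAGE_DIR overrides it. A minimal sketch of the resolution, assuming the FILE_STORAGE_DIR value the compose file gives the workers (the in-process environ assignment is only for illustration):

import os
import pathlib

os.environ["FILE_STORAGE_DIR"] = "/app/memory_files"  # assumed worker env

FILE_STORAGE_DIR = pathlib.Path(os.getenv("FILE_STORAGE_DIR", "/tmp/memory_files"))
CHUNK_STORAGE_DIR = pathlib.Path(
    os.getenv("CHUNK_STORAGE_DIR", FILE_STORAGE_DIR / "chunks")
)

print(CHUNK_STORAGE_DIR)  # /app/memory_files/chunks
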
@@ -55,7 +55,7 @@ def drop_test_database(test_db_name: str) -> None:

     with admin_engine.connect() as conn:
         conn.execute(text("COMMIT"))  # Close any open transaction

         # Terminate all connections to the database
         conn.execute(
             text(
@@ -67,10 +67,10 @@ def drop_test_database(test_db_name: str) -> None:
             """
             )
         )

         # Drop the database
         conn.execute(text(f"DROP DATABASE IF EXISTS {test_db_name}"))

     admin_engine.dispose()

@@ -199,8 +199,11 @@ def email_provider():

 @pytest.fixture(autouse=True)
 def mock_file_storage(tmp_path: Path):
+    chunk_storage_dir = tmp_path / "chunks"
+    chunk_storage_dir.mkdir(parents=True, exist_ok=True)
     with patch("memory.common.settings.FILE_STORAGE_DIR", tmp_path):
-        yield
+        with patch("memory.common.settings.CHUNK_STORAGE_DIR", chunk_storage_dir):
+            yield


 @pytest.fixture
@@ -215,4 +218,4 @@ def qdrant():
 @pytest.fixture(autouse=True)
 def mock_voyage_client():
     with patch.object(voyageai, "Client", autospec=True) as mock_client:
-        yield mock_client()
+        yield mock_client()

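Because mock_file_storage is autouse, every test sees FILE_STORAGE_DIR patched to pytest's tmp_path and CHUNK_STORAGE_DIR patched to tmp_path / "chunks". A hypothetical test illustrating the isolation this provides (not part of the repo):

import pathlib

from memory.common import settings


def test_storage_dirs_are_isolated(tmp_path: pathlib.Path):
    # The autouse fixture patches both settings, so chunk files written
    # during a test never land in the real storage directories.
    assert settings.FILE_STORAGE_DIR == tmp_path
    assert settings.CHUNK_STORAGE_DIR == tmp_path / "chunks"
    assert settings.CHUNK_STORAGE_DIR.exists()
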
@@ -1,18 +1,20 @@
-import uuid
-import pytest
-from unittest.mock import Mock, patch
-from PIL import Image
+import pathlib
+import uuid
+from unittest.mock import Mock, patch
+
+import pytest
+from PIL import Image

 from memory.common import settings
 from memory.common.embedding import (
-    get_modality,
-    embed_text,
+    embed,
     embed_file,
     embed_mixed,
     embed_page,
-    embed,
-    write_to_file,
+    embed_text,
+    get_modality,
     make_chunk,
+    write_to_file,
 )
@@ -116,7 +118,7 @@ def test_write_to_file_text(mock_file_storage):

     file_path = write_to_file(chunk_id, content)

-    assert file_path == settings.FILE_STORAGE_DIR / f"{chunk_id}.txt"
+    assert file_path == settings.CHUNK_STORAGE_DIR / f"{chunk_id}.txt"
     assert file_path.exists()
     assert file_path.read_text() == content

@@ -128,7 +130,7 @@ def test_write_to_file_bytes(mock_file_storage):

     file_path = write_to_file(chunk_id, content)

-    assert file_path == settings.FILE_STORAGE_DIR / f"{chunk_id}.bin"
+    assert file_path == settings.CHUNK_STORAGE_DIR / f"{chunk_id}.bin"
     assert file_path.exists()
     assert file_path.read_bytes() == content

@@ -140,7 +142,7 @@ def test_write_to_file_image(mock_file_storage):

     file_path = write_to_file(chunk_id, img)

-    assert file_path == settings.FILE_STORAGE_DIR / f"{chunk_id}.png"
+    assert file_path == settings.CHUNK_STORAGE_DIR / f"{chunk_id}.png"
     assert file_path.exists()
     # Verify it's a valid image file by opening it
     image = Image.open(file_path)
@@ -192,8 +194,8 @@ def test_make_chunk_single_image(mock_file_storage, db_session):

     assert chunk.id == "00000000-0000-0000-0000-000000000002"
     assert chunk.content is None
-    assert chunk.file_path == (
-        settings.FILE_STORAGE_DIR / "00000000-0000-0000-0000-000000000002.png",
+    assert chunk.file_path == str(
+        settings.CHUNK_STORAGE_DIR / "00000000-0000-0000-0000-000000000002.png",
     )
     assert chunk.embedding_model == settings.MIXED_EMBEDDING_MODEL
     assert chunk.vector == vector
@@ -217,8 +219,8 @@ def test_make_chunk_mixed_content(mock_file_storage, db_session):

     assert chunk.id == "00000000-0000-0000-0000-000000000003"
     assert chunk.content is None
-    assert chunk.file_path == (
-        settings.FILE_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_*",
+    assert chunk.file_path == str(
+        settings.CHUNK_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_*",
     )
     assert chunk.embedding_model == settings.MIXED_EMBEDDING_MODEL
     assert chunk.vector == vector
@@ -226,8 +228,8 @@ def test_make_chunk_mixed_content(mock_file_storage, db_session):

     # Verify the files exist
     assert (
-        settings.FILE_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_0.txt"
+        settings.CHUNK_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_0.txt"
     ).exists()
     assert (
-        settings.FILE_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_1.png"
+        settings.CHUNK_STORAGE_DIR / "00000000-0000-0000-0000-000000000003_1.png"
     ).exists()