Compare commits

...

6 Commits

Author SHA1 Message Date
Daniel O'Connell
a3daea883b fix tests 2025-06-26 14:12:42 +02:00
EC2 Default User
80020e2a61 proper build for frontend 2025-06-26 11:12:50 +00:00
EC2 Default User
a424c5f4d0 properly build frontend 2025-06-26 10:54:24 +00:00
EC2 Default User
f0d441ffe9 use LW in search 2025-06-26 10:32:36 +00:00
Daniel O'Connell
1538d38bf6 proper parsing for SSC 2025-06-26 10:27:00 +02:00
EC2 Default User
4049cf15b4 better api build 2025-06-26 08:07:32 +00:00
11 changed files with 94 additions and 29 deletions

4
.gitignore vendored
View File

@ -1,4 +1,8 @@
Books
CLAUDE.md
memory_files
venv
.env
.DS_Store
secrets/

View File

@ -141,6 +141,9 @@ services:
build:
context: .
dockerfile: docker/api/Dockerfile
args:
SERVER_URL: "${SERVER_URL:-http://localhost:8000}"
SESSION_COOKIE_NAME: "${SESSION_COOKIE_NAME:-session_id}"
restart: unless-stopped
networks: [kbnet]
depends_on: [postgres, rabbitmq, qdrant]
@ -152,6 +155,7 @@ services:
VITE_SERVER_URL: "${SERVER_URL:-http://localhost:8000}"
STATIC_DIR: "/app/static"
VOYAGE_API_KEY: ${VOYAGE_API_KEY}
ENABLE_BM25_SEARCH: false
secrets: [postgres_password]
volumes:
- ./memory_files:/app/memory_files:rw

View File

@ -1,14 +1,5 @@
# Frontend build stage
FROM node:18-alpine AS frontend-builder
WORKDIR /frontend
COPY frontend/package*.json ./
RUN npm install
COPY frontend/ ./
RUN npm run build
# Backend build stage
FROM python:3.11-slim
# Backend base stage
FROM python:3.11-slim AS backend-base
WORKDIR /app
@ -19,18 +10,37 @@ RUN apt-get update && apt-get install -y \
python3-dev \
&& rm -rf /var/lib/apt/lists/*
# Copy requirements files and setup
# Copy and install Python requirements
COPY requirements ./requirements/
RUN mkdir src
COPY setup.py ./
# Do an initial install to get the dependencies cached
RUN pip install -e ".[api]"
# Install the package with common dependencies
# Frontend build stage
FROM node:18-alpine AS frontend-builder
WORKDIR /frontend
COPY frontend/package*.json ./
RUN npm install
COPY frontend/ ./
# Set Vite environment variables for build from build args
ARG SERVER_URL
ARG SESSION_COOKIE_NAME
ENV VITE_SERVER_URL=${SERVER_URL}
ENV VITE_SESSION_COOKIE_NAME=${SESSION_COOKIE_NAME}
RUN npm run build
# Final stage
FROM backend-base
# Install the package with Python source code
COPY src/ ./src/
RUN pip install -e ".[api]"
# Copy frontend build output from previous stage
# Copy frontend build output from frontend stage
COPY --from=frontend-builder /frontend/dist ./static/
# Run as non-root user

View File

@ -6,18 +6,23 @@ import asyncio
import logging
from typing import Optional
from memory.api.search.embeddings import search_embeddings
from memory.api.search.bm25 import search_bm25
from memory.api.search.utils import SearchFilters, SearchResult
from memory.api.search.utils import group_chunks, with_timeout
from memory.common import extract
from memory.common import extract, settings
from memory.common.collections import (
ALL_COLLECTIONS,
MULTIMODAL_COLLECTIONS,
TEXT_COLLECTIONS,
)
from memory.common import settings
from memory.api.search.embeddings import search_embeddings
if settings.ENABLE_BM25_SEARCH:
from memory.api.search.bm25 import search_bm25
from memory.api.search.utils import (
SearchFilters,
SearchResult,
group_chunks,
with_timeout,
)
logger = logging.getLogger(__name__)

View File

@ -370,7 +370,7 @@ class SourceItem(Base):
@property
def display_contents(self) -> str | dict | None:
payload = self.as_payload()
payload.pop("id", None) # type: ignore
payload.pop("source_id", None) # type: ignore
return {
**payload,
"tags": self.tags,

View File

@ -21,7 +21,7 @@ from sqlalchemy import (
Text,
func,
)
from sqlalchemy.dialects.postgresql import JSONB, TSVECTOR, UUID
from sqlalchemy.dialects.postgresql import JSONB, TSVECTOR
from sqlalchemy.orm import relationship
from memory.common import settings
@ -31,7 +31,6 @@ import memory.common.formatters.observation as observation
from memory.common.db.models.source_item import (
SourceItem,
Chunk,
SourceItemPayload,
clean_filename,
chunk_mixed,
@ -92,9 +91,9 @@ class MailMessage(SourceItem):
def as_payload(self) -> MailMessagePayload:
base_payload = super().as_payload() | {
"tags": cast(list[str], self.tags)
"tags": (cast(list[str], self.tags) or [])
+ [cast(str, self.sender)]
+ cast(list[str], self.recipients)
+ (cast(list[str], self.recipients) or [])
}
return MailMessagePayload(
**cast(dict, base_payload),
@ -576,6 +575,11 @@ class ForumPost(SourceItem):
def _chunk_contents(self) -> Sequence[extract.DataChunk]:
    """Build this item's data chunks from its content and images.

    Both attributes are passed to ``chunk_mixed``; the ``cast`` calls only
    narrow the static types and have no runtime effect.
    """
    content = cast(str, self.content)
    images = cast(list[str], self.images)
    return chunk_mixed(content, images)
@classmethod
def get_collections(cls) -> list[str]:
    """Return the collection names this model is stored under.

    Always the single name ``"forum"``.
    """
    # Original author's note: the collection names were not kept consistent,
    # and Qdrant doesn't allow renaming collections, so the name is pinned here.
    collections = ["forum"]
    return collections
class MiscDoc(SourceItem):
__tablename__ = "misc_doc"

View File

@ -532,6 +532,36 @@ class NadiaXyzParser(BaseHTMLParser):
]
class SlateStarCodexParser(BaseHTMLParser):
    """Parser for slatestarcodex.com (Scott Alexander's blog).

    Purely declarative: it overrides the selector/format attributes of
    ``BaseHTMLParser`` and adds site-specific entries to the inherited
    ``remove_selectors`` list.
    """

    # Fixed author for the whole site.
    author = "Scott Alexander"

    # Dates look like "January 21, 2021".
    date_format = "%B %d, %Y"

    # CSS selectors for the parts of the page to extract.
    article_selector = ".post, .hentry, [id^='post-']"
    title_selector = "h1.pjgm-posttitle, h1"
    author_selector = ".author.vcard a, .url.fn.n"
    date_selector = ".entry-date"
    content_selector = ".pjgm-postcontent"

    # Base-class removals plus site-specific ones. Grouping below is by
    # selector name only -- NOTE(review): confirm against the live markup.
    remove_selectors = BaseHTMLParser.remove_selectors + [
        # Post meta / utility / navigation
        ".pjgm-postmeta",
        ".pjgm-postutility",
        ".pjgm-navigation",
        "#pjgm-navbelow",
        # Comments
        "#comments",
        ".commentlist",
        # Sidebars and widgets
        ".widget-area",
        "#left-sidebar",
        "#primary",
        ".sidebar-toggle",
        # Advertisement divs
        ".aar_div",
        # Site header, footer, menu bar and big title
        ".pjgm-header",
        ".pjgm-footer",
        "#pjgm-menubar",
        "#pjgm-bigtitle",
    ]
class BloombergParser(BaseHTMLParser):
"""Parser for bloomberg.com."""
@ -578,6 +608,7 @@ PARSER_REGISTRY = {
r"theredhandfiles\.com": TheRedHandFilesParser,
r"rachelbythebay\.com": RachelByTheBayParser,
r"nadia\.xyz": NadiaXyzParser,
r"slatestarcodex\.com": SlateStarCodexParser,
}

View File

@ -4,8 +4,10 @@ from typing import cast
import pytest
from PIL import Image
from memory.common import settings, chunker, extract
from memory.common.db.models.source_items import (
from memory.common.db.models.source_item import (
Chunk,
)
from memory.common.db.models.source_items import (
MailMessage,
)
from memory.common.db.models.source_item import (

View File

@ -203,6 +203,8 @@ Test Body Content"""
"sender": "sender@example.com",
"recipients": ["recipient@example.com"],
"tags": None,
"folder": None,
"message_id": "<test@example.com>",
}

View File

@ -3,6 +3,7 @@ from pathlib import Path
from unittest.mock import patch, Mock
from memory.common.db.models import Book, BookSection
from memory.common import settings
from memory.parsers.ebook import Ebook, Section
from memory.workers.tasks import ebook
@ -46,7 +47,7 @@ def mock_ebook():
end_page=20,
),
],
file_path=Path("/test/book.epub"),
file_path=settings.FILE_STORAGE_DIR / "test/book.epub",
n_pages=20,
)
@ -70,7 +71,7 @@ def test_create_book_from_ebook(mock_ebook):
assert book.author == "Test Author" # type: ignore
assert book.publisher == "Test Publisher" # type: ignore
assert book.language == "en" # type: ignore
assert book.file_path == "/test/book.epub" # type: ignore
assert book.file_path == "test/book.epub" # type: ignore
assert book.total_pages == 20 # type: ignore
assert book.book_metadata == { # type: ignore
"language": "en",

View File

@ -257,6 +257,8 @@ def test_create_mail_message(db_session):
"recipients": ["recipient@example.com"],
"date": "2023-01-01T12:00:00+00:00",
"mime_type": "message/rfc822",
"folder": "INBOX",
"message_id": "321",
"size": 412,
"tags": ["test"],
"filename": None,