Compare commits

...

6 Commits

Author SHA1 Message Date
Daniel O'Connell
a3daea883b fix tests 2025-06-26 14:12:42 +02:00
EC2 Default User
80020e2a61 proper build for frontend 2025-06-26 11:12:50 +00:00
EC2 Default User
a424c5f4d0 properly build frontend 2025-06-26 10:54:24 +00:00
EC2 Default User
f0d441ffe9 use LW in search 2025-06-26 10:32:36 +00:00
Daniel O'Connell
1538d38bf6 proper parsing for SSC 2025-06-26 10:27:00 +02:00
EC2 Default User
4049cf15b4 better api build 2025-06-26 08:07:32 +00:00
11 changed files with 94 additions and 29 deletions

4
.gitignore vendored
View File

@ -1,4 +1,8 @@
Books
CLAUDE.md
memory_files memory_files
venv
.env .env
.DS_Store .DS_Store
secrets/ secrets/

View File

@ -141,6 +141,9 @@ services:
build: build:
context: . context: .
dockerfile: docker/api/Dockerfile dockerfile: docker/api/Dockerfile
args:
SERVER_URL: "${SERVER_URL:-http://localhost:8000}"
SESSION_COOKIE_NAME: "${SESSION_COOKIE_NAME:-session_id}"
restart: unless-stopped restart: unless-stopped
networks: [kbnet] networks: [kbnet]
depends_on: [postgres, rabbitmq, qdrant] depends_on: [postgres, rabbitmq, qdrant]
@ -152,6 +155,7 @@ services:
VITE_SERVER_URL: "${SERVER_URL:-http://localhost:8000}" VITE_SERVER_URL: "${SERVER_URL:-http://localhost:8000}"
STATIC_DIR: "/app/static" STATIC_DIR: "/app/static"
VOYAGE_API_KEY: ${VOYAGE_API_KEY} VOYAGE_API_KEY: ${VOYAGE_API_KEY}
ENABLE_BM25_SEARCH: false
secrets: [postgres_password] secrets: [postgres_password]
volumes: volumes:
- ./memory_files:/app/memory_files:rw - ./memory_files:/app/memory_files:rw

View File

@ -1,14 +1,5 @@
# Frontend build stage # Backend base stage
FROM node:18-alpine AS frontend-builder FROM python:3.11-slim AS backend-base
WORKDIR /frontend
COPY frontend/package*.json ./
RUN npm install
COPY frontend/ ./
RUN npm run build
# Backend build stage
FROM python:3.11-slim
WORKDIR /app WORKDIR /app
@ -19,18 +10,37 @@ RUN apt-get update && apt-get install -y \
python3-dev \ python3-dev \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*
# Copy requirements files and setup # Copy and install Python requirements
COPY requirements ./requirements/ COPY requirements ./requirements/
RUN mkdir src RUN mkdir src
COPY setup.py ./ COPY setup.py ./
# Do an initial install to get the dependencies cached # Do an initial install to get the dependencies cached
RUN pip install -e ".[api]" RUN pip install -e ".[api]"
# Install the package with common dependencies # Frontend build stage
FROM node:18-alpine AS frontend-builder
WORKDIR /frontend
COPY frontend/package*.json ./
RUN npm install
COPY frontend/ ./
# Set Vite environment variables for build from build args
ARG SERVER_URL
ARG SESSION_COOKIE_NAME
ENV VITE_SERVER_URL=${SERVER_URL}
ENV VITE_SESSION_COOKIE_NAME=${SESSION_COOKIE_NAME}
RUN npm run build
# Final stage
FROM backend-base
# Install the package with Python source code
COPY src/ ./src/ COPY src/ ./src/
RUN pip install -e ".[api]" RUN pip install -e ".[api]"
# Copy frontend build output from previous stage # Copy frontend build output from frontend stage
COPY --from=frontend-builder /frontend/dist ./static/ COPY --from=frontend-builder /frontend/dist ./static/
# Run as non-root user # Run as non-root user

View File

@ -6,18 +6,23 @@ import asyncio
import logging import logging
from typing import Optional from typing import Optional
from memory.api.search.embeddings import search_embeddings from memory.common import extract, settings
from memory.api.search.bm25 import search_bm25
from memory.api.search.utils import SearchFilters, SearchResult
from memory.api.search.utils import group_chunks, with_timeout
from memory.common import extract
from memory.common.collections import ( from memory.common.collections import (
ALL_COLLECTIONS, ALL_COLLECTIONS,
MULTIMODAL_COLLECTIONS, MULTIMODAL_COLLECTIONS,
TEXT_COLLECTIONS, TEXT_COLLECTIONS,
) )
from memory.common import settings from memory.api.search.embeddings import search_embeddings
if settings.ENABLE_BM25_SEARCH:
from memory.api.search.bm25 import search_bm25
from memory.api.search.utils import (
SearchFilters,
SearchResult,
group_chunks,
with_timeout,
)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)

View File

@ -370,7 +370,7 @@ class SourceItem(Base):
@property @property
def display_contents(self) -> str | dict | None: def display_contents(self) -> str | dict | None:
payload = self.as_payload() payload = self.as_payload()
payload.pop("id", None) # type: ignore payload.pop("source_id", None) # type: ignore
return { return {
**payload, **payload,
"tags": self.tags, "tags": self.tags,

View File

@ -21,7 +21,7 @@ from sqlalchemy import (
Text, Text,
func, func,
) )
from sqlalchemy.dialects.postgresql import JSONB, TSVECTOR, UUID from sqlalchemy.dialects.postgresql import JSONB, TSVECTOR
from sqlalchemy.orm import relationship from sqlalchemy.orm import relationship
from memory.common import settings from memory.common import settings
@ -31,7 +31,6 @@ import memory.common.formatters.observation as observation
from memory.common.db.models.source_item import ( from memory.common.db.models.source_item import (
SourceItem, SourceItem,
Chunk,
SourceItemPayload, SourceItemPayload,
clean_filename, clean_filename,
chunk_mixed, chunk_mixed,
@ -92,9 +91,9 @@ class MailMessage(SourceItem):
def as_payload(self) -> MailMessagePayload: def as_payload(self) -> MailMessagePayload:
base_payload = super().as_payload() | { base_payload = super().as_payload() | {
"tags": cast(list[str], self.tags) "tags": (cast(list[str], self.tags) or [])
+ [cast(str, self.sender)] + [cast(str, self.sender)]
+ cast(list[str], self.recipients) + (cast(list[str], self.recipients) or [])
} }
return MailMessagePayload( return MailMessagePayload(
**cast(dict, base_payload), **cast(dict, base_payload),
@ -576,6 +575,11 @@ class ForumPost(SourceItem):
def _chunk_contents(self) -> Sequence[extract.DataChunk]: def _chunk_contents(self) -> Sequence[extract.DataChunk]:
return chunk_mixed(cast(str, self.content), cast(list[str], self.images)) return chunk_mixed(cast(str, self.content), cast(list[str], self.images))
@classmethod
def get_collections(cls) -> list[str]:
# Very sad that I didn't keep the names consistent... Qdrant doesn't allow renaming collections
return ["forum"]
class MiscDoc(SourceItem): class MiscDoc(SourceItem):
__tablename__ = "misc_doc" __tablename__ = "misc_doc"

View File

@ -532,6 +532,36 @@ class NadiaXyzParser(BaseHTMLParser):
] ]
class SlateStarCodexParser(BaseHTMLParser):
"""Parser for slatestarcodex.com (Scott Alexander's blog)."""
article_selector = ".post, .hentry, [id^='post-']"
title_selector = "h1.pjgm-posttitle, h1"
author_selector = ".author.vcard a, .url.fn.n"
date_selector = ".entry-date"
date_format = "%B %d, %Y" # "January 21, 2021" format
content_selector = ".pjgm-postcontent"
author = "Scott Alexander"
remove_selectors = BaseHTMLParser.remove_selectors + [
".pjgm-postmeta",
".pjgm-postutility",
".pjgm-navigation",
"#pjgm-navbelow",
"#comments",
".commentlist",
".widget-area",
"#left-sidebar",
"#primary",
".sidebar-toggle",
".aar_div", # Advertisement divs
".pjgm-header",
".pjgm-footer",
"#pjgm-menubar",
"#pjgm-bigtitle",
]
class BloombergParser(BaseHTMLParser): class BloombergParser(BaseHTMLParser):
"""Parser for bloomberg.com.""" """Parser for bloomberg.com."""
@ -578,6 +608,7 @@ PARSER_REGISTRY = {
r"theredhandfiles\.com": TheRedHandFilesParser, r"theredhandfiles\.com": TheRedHandFilesParser,
r"rachelbythebay\.com": RachelByTheBayParser, r"rachelbythebay\.com": RachelByTheBayParser,
r"nadia\.xyz": NadiaXyzParser, r"nadia\.xyz": NadiaXyzParser,
r"slatestarcodex\.com": SlateStarCodexParser,
} }

View File

@ -4,8 +4,10 @@ from typing import cast
import pytest import pytest
from PIL import Image from PIL import Image
from memory.common import settings, chunker, extract from memory.common import settings, chunker, extract
from memory.common.db.models.source_items import ( from memory.common.db.models.source_item import (
Chunk, Chunk,
)
from memory.common.db.models.source_items import (
MailMessage, MailMessage,
) )
from memory.common.db.models.source_item import ( from memory.common.db.models.source_item import (

View File

@ -203,6 +203,8 @@ Test Body Content"""
"sender": "sender@example.com", "sender": "sender@example.com",
"recipients": ["recipient@example.com"], "recipients": ["recipient@example.com"],
"tags": None, "tags": None,
"folder": None,
"message_id": "<test@example.com>",
} }

View File

@ -3,6 +3,7 @@ from pathlib import Path
from unittest.mock import patch, Mock from unittest.mock import patch, Mock
from memory.common.db.models import Book, BookSection from memory.common.db.models import Book, BookSection
from memory.common import settings
from memory.parsers.ebook import Ebook, Section from memory.parsers.ebook import Ebook, Section
from memory.workers.tasks import ebook from memory.workers.tasks import ebook
@ -46,7 +47,7 @@ def mock_ebook():
end_page=20, end_page=20,
), ),
], ],
file_path=Path("/test/book.epub"), file_path=settings.FILE_STORAGE_DIR / "test/book.epub",
n_pages=20, n_pages=20,
) )
@ -70,7 +71,7 @@ def test_create_book_from_ebook(mock_ebook):
assert book.author == "Test Author" # type: ignore assert book.author == "Test Author" # type: ignore
assert book.publisher == "Test Publisher" # type: ignore assert book.publisher == "Test Publisher" # type: ignore
assert book.language == "en" # type: ignore assert book.language == "en" # type: ignore
assert book.file_path == "/test/book.epub" # type: ignore assert book.file_path == "test/book.epub" # type: ignore
assert book.total_pages == 20 # type: ignore assert book.total_pages == 20 # type: ignore
assert book.book_metadata == { # type: ignore assert book.book_metadata == { # type: ignore
"language": "en", "language": "en",

View File

@ -257,6 +257,8 @@ def test_create_mail_message(db_session):
"recipients": ["recipient@example.com"], "recipients": ["recipient@example.com"],
"date": "2023-01-01T12:00:00+00:00", "date": "2023-01-01T12:00:00+00:00",
"mime_type": "message/rfc822", "mime_type": "message/rfc822",
"folder": "INBOX",
"message_id": "321",
"size": 412, "size": 412,
"tags": ["test"], "tags": ["test"],
"filename": None, "filename": None,