better search

This commit is contained in:
EC2 Default User 2025-06-25 23:43:14 +00:00
parent 5e836337e2
commit 8855e8715a
9 changed files with 66 additions and 42 deletions

View File

@ -1,6 +1,7 @@
import { useState, useEffect } from 'react'
import ReactMarkdown from 'react-markdown'
import { useMCP } from '@/hooks/useMCP'
import { SERVER_URL } from '@/hooks/useAuth'
export type SearchItem = {
filename: string
@ -24,10 +25,10 @@ export const Tag = ({ tags }: { tags: string[] }) => {
export const TextResult = ({ filename, content, chunks, tags, metadata }: SearchItem) => {
return (
<div className="search-result-card">
<h4>{filename || 'Untitled'}</h4>
<h4>{filename || metadata?.title || metadata?.url || 'Untitled'}</h4>
<Tag tags={tags} />
<Metadata metadata={metadata} />
<p className="result-content">{content || 'No content available'}</p>
{content && <p className="result-content">{content}</p>}
{chunks && chunks.length > 0 && (
<details className="result-chunks">
<summary>Relevant sections:</summary>
@ -46,12 +47,14 @@ export const TextResult = ({ filename, content, chunks, tags, metadata }: Search
export const MarkdownResult = ({ filename, content, chunks, tags, metadata }: SearchItem) => {
return (
<div className="search-result-card">
<h4>{filename || 'Untitled'}</h4>
<h4>{filename || metadata?.title || metadata?.url || 'Untitled'}</h4>
<Tag tags={tags} />
<Metadata metadata={metadata} />
<div className="markdown-content">
<ReactMarkdown>{content || 'No content available'}</ReactMarkdown>
</div>
{content && (
<div className="markdown-content">
<ReactMarkdown>{content}</ReactMarkdown>
</div>
)}
{chunks && chunks.length > 0 && (
<details className="result-chunks">
<summary>Relevant sections:</summary>
@ -76,7 +79,7 @@ export const ImageResult = ({ filename, tags, metadata }: SearchItem) => {
const [content, setContent] = useState<string>()
useEffect(() => {
const fetchImage = async () => {
const files = await fetchFile(filename.replace('/app/memory_files/', ''))
const files = await fetchFile(filename)
const {mime_type, content} = files[0]
setMimeType(mime_type)
setContent(content)
@ -94,13 +97,26 @@ export const ImageResult = ({ filename, tags, metadata }: SearchItem) => {
)
}
/**
 * Renders one metadata entry as a list item.
 *
 * - `url` entries with an http(s) string value are rendered as a link.
 * - `filename` entries with a string value link to `${SERVER_URL}/files/<path>`,
 *   with every path segment percent-encoded.
 * - Any other string is rendered as `key: value`.
 * - Non-string values (objects, arrays, numbers, ...) are rendered as
 *   `key: <JSON>`, including when the key is `url` or `filename`.
 */
const MetadataItem = ({ item, value }: { item: string, value: unknown }) => {
    if (typeof value === 'string') {
        // Nothing in this component validates the value, so only plain
        // http(s) URLs are turned into links; other schemes (notably
        // `javascript:`) fall through and are shown as text.
        if (item === "url" && /^https?:\/\//i.test(value)) {
            return <li><a href={value}>{value}</a></li>
        }
        if (item === "filename") {
            // Encode segment by segment: spaces, `#`, `?` and similar would
            // otherwise truncate or corrupt the link, while the `/`
            // separators must survive as-is.
            // NOTE(review): assumes filenames are stored raw, not already
            // percent-encoded — confirm against the server's /files route.
            const encodedPath = value
                .split('/')
                .map((segment) => encodeURIComponent(segment))
                .join('/')
            return <li><a href={`${SERVER_URL}/files/${encodedPath}`}>{value}</a></li>
        }
        return <li>{item}: {value}</li>
    }
    // Objects cannot be rendered directly as React children, so every
    // non-string value is serialised instead.
    return <li>{item}: {JSON.stringify(value)}</li>
}
export const Metadata = ({ metadata }: { metadata: any }) => {
if (!metadata) return null
return (
<div className="metadata">
<ul>
{Object.entries(metadata).map(([key, value]) => (
<li key={key}>{key}: {typeof value === 'string' ? value : JSON.stringify(value)}</li>
<MetadataItem key={key} item={key} value={value} />
))}
</ul>
</div>
@ -112,7 +128,7 @@ export const PDFResult = ({ filename, content, tags, metadata }: SearchItem) =>
<div className="search-result-card">
<h4>{filename || 'Untitled'}</h4>
<Tag tags={tags} />
<a href={`http://localhost:8000/files/${filename}`}>View PDF</a>
<a href={`${SERVER_URL}/files/${filename}`}>View PDF</a>
<Metadata metadata={metadata} />
{content && <div className="markdown-content">
<details>

View File

@ -1,7 +1,7 @@
import { useState, useEffect, useCallback } from 'react'
const SERVER_URL = import.meta.env.VITE_SERVER_URL || 'http://localhost:8000'
const SESSION_COOKIE_NAME = import.meta.env.VITE_SESSION_COOKIE_NAME || 'session_id'
export const SERVER_URL = import.meta.env.VITE_SERVER_URL || 'http://localhost:8000'
export const SESSION_COOKIE_NAME = import.meta.env.VITE_SESSION_COOKIE_NAME || 'session_id'
// Cookie utilities
const getCookie = (name: string) => {

View File

@ -96,6 +96,7 @@ class EmailAttachmentAdmin(ModelView, model=EmailAttachment):
column_searchable_list = [
"filename",
"mime_type",
"id",
]
@ -103,7 +104,7 @@ class BlogPostAdmin(ModelView, model=BlogPost):
column_list = source_columns(
BlogPost, "title", "author", "url", "published", "domain"
)
column_searchable_list = ["title", "author", "domain"]
column_searchable_list = ["title", "author", "domain", "id", "url"]
class ForumPostAdmin(ModelView, model=ForumPost):
@ -118,7 +119,7 @@ class ForumPostAdmin(ModelView, model=ForumPost):
"comments",
"score",
)
column_searchable_list = ["title", "authors"]
column_searchable_list = ["title", "authors", "id"]
class PhotoAdmin(ModelView, model=Photo):
@ -127,7 +128,7 @@ class PhotoAdmin(ModelView, model=Photo):
class ComicAdmin(ModelView, model=Comic):
column_list = source_columns(Comic, "title", "author", "published", "volume")
column_searchable_list = ["title", "author"]
column_searchable_list = ["title", "author", "id"]
class BookSectionAdmin(ModelView, model=BookSection):
@ -139,12 +140,12 @@ class BookSectionAdmin(ModelView, model=BookSection):
"start_page",
"end_page",
)
column_searchable_list = ["section_title"]
column_searchable_list = ["section_title", "id"]
class MiscDocAdmin(ModelView, model=MiscDoc):
column_list = source_columns(MiscDoc, "path")
column_searchable_list = ["path"]
column_searchable_list = ["path", "id"]
class BookAdmin(ModelView, model=Book):
@ -156,7 +157,7 @@ class BookAdmin(ModelView, model=Book):
"series_number",
"published",
]
column_searchable_list = ["title", "author"]
column_searchable_list = ["title", "author", "id"]
class ArticleFeedAdmin(ModelView, model=ArticleFeed):
@ -170,7 +171,7 @@ class ArticleFeedAdmin(ModelView, model=ArticleFeed):
"created_at",
"updated_at",
]
column_searchable_list = ["title", "url"]
column_searchable_list = ["title", "url", "id"]
class EmailAccountAdmin(ModelView, model=EmailAccount):
@ -186,7 +187,7 @@ class EmailAccountAdmin(ModelView, model=EmailAccount):
"created_at",
"updated_at",
]
column_searchable_list = ["name", "email_address"]
column_searchable_list = ["name", "email_address", "id"]
class AgentObservationAdmin(ModelView, model=AgentObservation):
@ -199,7 +200,7 @@ class AgentObservationAdmin(ModelView, model=AgentObservation):
"evidence",
"inserted_at",
]
column_searchable_list = ["subject", "observation_type"]
column_searchable_list = ["subject", "observation_type", "id"]
column_default_sort = [("inserted_at", True)]
column_sortable_list = ["inserted_at"]
@ -214,7 +215,7 @@ class NoteAdmin(ModelView, model=Note):
"tags",
"inserted_at",
]
column_searchable_list = ["subject", "content"]
column_searchable_list = ["subject", "content", "id"]
column_default_sort = [("inserted_at", True)]
column_sortable_list = ["inserted_at"]

View File

@ -5,6 +5,7 @@ FastAPI application for the knowledge base.
import contextlib
import os
import logging
import mimetypes
from fastapi import FastAPI, UploadFile, Request, HTTPException
from fastapi.responses import FileResponse
@ -55,7 +56,12 @@ async def serve_file(path: str):
file_path = settings.FILE_STORAGE_DIR / path
if not file_path.is_file():
raise HTTPException(status_code=404, detail="File not found")
return FileResponse(file_path)
mime_type, _ = mimetypes.guess_type(str(file_path))
if mime_type is None:
mime_type = "application/octet-stream"
return FileResponse(file_path, media_type=mime_type)
async def input_type(item: str | UploadFile) -> list[extract.DataChunk]:
@ -71,13 +77,6 @@ async def input_type(item: str | UploadFile) -> list[extract.DataChunk]:
# SQLAdmin setup with OAuth protection
engine = get_engine()
admin = Admin(app, engine)
admin.app.add_middleware(
CORSMiddleware,
allow_origins=["*"], # [settings.SERVER_URL],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# Setup admin with OAuth protection using existing OAuth provider
setup_admin(admin)

View File

@ -8,7 +8,7 @@ import qdrant_client
from PIL import Image
from qdrant_client.http import models as qdrant_models
from memory.common import embedding, extract, qdrant
from memory.common import embedding, extract, qdrant, settings
from memory.common.db.connection import make_session
from memory.common.db.models import Chunk
from memory.api.search.utils import SourceData, AnnotatedChunk, SearchFilters
@ -22,9 +22,16 @@ def annotated_chunk(
def serialize_item(item: bytes | str | Image.Image) -> str | None:
if not previews and not isinstance(item, str):
return None
if not previews and isinstance(item, str):
return item[:100]
if (
not previews
and isinstance(item, str)
and len(item) > settings.MAX_NON_PREVIEW_LENGTH
):
return item[: settings.MAX_NON_PREVIEW_LENGTH] + "..."
elif isinstance(item, str):
if len(item) > settings.MAX_PREVIEW_LENGTH:
return None
return item
if isinstance(item, Image.Image):
buffer = io.BytesIO()
format = item.format or "PNG"
@ -33,8 +40,6 @@ def annotated_chunk(
return f"data:{mime_type};base64,{base64.b64encode(buffer.getvalue()).decode('utf-8')}"
elif isinstance(item, bytes):
return base64.b64encode(item).decode("utf-8")
elif isinstance(item, str):
return item
else:
raise ValueError(f"Unsupported item type: {type(item)}")

View File

@ -42,7 +42,7 @@ class SourceData(BaseModel):
mime_type=source.mime_type,
filename=source.filename,
content_length=len(source.content) if source.content else 0,
contents=display_contents,
contents={k: v for k, v in display_contents.items() if v is not None},
created_at=source.inserted_at,
)
@ -104,10 +104,9 @@ def group_chunks(
source_lookup[source.id] = source
def get_content(text: str | dict | None) -> str | dict | None:
if preview or not text or not isinstance(text, str) or len(text) < 250:
return text
return text[:250] + "..."
if isinstance(text, str) and len(text) > settings.MAX_PREVIEW_LENGTH:
return None
return text
def make_result(source: SourceData, chunks: list[AnnotatedChunk]) -> SearchResult:
contents = source.contents or {}

View File

@ -369,9 +369,11 @@ class SourceItem(Base):
@property
def display_contents(self) -> str | dict | None:
payload = self.as_payload()
payload.pop("id", None) # type: ignore
return {
**payload,
"tags": self.tags,
"size": self.size,
"content": self.content,
"filename": self.filename,
"mime_type": self.mime_type,

View File

@ -135,6 +135,8 @@ SUMMARIZER_MODEL = os.getenv("SUMMARIZER_MODEL", "anthropic/claude-3-haiku-20240
# Search settings
ENABLE_EMBEDDING_SEARCH = boolean_env("ENABLE_EMBEDDING_SEARCH", True)
ENABLE_BM25_SEARCH = boolean_env("ENABLE_BM25_SEARCH", True)
MAX_PREVIEW_LENGTH = int(os.getenv("MAX_PREVIEW_LENGTH", DEFAULT_CHUNK_TOKENS * 8))
MAX_NON_PREVIEW_LENGTH = int(os.getenv("MAX_NON_PREVIEW_LENGTH", 2000))
# API settings
SERVER_URL = os.getenv("SERVER_URL", "http://localhost:8000")

View File

@ -30,7 +30,7 @@ def create_book_from_ebook(ebook, tags: Iterable[str] = []) -> Book:
publisher=ebook.metadata.get("creator"),
language=ebook.metadata.get("language"),
total_pages=ebook.n_pages,
file_path=ebook.file_path.as_posix(),
file_path=ebook.file_path.relative_to(settings.FILE_STORAGE_DIR).as_posix(),
book_metadata=ebook.metadata,
tags=tags,
)