better search

This commit is contained in:
EC2 Default User 2025-06-25 23:43:14 +00:00
parent 5e836337e2
commit 8855e8715a
9 changed files with 66 additions and 42 deletions

View File

@ -1,6 +1,7 @@
import { useState, useEffect } from 'react' import { useState, useEffect } from 'react'
import ReactMarkdown from 'react-markdown' import ReactMarkdown from 'react-markdown'
import { useMCP } from '@/hooks/useMCP' import { useMCP } from '@/hooks/useMCP'
import { SERVER_URL } from '@/hooks/useAuth'
export type SearchItem = { export type SearchItem = {
filename: string filename: string
@ -24,10 +25,10 @@ export const Tag = ({ tags }: { tags: string[] }) => {
export const TextResult = ({ filename, content, chunks, tags, metadata }: SearchItem) => { export const TextResult = ({ filename, content, chunks, tags, metadata }: SearchItem) => {
return ( return (
<div className="search-result-card"> <div className="search-result-card">
<h4>{filename || 'Untitled'}</h4> <h4>{filename || metadata?.title || metadata?.url || 'Untitled'}</h4>
<Tag tags={tags} /> <Tag tags={tags} />
<Metadata metadata={metadata} /> <Metadata metadata={metadata} />
<p className="result-content">{content || 'No content available'}</p> {content && <p className="result-content">{content}</p>}
{chunks && chunks.length > 0 && ( {chunks && chunks.length > 0 && (
<details className="result-chunks"> <details className="result-chunks">
<summary>Relevant sections:</summary> <summary>Relevant sections:</summary>
@ -46,12 +47,14 @@ export const TextResult = ({ filename, content, chunks, tags, metadata }: Search
export const MarkdownResult = ({ filename, content, chunks, tags, metadata }: SearchItem) => { export const MarkdownResult = ({ filename, content, chunks, tags, metadata }: SearchItem) => {
return ( return (
<div className="search-result-card"> <div className="search-result-card">
<h4>{filename || 'Untitled'}</h4> <h4>{filename || metadata?.title || metadata?.url || 'Untitled'}</h4>
<Tag tags={tags} /> <Tag tags={tags} />
<Metadata metadata={metadata} /> <Metadata metadata={metadata} />
<div className="markdown-content"> {content && (
<ReactMarkdown>{content || 'No content available'}</ReactMarkdown> <div className="markdown-content">
</div> <ReactMarkdown>{content}</ReactMarkdown>
</div>
)}
{chunks && chunks.length > 0 && ( {chunks && chunks.length > 0 && (
<details className="result-chunks"> <details className="result-chunks">
<summary>Relevant sections:</summary> <summary>Relevant sections:</summary>
@ -76,7 +79,7 @@ export const ImageResult = ({ filename, tags, metadata }: SearchItem) => {
const [content, setContent] = useState<string>() const [content, setContent] = useState<string>()
useEffect(() => { useEffect(() => {
const fetchImage = async () => { const fetchImage = async () => {
const files = await fetchFile(filename.replace('/app/memory_files/', '')) const files = await fetchFile(filename)
const {mime_type, content} = files[0] const {mime_type, content} = files[0]
setMimeType(mime_type) setMimeType(mime_type)
setContent(content) setContent(content)
@ -94,13 +97,26 @@ export const ImageResult = ({ filename, tags, metadata }: SearchItem) => {
) )
} }
/**
 * Renders a single metadata entry as a list item.
 *
 * - `url`: rendered as a link, but only when the value is a string with an
 *   explicit http(s) scheme. Anything else (non-strings, `javascript:` and
 *   other schemes) falls through to plain-text rendering so it never ends up
 *   in an `href`.
 * - `filename`: rendered as a link to `${SERVER_URL}/files/<filename>`. Each
 *   path segment is percent-encoded so characters such as `#`, `?`, `%` or
 *   spaces do not truncate or corrupt the URL; `/` separators are preserved.
 * - everything else: rendered as `key: value`, with non-string values
 *   JSON-stringified.
 *
 * @param item  The metadata key.
 * @param value The metadata value; its shape is not constrained by the caller.
 */
const MetadataItem = ({ item, value }: { item: string, value: any }) => {
    // Only non-empty strings can be turned into links.
    if (typeof value === 'string' && value !== '') {
        if (item === "url" && /^https?:\/\//i.test(value)) {
            return <li><a href={value}>{value}</a></li>
        }
        if (item === "filename") {
            // Encode per segment so nested paths keep their directory structure.
            const encodedPath = value
                .split('/')
                .map(segment => encodeURIComponent(segment))
                .join('/')
            return <li><a href={`${SERVER_URL}/files/${encodedPath}`}>{value}</a></li>
        }
    }
    if (typeof value === 'string') {
        return <li>{item}: {value}</li>
    }
    return <li>{item}: {JSON.stringify(value)}</li>
}
export const Metadata = ({ metadata }: { metadata: any }) => { export const Metadata = ({ metadata }: { metadata: any }) => {
if (!metadata) return null if (!metadata) return null
return ( return (
<div className="metadata"> <div className="metadata">
<ul> <ul>
{Object.entries(metadata).map(([key, value]) => ( {Object.entries(metadata).map(([key, value]) => (
<li key={key}>{key}: {typeof value === 'string' ? value : JSON.stringify(value)}</li> <MetadataItem key={key} item={key} value={value} />
))} ))}
</ul> </ul>
</div> </div>
@ -112,7 +128,7 @@ export const PDFResult = ({ filename, content, tags, metadata }: SearchItem) =>
<div className="search-result-card"> <div className="search-result-card">
<h4>{filename || 'Untitled'}</h4> <h4>{filename || 'Untitled'}</h4>
<Tag tags={tags} /> <Tag tags={tags} />
<a href={`http://localhost:8000/files/${filename}`}>View PDF</a> <a href={`${SERVER_URL}/files/${filename}`}>View PDF</a>
<Metadata metadata={metadata} /> <Metadata metadata={metadata} />
{content && <div className="markdown-content"> {content && <div className="markdown-content">
<details> <details>

View File

@ -1,7 +1,7 @@
import { useState, useEffect, useCallback } from 'react' import { useState, useEffect, useCallback } from 'react'
const SERVER_URL = import.meta.env.VITE_SERVER_URL || 'http://localhost:8000' export const SERVER_URL = import.meta.env.VITE_SERVER_URL || 'http://localhost:8000'
const SESSION_COOKIE_NAME = import.meta.env.VITE_SESSION_COOKIE_NAME || 'session_id' export const SESSION_COOKIE_NAME = import.meta.env.VITE_SESSION_COOKIE_NAME || 'session_id'
// Cookie utilities // Cookie utilities
const getCookie = (name: string) => { const getCookie = (name: string) => {

View File

@ -96,6 +96,7 @@ class EmailAttachmentAdmin(ModelView, model=EmailAttachment):
column_searchable_list = [ column_searchable_list = [
"filename", "filename",
"mime_type", "mime_type",
"id",
] ]
@ -103,7 +104,7 @@ class BlogPostAdmin(ModelView, model=BlogPost):
column_list = source_columns( column_list = source_columns(
BlogPost, "title", "author", "url", "published", "domain" BlogPost, "title", "author", "url", "published", "domain"
) )
column_searchable_list = ["title", "author", "domain"] column_searchable_list = ["title", "author", "domain", "id", "url"]
class ForumPostAdmin(ModelView, model=ForumPost): class ForumPostAdmin(ModelView, model=ForumPost):
@ -118,7 +119,7 @@ class ForumPostAdmin(ModelView, model=ForumPost):
"comments", "comments",
"score", "score",
) )
column_searchable_list = ["title", "authors"] column_searchable_list = ["title", "authors", "id"]
class PhotoAdmin(ModelView, model=Photo): class PhotoAdmin(ModelView, model=Photo):
@ -127,7 +128,7 @@ class PhotoAdmin(ModelView, model=Photo):
class ComicAdmin(ModelView, model=Comic): class ComicAdmin(ModelView, model=Comic):
column_list = source_columns(Comic, "title", "author", "published", "volume") column_list = source_columns(Comic, "title", "author", "published", "volume")
column_searchable_list = ["title", "author"] column_searchable_list = ["title", "author", "id"]
class BookSectionAdmin(ModelView, model=BookSection): class BookSectionAdmin(ModelView, model=BookSection):
@ -139,12 +140,12 @@ class BookSectionAdmin(ModelView, model=BookSection):
"start_page", "start_page",
"end_page", "end_page",
) )
column_searchable_list = ["section_title"] column_searchable_list = ["section_title", "id"]
class MiscDocAdmin(ModelView, model=MiscDoc): class MiscDocAdmin(ModelView, model=MiscDoc):
column_list = source_columns(MiscDoc, "path") column_list = source_columns(MiscDoc, "path")
column_searchable_list = ["path"] column_searchable_list = ["path", "id"]
class BookAdmin(ModelView, model=Book): class BookAdmin(ModelView, model=Book):
@ -156,7 +157,7 @@ class BookAdmin(ModelView, model=Book):
"series_number", "series_number",
"published", "published",
] ]
column_searchable_list = ["title", "author"] column_searchable_list = ["title", "author", "id"]
class ArticleFeedAdmin(ModelView, model=ArticleFeed): class ArticleFeedAdmin(ModelView, model=ArticleFeed):
@ -170,7 +171,7 @@ class ArticleFeedAdmin(ModelView, model=ArticleFeed):
"created_at", "created_at",
"updated_at", "updated_at",
] ]
column_searchable_list = ["title", "url"] column_searchable_list = ["title", "url", "id"]
class EmailAccountAdmin(ModelView, model=EmailAccount): class EmailAccountAdmin(ModelView, model=EmailAccount):
@ -186,7 +187,7 @@ class EmailAccountAdmin(ModelView, model=EmailAccount):
"created_at", "created_at",
"updated_at", "updated_at",
] ]
column_searchable_list = ["name", "email_address"] column_searchable_list = ["name", "email_address", "id"]
class AgentObservationAdmin(ModelView, model=AgentObservation): class AgentObservationAdmin(ModelView, model=AgentObservation):
@ -199,7 +200,7 @@ class AgentObservationAdmin(ModelView, model=AgentObservation):
"evidence", "evidence",
"inserted_at", "inserted_at",
] ]
column_searchable_list = ["subject", "observation_type"] column_searchable_list = ["subject", "observation_type", "id"]
column_default_sort = [("inserted_at", True)] column_default_sort = [("inserted_at", True)]
column_sortable_list = ["inserted_at"] column_sortable_list = ["inserted_at"]
@ -214,7 +215,7 @@ class NoteAdmin(ModelView, model=Note):
"tags", "tags",
"inserted_at", "inserted_at",
] ]
column_searchable_list = ["subject", "content"] column_searchable_list = ["subject", "content", "id"]
column_default_sort = [("inserted_at", True)] column_default_sort = [("inserted_at", True)]
column_sortable_list = ["inserted_at"] column_sortable_list = ["inserted_at"]

View File

@ -5,6 +5,7 @@ FastAPI application for the knowledge base.
import contextlib import contextlib
import os import os
import logging import logging
import mimetypes
from fastapi import FastAPI, UploadFile, Request, HTTPException from fastapi import FastAPI, UploadFile, Request, HTTPException
from fastapi.responses import FileResponse from fastapi.responses import FileResponse
@ -55,7 +56,12 @@ async def serve_file(path: str):
file_path = settings.FILE_STORAGE_DIR / path file_path = settings.FILE_STORAGE_DIR / path
if not file_path.is_file(): if not file_path.is_file():
raise HTTPException(status_code=404, detail="File not found") raise HTTPException(status_code=404, detail="File not found")
return FileResponse(file_path)
mime_type, _ = mimetypes.guess_type(str(file_path))
if mime_type is None:
mime_type = "application/octet-stream"
return FileResponse(file_path, media_type=mime_type)
async def input_type(item: str | UploadFile) -> list[extract.DataChunk]: async def input_type(item: str | UploadFile) -> list[extract.DataChunk]:
@ -71,13 +77,6 @@ async def input_type(item: str | UploadFile) -> list[extract.DataChunk]:
# SQLAdmin setup with OAuth protection # SQLAdmin setup with OAuth protection
engine = get_engine() engine = get_engine()
admin = Admin(app, engine) admin = Admin(app, engine)
admin.app.add_middleware(
CORSMiddleware,
allow_origins=["*"], # [settings.SERVER_URL],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# Setup admin with OAuth protection using existing OAuth provider # Setup admin with OAuth protection using existing OAuth provider
setup_admin(admin) setup_admin(admin)

View File

@ -8,7 +8,7 @@ import qdrant_client
from PIL import Image from PIL import Image
from qdrant_client.http import models as qdrant_models from qdrant_client.http import models as qdrant_models
from memory.common import embedding, extract, qdrant from memory.common import embedding, extract, qdrant, settings
from memory.common.db.connection import make_session from memory.common.db.connection import make_session
from memory.common.db.models import Chunk from memory.common.db.models import Chunk
from memory.api.search.utils import SourceData, AnnotatedChunk, SearchFilters from memory.api.search.utils import SourceData, AnnotatedChunk, SearchFilters
@ -22,9 +22,16 @@ def annotated_chunk(
def serialize_item(item: bytes | str | Image.Image) -> str | None: def serialize_item(item: bytes | str | Image.Image) -> str | None:
if not previews and not isinstance(item, str): if not previews and not isinstance(item, str):
return None return None
if not previews and isinstance(item, str): if (
return item[:100] not previews
and isinstance(item, str)
and len(item) > settings.MAX_NON_PREVIEW_LENGTH
):
return item[: settings.MAX_NON_PREVIEW_LENGTH] + "..."
elif isinstance(item, str):
if len(item) > settings.MAX_PREVIEW_LENGTH:
return None
return item
if isinstance(item, Image.Image): if isinstance(item, Image.Image):
buffer = io.BytesIO() buffer = io.BytesIO()
format = item.format or "PNG" format = item.format or "PNG"
@ -33,8 +40,6 @@ def annotated_chunk(
return f"data:{mime_type};base64,{base64.b64encode(buffer.getvalue()).decode('utf-8')}" return f"data:{mime_type};base64,{base64.b64encode(buffer.getvalue()).decode('utf-8')}"
elif isinstance(item, bytes): elif isinstance(item, bytes):
return base64.b64encode(item).decode("utf-8") return base64.b64encode(item).decode("utf-8")
elif isinstance(item, str):
return item
else: else:
raise ValueError(f"Unsupported item type: {type(item)}") raise ValueError(f"Unsupported item type: {type(item)}")

View File

@ -42,7 +42,7 @@ class SourceData(BaseModel):
mime_type=source.mime_type, mime_type=source.mime_type,
filename=source.filename, filename=source.filename,
content_length=len(source.content) if source.content else 0, content_length=len(source.content) if source.content else 0,
contents=display_contents, contents={k: v for k, v in display_contents.items() if v is not None},
created_at=source.inserted_at, created_at=source.inserted_at,
) )
@ -104,10 +104,9 @@ def group_chunks(
source_lookup[source.id] = source source_lookup[source.id] = source
def get_content(text: str | dict | None) -> str | dict | None: def get_content(text: str | dict | None) -> str | dict | None:
if preview or not text or not isinstance(text, str) or len(text) < 250: if isinstance(text, str) and len(text) > settings.MAX_PREVIEW_LENGTH:
return text return None
return text
return text[:250] + "..."
def make_result(source: SourceData, chunks: list[AnnotatedChunk]) -> SearchResult: def make_result(source: SourceData, chunks: list[AnnotatedChunk]) -> SearchResult:
contents = source.contents or {} contents = source.contents or {}

View File

@ -369,9 +369,11 @@ class SourceItem(Base):
@property @property
def display_contents(self) -> str | dict | None: def display_contents(self) -> str | dict | None:
payload = self.as_payload()
payload.pop("id", None) # type: ignore
return { return {
**payload,
"tags": self.tags, "tags": self.tags,
"size": self.size,
"content": self.content, "content": self.content,
"filename": self.filename, "filename": self.filename,
"mime_type": self.mime_type, "mime_type": self.mime_type,

View File

@ -135,6 +135,8 @@ SUMMARIZER_MODEL = os.getenv("SUMMARIZER_MODEL", "anthropic/claude-3-haiku-20240
# Search settings # Search settings
ENABLE_EMBEDDING_SEARCH = boolean_env("ENABLE_EMBEDDING_SEARCH", True) ENABLE_EMBEDDING_SEARCH = boolean_env("ENABLE_EMBEDDING_SEARCH", True)
ENABLE_BM25_SEARCH = boolean_env("ENABLE_BM25_SEARCH", True) ENABLE_BM25_SEARCH = boolean_env("ENABLE_BM25_SEARCH", True)
MAX_PREVIEW_LENGTH = int(os.getenv("MAX_PREVIEW_LENGTH", DEFAULT_CHUNK_TOKENS * 8))
MAX_NON_PREVIEW_LENGTH = int(os.getenv("MAX_NON_PREVIEW_LENGTH", 2000))
# API settings # API settings
SERVER_URL = os.getenv("SERVER_URL", "http://localhost:8000") SERVER_URL = os.getenv("SERVER_URL", "http://localhost:8000")

View File

@ -30,7 +30,7 @@ def create_book_from_ebook(ebook, tags: Iterable[str] = []) -> Book:
publisher=ebook.metadata.get("creator"), publisher=ebook.metadata.get("creator"),
language=ebook.metadata.get("language"), language=ebook.metadata.get("language"),
total_pages=ebook.n_pages, total_pages=ebook.n_pages,
file_path=ebook.file_path.as_posix(), file_path=ebook.file_path.relative_to(settings.FILE_STORAGE_DIR).as_posix(),
book_metadata=ebook.metadata, book_metadata=ebook.metadata,
tags=tags, tags=tags,
) )