Mirror of https://github.com/mruwnik/memory.git (synced 2025-06-28 15:14:45 +02:00)

Commit a003ada9b7: initial structure

.gitignore (vendored, new file, 5 lines)
@@ -0,0 +1,5 @@
.env
.DS_Store
secrets/
acme.json
__pycache__/
db/schema.sql (new file, 229 lines)
@@ -0,0 +1,229 @@
/*========================================================================
  Knowledge-Base schema – first-run script
  ---------------------------------------------------------------
  • PostgreSQL 15+
  • Creates every table, index, trigger and helper in one pass
  • No ALTER statements or later migrations required
  • Enable pgcrypto for UUID helpers (safe to re-run)
========================================================================*/

-------------------------------------------------------------------------------
-- 0. EXTENSIONS
-------------------------------------------------------------------------------
CREATE EXTENSION IF NOT EXISTS pgcrypto;      -- gen_random_uuid(), crypt()

-------------------------------------------------------------------------------
-- 1. CANONICAL ARTEFACT TABLE (everything points here)
-------------------------------------------------------------------------------
CREATE TABLE source_item (
    id            BIGSERIAL PRIMARY KEY,
    modality      TEXT        NOT NULL,              -- 'mail'|'chat'|...
    sha256        BYTEA       UNIQUE NOT NULL,       -- 32-byte blob
    inserted_at   TIMESTAMPTZ DEFAULT NOW(),
    tags          TEXT[]      NOT NULL DEFAULT '{}', -- flexible labels
    lang          TEXT,                              -- ISO-639-1 or NULL
    model_hash    TEXT,                              -- embedding model ver.
    vector_ids    TEXT[]      NOT NULL DEFAULT '{}', -- 0-N Qdrant IDs
    embed_status  TEXT        NOT NULL DEFAULT 'RAW'
                  CHECK (embed_status IN ('RAW','QUEUED','STORED','FAILED')),
    byte_length   INTEGER,                           -- original size
    mime_type     TEXT
);

CREATE INDEX source_modality_idx ON source_item (modality);
CREATE INDEX source_status_idx   ON source_item (embed_status);
CREATE INDEX source_tags_idx     ON source_item USING GIN (tags);
-- 1.a Trigger – vector_ids must be present when status = STORED
CREATE OR REPLACE FUNCTION trg_vector_ids_not_empty()
RETURNS TRIGGER LANGUAGE plpgsql AS $$
BEGIN
    IF NEW.embed_status = 'STORED'
       AND (NEW.vector_ids IS NULL OR array_length(NEW.vector_ids,1) = 0) THEN
        RAISE EXCEPTION
            USING MESSAGE = 'vector_ids must not be empty when embed_status = STORED';
    END IF;
    RETURN NEW;
END;
$$;

-- Fires on INSERT as well as UPDATE, so a row cannot be created
-- already marked STORED without its vectors.
CREATE TRIGGER check_vector_ids
    BEFORE INSERT OR UPDATE ON source_item
    FOR EACH ROW EXECUTE FUNCTION trg_vector_ids_not_empty();
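
-- Illustration (hypothetical row, nothing is executed at init time):
--   UPDATE source_item SET embed_status = 'STORED' WHERE id = 1;
--   -- ERROR: vector_ids must not be empty when embed_status = STORED
-- whereas setting the vectors in the same statement passes the check:
--   UPDATE source_item
--      SET embed_status = 'STORED', vector_ids = ARRAY['qdrant-uuid-1']
--    WHERE id = 1;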

-------------------------------------------------------------------------------
-- 2. MAIL MESSAGES
-------------------------------------------------------------------------------
CREATE TABLE mail_message (
    id          BIGSERIAL PRIMARY KEY,
    source_id   BIGINT NOT NULL REFERENCES source_item ON DELETE CASCADE,
    message_id  TEXT UNIQUE,
    subject     TEXT,
    sender      TEXT,
    recipients  TEXT[],
    sent_at     TIMESTAMPTZ,
    body_raw    TEXT,
    attachments JSONB
);

CREATE INDEX mail_sent_idx       ON mail_message (sent_at);
CREATE INDEX mail_recipients_idx ON mail_message USING GIN (recipients);

ALTER TABLE mail_message
    ADD COLUMN tsv tsvector
    GENERATED ALWAYS AS (
        to_tsvector('english',
                    coalesce(subject,'') || ' ' || coalesce(body_raw,'')))
    STORED;
CREATE INDEX mail_tsv_idx ON mail_message USING GIN (tsv);
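
-- Query sketch against the generated column (illustrative search term):
--   SELECT id, subject
--     FROM mail_message
--    WHERE tsv @@ plainto_tsquery('english', 'quarterly invoice');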

-------------------------------------------------------------------------------
-- 3. CHAT (Slack / Discord)
-------------------------------------------------------------------------------
CREATE TABLE chat_message (
    id          BIGSERIAL PRIMARY KEY,
    source_id   BIGINT NOT NULL REFERENCES source_item ON DELETE CASCADE,
    platform    TEXT CHECK (platform IN ('slack','discord')),
    channel_id  TEXT,
    author      TEXT,
    sent_at     TIMESTAMPTZ,
    body_raw    TEXT
);
CREATE INDEX chat_channel_idx ON chat_message (platform, channel_id);

-------------------------------------------------------------------------------
-- 4. GIT COMMITS (local repos)
-------------------------------------------------------------------------------
CREATE TABLE git_commit (
    id            BIGSERIAL PRIMARY KEY,
    source_id     BIGINT NOT NULL REFERENCES source_item ON DELETE CASCADE,
    repo_path     TEXT,
    commit_sha    TEXT UNIQUE,
    author_name   TEXT,
    author_email  TEXT,
    author_date   TIMESTAMPTZ,
    msg_raw       TEXT,
    diff_summary  TEXT,
    files_changed TEXT[]
);
CREATE INDEX git_files_idx ON git_commit USING GIN (files_changed);
CREATE INDEX git_date_idx  ON git_commit (author_date);

-------------------------------------------------------------------------------
-- 5. PHOTOS
-------------------------------------------------------------------------------
CREATE TABLE photo (
    id             BIGSERIAL PRIMARY KEY,
    source_id      BIGINT NOT NULL REFERENCES source_item ON DELETE CASCADE,
    file_path      TEXT,
    exif_taken_at  TIMESTAMPTZ,
    exif_lat       NUMERIC(9,6),
    exif_lon       NUMERIC(9,6),
    camera_make    TEXT,
    camera_model   TEXT
);
CREATE INDEX photo_taken_idx ON photo (exif_taken_at);

-------------------------------------------------------------------------------
-- 6. BOOKS, BLOG POSTS, MISC DOCS
-------------------------------------------------------------------------------
CREATE TABLE book_doc (
    id         BIGSERIAL PRIMARY KEY,
    source_id  BIGINT NOT NULL REFERENCES source_item ON DELETE CASCADE,
    title      TEXT,
    author     TEXT,
    chapter    TEXT,
    published  DATE
);

CREATE TABLE blog_post (
    id         BIGSERIAL PRIMARY KEY,
    source_id  BIGINT NOT NULL REFERENCES source_item ON DELETE CASCADE,
    url        TEXT UNIQUE,
    title      TEXT,
    published  TIMESTAMPTZ
);

CREATE TABLE misc_doc (
    id         BIGSERIAL PRIMARY KEY,
    source_id  BIGINT NOT NULL REFERENCES source_item ON DELETE CASCADE,
    path       TEXT,
    mime_type  TEXT
);

-------------------------------------------------------------------------------
-- 6.5 RSS FEEDS
-------------------------------------------------------------------------------
CREATE TABLE rss_feeds (
    id               BIGSERIAL PRIMARY KEY,
    url              TEXT UNIQUE NOT NULL,
    title            TEXT,
    description      TEXT,
    tags             TEXT[] NOT NULL DEFAULT '{}',
    last_checked_at  TIMESTAMPTZ,
    active           BOOLEAN NOT NULL DEFAULT TRUE,
    created_at       TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at       TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

CREATE INDEX rss_feeds_active_idx ON rss_feeds (active, last_checked_at);
CREATE INDEX rss_feeds_tags_idx   ON rss_feeds USING GIN (tags);

-------------------------------------------------------------------------------
-- 7. GITHUB ITEMS (issues, PRs, comments, project cards)
-------------------------------------------------------------------------------
CREATE TYPE gh_item_kind AS ENUM ('issue','pr','comment','project_card');

CREATE TABLE github_item (
    id            BIGSERIAL PRIMARY KEY,
    source_id     BIGINT NOT NULL REFERENCES source_item ON DELETE CASCADE,

    kind          gh_item_kind NOT NULL,
    repo_path     TEXT NOT NULL,            -- "owner/repo"
    number        INTEGER,                  -- issue/PR number (NULL for commit comment)
    parent_number INTEGER,                  -- comment → its issue/PR
    commit_sha    TEXT,                     -- for commit comments
    state         TEXT,                     -- 'open'|'closed'|'merged'
    title         TEXT,
    body_raw      TEXT,
    labels        TEXT[],
    author        TEXT,
    created_at    TIMESTAMPTZ,
    closed_at     TIMESTAMPTZ,
    merged_at     TIMESTAMPTZ,
    diff_summary  TEXT,                     -- PR only

    payload       JSONB                     -- extra GitHub fields
);

CREATE INDEX gh_repo_kind_idx    ON github_item (repo_path, kind);
CREATE INDEX gh_issue_lookup_idx ON github_item (repo_path, kind, number);
CREATE INDEX gh_labels_idx       ON github_item USING GIN (labels);

-- Partial full-text index; note the WHERE clause comes after the column list
CREATE INDEX gh_tsv_idx ON github_item
    USING GIN (to_tsvector('english',
               coalesce(title,'') || ' ' || coalesce(body_raw,'')))
    WHERE kind IN ('issue','pr');

-------------------------------------------------------------------------------
-- 8. HELPER FUNCTION – add tags
-------------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION add_tags(p_source BIGINT, p_tags TEXT[])
RETURNS VOID LANGUAGE SQL AS $$
    UPDATE source_item
       SET tags =
           (SELECT ARRAY(SELECT DISTINCT unnest(tags || p_tags)))
     WHERE id = p_source;
$$;
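
-- Usage sketch (hypothetical id and tags): merges new labels in, deduplicated:
--   SELECT add_tags(42, ARRAY['work','urgent']);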

-------------------------------------------------------------------------------
-- 9. (optional) PARTITION STUBS – create per-year partitions later
-------------------------------------------------------------------------------
/*
-- example (mail_message would first have to be declared
-- PARTITION BY RANGE (sent_at) for this to apply):
CREATE TABLE mail_message_2026 PARTITION OF mail_message
    FOR VALUES FROM ('2026-01-01') TO ('2027-01-01');
*/

-- =========================================================================
-- Schema creation complete
-- =========================================================================

design/plan.md (new file, 401 lines)
@@ -0,0 +1,401 @@
# Personal Multimodal Knowledge-Base — **Design Document v1.1**

*(self-hosted, privacy-first; incorporates external feedback except the “LLaVA-speed” concern, which is intentionally ignored)*

---

## 1 Purpose

Provide a **single-user** system that answers natural-language questions about the owner’s entire digital corpus—e-mails, chats, code history, photos, books, blog posts, RSS items and ad-hoc documents—while keeping all data fully under personal control.

---

## 2 Target Workload & Service-Levels

| Metric | Year-1 | 5-year |
|--------|--------|--------|
| Text artefacts | ≈ 5 M | ≈ 25 M |
| Photos | ≈ 200 k (≈ 200 GB) | ≈ 600 k (≈ 600 GB) |
| Concurrency | 1 interactive seat + background jobs | |
| **p95 answer latency** | ≤ 2 s (GPT-4o) | |
| Uptime goal | “Home-lab” best-effort, but automatic recovery from single-component failures | |

---

## 3 Hardware Specification ‼ BREAKING

| Component | Spec | Notes |
|-----------|------|-------|
| CPU | 8-core / 16-thread (NUC 13 Pro i7 or similar) | |
| **RAM** | **32 GB ECC** | |
| **GPU** | **Low-profile RTX A2000 (6 GB)** | accelerates CLIP & local LLaVA |
| Storage | 2 TB NVMe (data) + 2 TB SATA SSD (offline backup/ZFS snapshot target) | |
| Power | ≈ 10 W idle, 55 W peak | |

---

## 4 Software Stack

| Layer | Tech |
|-------|------|
| OS | Ubuntu 22.04 LTS (automatic security updates) |
| Container runtime | Docker 24 + docker-compose v2 |
| **Message broker** | RabbitMQ 3.13 (priority queues, DLQ) |
| Database | PostgreSQL 15 |
| Vector DB | Qdrant 1.9 |
| Task engine | Celery 5 (broker = RabbitMQ, result-backend = Postgres) |
| Web/API | FastAPI + Uvicorn |
| Back-end LLMs | GPT-4o (API) **and** optional on-device LLaVA-1.6-Q4 (GPU) |
| Embeddings | OpenAI *text-embedding-3-small* (1536 d) • OpenCLIP ViT-B/32 (512 d) |

---

## 5 Data Sources & Ingestion Queues

| Source | Trigger | Parser | Default **tags[]** |
|--------|---------|--------|--------------------|
| **E-mail IMAP** | UID poll 10 min | `imap_tools` | `work` if address ends `@corp.com` |
| **Slack** | Socket-mode WS | `slack_sdk` | `work` on `#proj-*` |
| **Discord** | Gateway WS | `discord.py` | `personal` |
| **Git** | `post-receive` hook / hourly fetch | `GitPython` + LLM diff summary | `work` if remote host in allow-list |
| **Photos** | `watchdog` folder | `Pillow`, EXIF; CLIP embed; FaceNet | `personal` unless GPS in office polygon |
| **Books (EPUB/PDF)** | Nightly folder scan | `ebooklib`, `pdfminer`, OCR | `reference` |
| **Blog / RSS** | `feedparser` 30 min | `trafilatura` | `reference` |
| **Misc docs / transcripts** | `watchdog` inbox | PDF→OCR, DOCX→txt, VTT→txt | deduced from path |

---

## 6 Data Model

### 6.1 PostgreSQL (tables share columns)

```sql
id           bigserial primary key,
sha256       bytea unique,
inserted_at  timestamptz default now(),
tags         text[] not null default '{}',  -- flexible labelling
lang         text,                          -- detected language
body_raw     text,                          -- TOAST/LZ4
vector_ids   text[],                        -- 0-N vectors in Qdrant
model_hash   text                           -- hash of embedding model
```

*GIN index on `tags`; range or JSONB indexes where relevant.*

### 6.2 Qdrant (collections)

| Collection | Model | Dim |
|------------|-------|-----|
| `mail`, `chat`, `git`, `book`, `blog`, `doc` | *text-embedding-3-small* | 1536 |
| `photo` | OpenCLIP ViT-B/32 | 512 |

Payload fields: `tags`, per-domain metadata (EXIF, author, files_changed[] …).
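
A minimal bootstrap sketch for these collections with `qdrant-client` (not yet in the repo; the URL matches the compose service, sizes and names follow the table above):

```python
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams

client = QdrantClient(url="http://qdrant:6333")

# 1536-d text collections and a 512-d photo collection, all cosine distance
for name in ("mail", "chat", "git", "book", "blog", "doc"):
    client.create_collection(
        collection_name=name,
        vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
    )
client.create_collection(
    collection_name="photo",
    vectors_config=VectorParams(size=512, distance=Distance.COSINE),
)
```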

---

## 7 Task Queues & Concurrency

| Celery queue | Priority | Concurrency | Typical load |
|--------------|----------|-------------|--------------|
| `interactive` | 9 | auto (1 per core) | query embedding + GPT-4o calls |
| `medium_embed` | 5 | 4 | mail/chat embeddings |
| `low_ocr` | 2 | **≤ physical cores – 2** | PDF/image OCR |
| `photo_embed_gpu` | 5 | GPU | CLIP image vectors |
| `git_summary` | 4 | 2 | LLM diff summaries |

All queues have a DLQ routed to the `failed_tasks` exchange (RabbitMQ).

---

## 8 Vector Consistency & Repair

* **Up-front write:** worker inserts into Postgres, then Qdrant; the returned `vector_id` is stored in `vector_ids[]`.
* **Audit cron (5 min):**
  * Find rows where `vector_ids = '{}'` or with `model_hash ≠ CURRENT_HASH`.
  * Re-enqueue to the appropriate embed queue (see the sketch after this list).
* **Qdrant-centric diff (hourly):** dump collection IDs → compare against Postgres; orphans are deleted, missing vectors are re-enqueued.
* **Disaster Re-build:** documented script streams `id,chunk_text` to the embed queue (rate-limited).
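
A sketch of one audit pass under these rules. Table and column names come from `db/schema.sql`; the task name, env vars and queue mapping are assumptions:

```python
import os
from celery import Celery
from sqlalchemy import create_engine, text

CURRENT_HASH = os.environ["MODEL_HASH"]
engine = create_engine(os.environ["POSTGRES_DSN"])
queue_app = Celery(broker=os.environ["CELERY_BROKER_URL"])

def audit_pass() -> None:
    """Re-enqueue items whose vectors are missing or built with a stale model."""
    with engine.begin() as conn:
        rows = conn.execute(text(
            "SELECT id, modality FROM source_item "
            "WHERE vector_ids = '{}' OR model_hash IS DISTINCT FROM :h"
        ), {"h": CURRENT_HASH})
        for source_id, modality in rows:
            # task name 'memory.workers.tasks.embed' is hypothetical
            queue_app.send_task(
                "memory.workers.tasks.embed",
                args=[source_id],
                queue="photo_embed_gpu" if modality == "photo" else "medium_embed",
            )
```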

---

## 9 Embedding-Model Versioning

* Compute `MODEL_HASH = sha256(model_name + version + weights_SHA)` at worker start (transcribed below).
* Model change → hashes differ → audit cron flags rows → background re-embed queue.
* Router refuses to mix hashes unless `ALLOW_MIXED_MODE=1`.
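
A direct transcription of that formula; the `weights_sha` argument is whatever digest the provider exposes (for API models a version string stands in):

```python
import hashlib

def model_hash(model_name: str, version: str, weights_sha: str) -> str:
    """MODEL_HASH = sha256(model_name + version + weights_SHA), hex-encoded."""
    return hashlib.sha256(f"{model_name}{version}{weights_sha}".encode()).hexdigest()

# e.g. model_hash("text-embedding-3-small", "2024-01", "api")  (illustrative values)
```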

---

## 10 Security Hardening

1. **JWT auth** on all API routes (HS256 secret in Docker secret store).
2. **Rate limiter** (`slowapi`): 60 req / min / IP (see the sketch after this list).
3. **Filesystem isolation**
   * Containers run as UID 1000, read-only bind mounts for `/photos`, `/books`.
4. **TLS everywhere** (Traefik + Let’s Encrypt on LAN or Tailscale certs).
5. **Input sanitisation**: Markdown-escape bodies; regex filter for SSNs/credit-card patterns before LLM prompt.
6. **Resource quotas** in compose (`mem_limit`, `pids_limit`).
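
How the rate limit could be wired into the FastAPI app, following slowapi’s quickstart pattern (the route shown is illustrative, not the real API surface):

```python
from fastapi import FastAPI, Request
from slowapi import Limiter, _rate_limit_exceeded_handler
from slowapi.errors import RateLimitExceeded
from slowapi.util import get_remote_address

limiter = Limiter(key_func=get_remote_address, default_limits=["60/minute"])
app = FastAPI()
app.state.limiter = limiter
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)

@app.get("/health")
@limiter.limit("60/minute")
def health(request: Request):  # slowapi requires the Request argument
    return {"status": "healthy"}
```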

---

## 11 Backup & Restore

| Layer | Tool | Frequency | Storage cost (Glacier DA) |
|-------|------|-----------|---------------------------|
| Postgres basebackup + WAL | `pgBackRest` | nightly | included in dataset |
| Qdrant | `qdrant-backup` tar of collection dir | nightly | vectors + graph ≈ 20 GB year-5 |
| Files / attachments | Restic dedup | nightly | 400 GB → ~€1.2 / mo |

Retention follows **grandfather-father-son** pruning (`7-4-6`).

Restore script: ① create fresh volumes, ② `pgbackrest restore`, ③ `qdrant-restore`, ④ run audit cron to verify.

---

## 12 Monitoring & Alerting

* **Prometheus exporters**
  * node-exporter, postgres-exporter, rabbitmq-exporter, qdrant-exporter, cadvisor.
* **Grafana dashboards**: CPU, RAM, queue depth, DLQ count, GPT-4o latency.
* **Alertmanager rules**
  * `vector_audit_missing > 500` → warn
  * `node_filesystem_free_percent < 15` → critical
  * `rabbitmq_queue_messages{queue="failed_tasks"} > 0` → critical
  * `pg_up == 0` → critical

---

## 13 Query Flow

1. **Embed** user text with *text-embedding-3* and CLIP-text (one call each).
2. **Determine scope** from conversation memory (`tags = 'work'` etc.).
3. **Async search** each relevant collection (max 3 per batch) with payload filter.
4. **Merge** top-k by score (steps 3–4 are sketched below).
5. Build **evidence JSON** (snippets, thumbnails, commit summaries).
6. **LLM**
   * default: GPT-4o (vision) via API
   * offline mode: local LLaVA-1.6 Q4 on GPU
7. Stream answer + thumbnails.

*Expected p95 latency on spec hardware: **~2 s** (cloud) | **~1.4 s** (local LLaVA).*
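
A sketch of the filtered search and merge with `qdrant-client`; the tag filter shape follows Qdrant’s payload-filter API, and the collection names come from §6.2:

```python
from qdrant_client import QdrantClient
from qdrant_client.models import FieldCondition, Filter, MatchValue

client = QdrantClient(url="http://qdrant:6333")

def search_scoped(query_vector: list[float], collections: list[str],
                  tag: str, k: int = 8) -> list:
    """Steps 3-4: payload-filtered search per collection, then a global top-k merge."""
    scope = Filter(must=[FieldCondition(key="tags", match=MatchValue(value=tag))])
    hits = []
    for name in collections:
        hits += client.search(collection_name=name, query_vector=query_vector,
                              query_filter=scope, limit=k)
    return sorted(hits, key=lambda h: h.score, reverse=True)[:k]
```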

---

## 14 Internationalisation

* Tesseract language packs specified via `OCR_LANGS=eng+pol+deu`.
* `langdetect` sets the `lang` column; the router boosts same-language chunks.

---

## 15 Road-map

| Phase | Milestones |
|-------|------------|
| **0** | Hardware build, RabbitMQ, base compose up |
| **1** | Mail, chat, photo, git ingestion & audit loop |
| **2** | Backup scripts, security hardening, monitoring |
| **3** | Books/RSS/misc docs + international OCR |
| **4** | Tag-based multi-user RLS (optional) |

---

*End of Design Document v1.1*

---

# Personal Multimodal Knowledge-Base — Design Document v1.0

*(self-hosted, privacy-first)*

---

## 1 Purpose

Build a single private system that lets **one user** ask natural-language questions (text or image) and instantly search:

* E-mail (IMAP or local mbox)
* Slack & Discord messages
* Git commit history
* Photos (≈ 200 GB)
* Books (EPUB/PDF), important blog posts / RSS items
* Misc. documents & meeting transcripts

…and receive answers that combine the most relevant passages, pictures and commits.

---

## 2 High-level Architecture

```
                 ┌──────────────────────────┐
  Internet →     │   Ingestion Workers      │
                 │    (Celery queues)       │
                 │ mail / chat / git / ...  │
                 └─────┬──────────┬─────────┘
                       │TEXT      │IMAGE/ATT
                       ▼          ▼
                 ┌──────────┐  ┌──────────┐
                 │Embedding │  │OCR /     │
                 │Workers   │  │Vision    │
                 └────┬─────┘  └────┬─────┘
                      │ vectors     │ captions/tags
                      ▼             ▼
   Postgres 15 (canonical)  ◄───►  Qdrant 1.9 (vectors)
   • raw bodies / metadata         • per-modality collections
   • tags[] array, GIN index       • payload filter index
   • LISTEN/NOTIFY queue
                 ▲                        ▲
                 │                        │
                 └────────────┬───────────┘
                              │
                      FastAPI “/chat”
                      (router + merge)
                      + LangChain agent
                      + GPT-4o or local LLaVA
```

*Everything runs in Docker-Compose on a low-power x86 mini-PC (NUC 11/12; 16 GB RAM, 1 TB NVMe).*

---

## 3 Data Sources & Ingestion

| Source | Trigger | Parser / Notes | Stored **tags** (default rules) |
|--------|---------|----------------|---------------------------------|
| **E-mail** (IMAP, mbox) | UID poll 10 min | `imap_tools`, strip quotes | `work` if address ends “@corp.com” |
| **Slack** | Socket-mode WS | `slack_sdk`, flatten blocks | `work` if channel `#proj-*` |
| **Discord** | Gateway WS | `discord.py`, role IDs | `personal` otherwise |
| **Git commits** | `post-receive` hook or hourly fetch | `GitPython` → diff; 3-sentence summary via LLM | `work` if remote in `github.com/corp` |
| **Photos** | `watchdog` on folder | `Pillow`, EXIF; CLIP embed; FaceNet & optional YOLO tagger | `personal` unless GPS inside office |
| **Books** (EPUB/PDF) | Nightly scan of `/books` | `ebooklib` / `pdfminer` (+OCR) | `reference` |
| **Blog / RSS** | `feedparser` every 30 min | `trafilatura` HTML clean | `reference` |
| **Misc. docs / transcripts** | `watchdog` on `/kb-inbox` | PDF→OCR, DOCX→txt, VTT/SRT stitch | inferred from path (`/work/` etc.) |
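
For flavour, one pass of the e-mail row could look like this with `imap_tools` (a sketch; host, credentials and `last_uid` bookkeeping are placeholders):

```python
from imap_tools import AND, MailBox

last_uid = 0  # persisted between polls in practice
with MailBox("imap.example.com").login("user@example.com", "app-password") as mb:
    for msg in mb.fetch(AND(uid=f"{last_uid + 1}:*"), mark_seen=False):
        item = {
            "subject": msg.subject,
            "sender": msg.from_,
            "recipients": list(msg.to),
            "sent_at": msg.date,
            "body_raw": msg.text or msg.html or "",
        }
        # hand `item` to the text queue; tag rules (§6) run before insert
        last_uid = int(msg.uid)
```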

---

## 4 Storage Model

### 4.1 PostgreSQL (system-of-record)

* Base tables: `mail_msg`, `chat_msg`, `git_commit`, `photo`, `book_doc`, `blog_post`, `misc_doc`, `attachment`.
* Common columns: `id bigserial`, `sha256 bytea`, `inserted_at timestamptz`, `tags text[] NOT NULL DEFAULT '{}'`, `vector_ids text[]`.
* All large bodies are **TOAST/LZ4** compressed; photos/attachments > 5 MB stay on disk with a path pointer.
* GIN indexes on `tags` for millisecond filtering.
* LISTEN/NOTIFY can drive Celery directly, removing the need for the Redis broker Celery uses by default.

### 4.2 Qdrant (similarity index)

| Collection | Model | Dim | Distance | Extra payload |
|------------|-------|-----|----------|---------------|
| `mail` | `text-embedding-3-small` | 1536 | Cosine | `tags`, `folder`, `from` |
| `chat` | same | 1536 | Cosine | `channel_id`, `platform` |
| `git` | same | 1536 | Cosine | `files_changed[]`, `author`, `tags` |
| `photo` | OpenCLIP ViT-B/32 | 512 | Cosine | `exif_date`, `face_id`, `tags` |
| `book`, `blog`, `doc` | same | 1536 | Cosine | `title`, `source_url`, `tags` |

---

## 5 Workers & Queues

| Queue | Concurrency | Task | Key libs |
|-------|-------------|------|----------|
| `text` | 4 CPU | Chunk + embed text | OpenAI Python SDK |
| `image` | 2 CPU / GPU | Embed photo (CLIP) | `open_clip_torch` |
| `ocr` | 8 CPU | OCR PDF/image | `ocrmypdf`, `tesseract-ocr` |
| `git` | 2 CPU | Diff-summary → embed | GPT-4o mini or Phi-3-mini |
| `rss` | 1 CPU | Fetch feed, parse article | `feedparser`, `trafilatura` |
| `docs` | 2 CPU | Misc file parsing | `pdfminer`, `python-docx` |

Every queue auto-retries 3× with exponential back-off.

---

## 6 Tagging Framework

* YAML rule file; fields `sender_regex`, `path_regex`, `channel_regex`, `gps_polygon`, `add_tags[]`.
* Workers call `apply_tags()` before inserting into Postgres/Qdrant (see the sketch after this list).
* CLI utility `retag add/remove <tag> (--where …)` for bulk fixes.
* Tags are free-form strings; new tags require **no schema or index change** — Qdrant builds a bitmap on first use.
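
One way `apply_tags()` could look. The rule fields come from the list above; the YAML path and record shape are assumptions:

```python
import re
import yaml

with open("tag_rules.yaml") as f:  # hypothetical rule-file location
    RULES = yaml.safe_load(f)

def apply_tags(record: dict) -> dict:
    """Append matching rule tags to a record before it is written out."""
    tags = set(record.get("tags", []))
    for rule in RULES:
        if "sender_regex" in rule and re.search(rule["sender_regex"], record.get("sender", "")):
            tags.update(rule["add_tags"])
        elif "channel_regex" in rule and re.search(rule["channel_regex"], record.get("channel", "")):
            tags.update(rule["add_tags"])
    record["tags"] = sorted(tags)
    return record
```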

---

## 7 Query & Chat Flow

1. **Router** embeds user text with
   * CLIP-text → hits `photo`
   * text-embed-3 → hits all text collections.
2. Applies a user- or conversation-scoped filter, e.g. `{"tags":{"value":"work"}}`.
3. Parallel search (async) → merge top-k by score.
4. Build “evidence bundle” (snippets, thumbs, commit msgs).
5. Feed bundle + question to LLM:
   * cloud GPT-4o (vision) **or**
   * local LLaVA-1.6 + captions.
6. Stream answer & thumbnails back.

Expected latency (NUC, GPT-4o): **≈ 1.3 s p95**.

---

## 8 Back-ups & DR

| Layer | Method | Retention | Cost |
|-------|--------|-----------|------|
| NVMe dataset | Restic dedup ⇒ **S3 Glacier Deep Archive** | 7 daily / 4 weekly / 6 monthly | First snapshot 250 GB → €0.9 / mo; delta ≈ €0.02 / mo |
| Local roll-back | ZFS hourly snapshots (compressed) | 7 days | disk-only |
| Restore test | Quarterly scripted restore to `/tmp/restore-test` | — | — |

---

## 9 Security

* Full-disk LUKS; if on AWS use encrypted EBS + **customer-managed KMS key**.
* Instance in private subnet; access via Tailscale SSH or AWS SSM.
* Docker containers run as non-root; seccomp default profile.
* TLS termination in Traefik with auto-renewing Let’s Encrypt cert on LAN.

---

## 10 Hardware & Performance

| Component | Spec | Head-room |
|-----------|------|-----------|
| Mini-PC | 4-core i5 (11th gen) / 16 GB RAM | p95 memory < 9 GB |
| Storage | 1 TB NVMe + ext. 1 TB SATA for backups | 5-year growth ≤ 400 GB |
| Power | 6 W idle → €1.5 / mo | — |
| GPU (optional) | Used RTX 2060 / T600 | Speeds 1st photo embed to < 1 h |

---

## 11 LLM & Model Abstraction

```python
from abc import ABC, abstractmethod

class EmbedProvider(ABC):
    @abstractmethod
    def embed(self, text: str) -> list[float]: ...

provider = OpenAIProvider(model="text-embedding-3-small")
# swap later:
# provider = OllamaProvider(model="nomic-embed-text")

# injection via environment:
# EMBED_BACKEND="openai"   # or "ollama"
```

Same interface for the diff-summariser and chat-LLM; switching is one `docker-compose.yml` env var.

---

## 12 Monitoring & Ops

* **Prometheus + Grafana**: node load, Postgres WAL lag, queue depth.
* **Watchtower** auto-updates images weekly (except Postgres & Qdrant).
* Alertmanager e-mails if free disk < 15 % or any Celery worker dies.

---

## 13 Roadmap / Open Items

| Phase | Deliverable |
|-------|-------------|
| **0** (done) | Design document v1.0 |
| **1** | Dockerfiles & compose stack; mail + chat + photo ingestion |
| **2** | Git summariser + OCR worker; tag rules config |
| **3** | Books, RSS, misc docs workers |
| **4** | Live chat UI & LLaVA offline option |
| **5** | Multi-user RLS & optional code-search add-on |

---

*Document ends — save for future implementation.*

docker-compose.yaml (new file, 239 lines)
@@ -0,0 +1,239 @@
version: "3.9"

# --------------------------------------------------------------------- networks
networks:
  kbnet:                      # internal overlay – NOT exposed
    driver: bridge

# --------------------------------------------------------------------- secrets
secrets:
  postgres_password: {file: ./secrets/postgres_password.txt}
  jwt_secret:        {file: ./secrets/jwt_secret.txt}
  openai_key:        {file: ./secrets/openai_key.txt}

# --------------------------------------------------------------------- volumes
volumes:
  db_data: {}                 # Postgres
  qdrant_data: {}             # Qdrant
  rabbitmq_data: {}           # RabbitMQ

# ------------------------------ X-templates ----------------------------
x-common-env: &env
  RABBITMQ_USER: kb
  TZ: "Etc/UTC"

x-worker-base: &worker-base
  build:
    context: .
    dockerfile: docker/workers/Dockerfile
  restart: unless-stopped
  networks: [kbnet]
  security_opt: ["no-new-privileges:true"]
  depends_on: [postgres, rabbitmq, qdrant]
  env_file: [.env]
  environment: &worker-env
    <<: *env
    POSTGRES_PASSWORD_FILE: /run/secrets/postgres_password
    # DSNs are built in worker entrypoint from user + pw files
    QDRANT_URL: http://qdrant:6333
    OPENAI_API_KEY_FILE: /run/secrets/openai_key
  secrets: [postgres_password, openai_key]
  read_only: true
  tmpfs: [/tmp, /var/tmp]
  cap_drop: [ALL]
  logging:
    options: {max-size: "10m", max-file: "3"}
# ================================ SERVICES ============================

services:
  # ----------------------------------------------------------------- data layer
  postgres:
    image: postgres:15
    restart: unless-stopped
    networks: [kbnet]
    environment:
      <<: *env
      POSTGRES_USER: kb
      POSTGRES_PASSWORD_FILE: /run/secrets/postgres_password
      POSTGRES_DB: kb
    secrets: [postgres_password]
    volumes:
      - db_data:/var/lib/postgresql/data:rw
      - ./db:/docker-entrypoint-initdb.d:ro
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U kb"]
      interval: 10s
      timeout: 5s
      retries: 5
    mem_limit: 4g
    cpus: "1.5"
    security_opt: ["no-new-privileges:true"]

  rabbitmq:
    image: rabbitmq:3.13-management
    restart: unless-stopped
    networks: [kbnet]
    environment:
      <<: *env
      RABBITMQ_DEFAULT_USER: "kb"
      RABBITMQ_DEFAULT_PASS: "${RABBITMQ_PASSWORD}"
    volumes:
      - rabbitmq_data:/var/lib/rabbitmq:rw
    healthcheck:
      test: ["CMD", "rabbitmq-diagnostics", "ping"]
      interval: 15s
      timeout: 5s
      retries: 5
    mem_limit: 512m
    cpus: "0.5"
    security_opt: ["no-new-privileges:true"]
    ports:                    # UI only on localhost
      - "127.0.0.1:15672:15672"

  qdrant:
    image: qdrant/qdrant:v1.14.0
    restart: unless-stopped
    networks: [kbnet]
    volumes:
      - qdrant_data:/qdrant/storage:rw
    tmpfs:
      - /tmp
      - /var/tmp
      - /qdrant/snapshots:rw
    healthcheck:
      test: ["CMD", "wget", "-q", "-T", "2", "-O", "-", "localhost:6333/ready"]
      interval: 15s
      timeout: 5s
      retries: 5
    mem_limit: 4g
    cpus: "2"
    security_opt: ["no-new-privileges:true"]
    cap_drop: [ALL]

  # ------------------------------------------------------------ API / gateway
  # api:
  #   build:
  #     context: .
  #     dockerfile: docker/api/Dockerfile
  #   restart: unless-stopped
  #   networks: [kbnet]
  #   depends_on: [postgres, rabbitmq, qdrant]
  #   environment:
  #     <<: *env
  #     JWT_SECRET_FILE: /run/secrets/jwt_secret
  #     OPENAI_API_KEY_FILE: /run/secrets/openai_key
  #     POSTGRES_PASSWORD_FILE: /run/secrets/postgres_password
  #     QDRANT_URL: http://qdrant:6333
  #   secrets: [jwt_secret, openai_key, postgres_password]
  #   healthcheck:
  #     test: ["CMD-SHELL", "curl -fs http://localhost:8000/health || exit 1"]
  #     interval: 15s
  #     timeout: 5s
  #     retries: 5
  #   mem_limit: 768m
  #   cpus: "1"
  #   labels:
  #     - "traefik.enable=true"
  #     - "traefik.http.routers.kb.rule=Host(`${TRAEFIK_DOMAIN}`)"
  #     - "traefik.http.routers.kb.entrypoints=websecure"
  #     - "traefik.http.services.kb.loadbalancer.server.port=8000"

  traefik:
    image: traefik:v3.0
    restart: unless-stopped
    networks: [kbnet]
    command:
      - "--providers.docker=true"
      - "--providers.docker.network=kbnet"
      - "--entrypoints.web.address=:80"
      - "--entrypoints.websecure.address=:443"
      # - "--certificatesresolvers.le.acme.httpchallenge=true"
      # - "--certificatesresolvers.le.acme.httpchallenge.entrypoint=web"
      # - "--certificatesresolvers.le.acme.email=${LE_EMAIL}"
      # - "--certificatesresolvers.le.acme.storage=/acme.json"
      - "--log.level=INFO"
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
      # - ./acme.json:/acme.json:rw

  # ------------------------------------------------------------ Celery workers
  worker-text:
    <<: *worker-base
    environment:
      <<: *worker-env
      QUEUES: "medium_embed"
    deploy: {resources: {limits: {cpus: "2", memory: 3g}}}

  worker-photo:
    <<: *worker-base
    environment:
      <<: *worker-env
      QUEUES: "photo_embed"
    deploy: {resources: {limits: {cpus: "4", memory: 4g}}}

  worker-ocr:
    <<: *worker-base
    environment:
      <<: *worker-env
      QUEUES: "low_ocr"
    deploy: {resources: {limits: {cpus: "4", memory: 4g}}}

  worker-git:
    <<: *worker-base
    environment:
      <<: *worker-env
      QUEUES: "git_summary"
    deploy: {resources: {limits: {cpus: "1", memory: 1g}}}

  worker-rss:
    <<: *worker-base
    environment:
      <<: *worker-env
      QUEUES: "rss"
    deploy: {resources: {limits: {cpus: "0.5", memory: 512m}}}

  worker-docs:
    <<: *worker-base
    environment:
      <<: *worker-env
      QUEUES: "docs"
    deploy: {resources: {limits: {cpus: "1", memory: 1g}}}

  # ------------------------------------------------------------ watchtower (auto-update)
  watchtower:
    image: containrrr/watchtower
    restart: unless-stopped
    command: ["--schedule", "0 0 4 * * *", "--cleanup"]
    volumes: ["/var/run/docker.sock:/var/run/docker.sock:ro"]
    networks: [kbnet]

# ------------------------------------------------------------------- profiles: observability (opt-in)
# services:
#   prometheus:
#     image: prom/prometheus:v2.52
#     profiles: ["obs"]
#     networks: [kbnet]
#     volumes: [./observability/prometheus.yml:/etc/prometheus/prometheus.yml:ro]
#     restart: unless-stopped
#     ports: ["127.0.0.1:9090:9090"]
#
#   grafana:
#     image: grafana/grafana:10
#     profiles: ["obs"]
#     networks: [kbnet]
#     volumes: [./observability/grafana:/var/lib/grafana]
#     restart: unless-stopped
#     environment:
#       GF_SECURITY_ADMIN_USER: admin
#       GF_SECURITY_ADMIN_PASSWORD_FILE: /run/secrets/grafana_pw
#     secrets: [grafana_pw]
#     ports: ["127.0.0.1:3000:3000"]
#
# secrets:                    # extra secret for Grafana, not needed otherwise
#   grafana_pw:
#     file: ./secrets/grafana_pw.txt

docker/api/Dockerfile (new file, 24 lines)
@@ -0,0 +1,24 @@
FROM python:3.10-slim

WORKDIR /app

# Copy requirements files and setup
COPY requirements-*.txt ./
COPY setup.py ./
COPY src/ ./src/

# Install the package with API dependencies
RUN pip install -e ".[api]"

# Run as non-root user
RUN useradd -m appuser
USER appuser

# Set environment variables
ENV PORT=8000
ENV PYTHONPATH="/app"

EXPOSE 8000

# Run the API
CMD ["uvicorn", "memory.api.app:app", "--host", "0.0.0.0", "--port", "8000"]

docker/workers/Dockerfile (new file, 28 lines)
@@ -0,0 +1,28 @@
FROM python:3.11-slim

WORKDIR /app

# Copy requirements files and setup
COPY requirements-*.txt ./
COPY setup.py ./
COPY src/ ./src/

# Install dependencies
RUN apt-get update && apt-get install -y \
    libpq-dev gcc && \
    pip install -e ".[workers]" && \
    apt-get purge -y gcc && apt-get autoremove -y && rm -rf /var/lib/apt/lists/*

# Create and copy entrypoint script
COPY docker/workers/entry.sh ./entry.sh
RUN chmod +x entry.sh

# Create user and set permissions
RUN useradd -m kb && chown -R kb /app
USER kb

# Default queues to process
ENV QUEUES="medium_embed,photo_embed,low_ocr,git_summary,rss,docs"
ENV PYTHONPATH="/app"

ENTRYPOINT ["./entry.sh"]

docker/workers/entry.sh (new file, 14 lines)
@@ -0,0 +1,14 @@
#!/usr/bin/env bash
set -euo pipefail

QUEUES=${QUEUES:-default}
CONCURRENCY=${CONCURRENCY:-2}
LOGLEVEL=${LOGLEVEL:-INFO}

# Celery node name: queue list (minus any @suffix) @ container hostname
HOSTNAME="${QUEUES%@*}@$(hostname)"

exec celery -A memory.workers.celery_app worker \
    -Q "${QUEUES}" \
    --concurrency="${CONCURRENCY}" \
    --hostname="${HOSTNAME}" \
    --loglevel="${LOGLEVEL}"

requirements-api.txt (new file, 4 lines)
@@ -0,0 +1,4 @@
fastapi==0.112.2
uvicorn==0.29.0
python-jose==3.3.0
python-multipart==0.0.9

requirements-common.txt (new file, 3 lines)
@@ -0,0 +1,3 @@
sqlalchemy==2.0.30
psycopg2-binary==2.9.9
pydantic==2.7.1

requirements-workers.txt (new file, 4 lines)
@@ -0,0 +1,4 @@
celery==5.3.6
openai==1.25.0
pillow==10.3.0
qdrant-client==1.9.0

setup.py (new file, 31 lines)
@@ -0,0 +1,31 @@
import pathlib
from setuptools import setup, find_namespace_packages


def read_requirements(filename: str) -> list[str]:
    """Read requirements from file, ignoring comments and -r directives."""
    path = pathlib.Path(filename)
    return [
        line.strip()
        for line in path.read_text().splitlines()
        if line.strip() and not line.strip().startswith(('#', '-r'))
    ]


# Read requirements files
common_requires = read_requirements('requirements-common.txt')
api_requires = read_requirements('requirements-api.txt')
workers_requires = read_requirements('requirements-workers.txt')

setup(
    name="memory",
    version="0.1.0",
    package_dir={"": "src"},
    packages=find_namespace_packages(where="src"),
    python_requires=">=3.10",
    extras_require={
        "api": api_requires + common_requires,
        "workers": workers_requires + common_requires,
        "common": common_requires,
    },
)

src/__init__.py (new file, 5 lines)
@@ -0,0 +1,5 @@
"""
Memory knowledge-base application.
"""

__version__ = "0.1.0"

src/memory/__init__.py (new file, 1 empty line)
@@ -0,0 +1 @@

src/memory/api/__init__.py (new file, empty)

src/memory/api/app.py (new file, 50 lines)
@@ -0,0 +1,50 @@
"""
FastAPI application for the knowledge base.
"""
from fastapi import FastAPI, Depends, HTTPException
from sqlalchemy.orm import Session

from memory.common.db import get_scoped_session
from memory.common.db.models import SourceItem


app = FastAPI(title="Knowledge Base API")


def get_db():
    """Database session dependency"""
    db = get_scoped_session()
    try:
        yield db
    finally:
        db.close()


@app.get("/health")
def health_check():
    """Simple health check endpoint"""
    return {"status": "healthy"}


@app.get("/sources")
def list_sources(
    tag: str | None = None,
    limit: int = 100,
    db: Session = Depends(get_db)
):
    """List source items, optionally filtered by tag"""
    query = db.query(SourceItem)

    if tag:
        query = query.filter(SourceItem.tags.contains([tag]))

    return query.limit(limit).all()


@app.get("/sources/{source_id}")
def get_source(source_id: int, db: Session = Depends(get_db)):
    """Get a specific source by ID"""
    source = db.query(SourceItem).filter(SourceItem.id == source_id).first()
    if not source:
        raise HTTPException(status_code=404, detail="Source not found")
    return source

src/memory/common/__init__.py (new file, empty)

src/memory/common/db/__init__.py (new file, 12 lines)
@@ -0,0 +1,12 @@
"""
|
||||
Database utilities package.
|
||||
"""
|
||||
from memory.common.db.models import Base
|
||||
from memory.common.db.connection import get_engine, get_session_factory, get_scoped_session
|
||||
|
||||
__all__ = [
|
||||
"Base",
|
||||
"get_engine",
|
||||
"get_session_factory",
|
||||
"get_scoped_session",
|
||||
]

src/memory/common/db/connection.py (new file, 32 lines)
@@ -0,0 +1,32 @@
"""
Database connection utilities.
"""
import os

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker, scoped_session


def get_engine():
    """Create SQLAlchemy engine from environment variables"""
    user = os.getenv("POSTGRES_USER", "kb")
    password = os.getenv("POSTGRES_PASSWORD", "kb")
    host = os.getenv("POSTGRES_HOST", "postgres")
    port = os.getenv("POSTGRES_PORT", "5432")
    db = os.getenv("POSTGRES_DB", "kb")

    return create_engine(f"postgresql://{user}:{password}@{host}:{port}/{db}")


def get_session_factory():
    """Create a session factory for SQLAlchemy sessions"""
    return sessionmaker(bind=get_engine())


def get_scoped_session():
    """Create a thread-local scoped session factory"""
    return scoped_session(get_session_factory())

src/memory/common/db/models.py (new file, 127 lines)
@@ -0,0 +1,127 @@
"""
Database models for the knowledge base system.
"""
from sqlalchemy import (
    Column, ForeignKey, Integer, BigInteger, Text, Date, DateTime, Boolean,
    Numeric, ARRAY, func
)
from sqlalchemy.dialects.postgresql import BYTEA, JSONB, TSVECTOR
from sqlalchemy.orm import declarative_base


Base = declarative_base()


class SourceItem(Base):
    __tablename__ = 'source_item'

    id = Column(BigInteger, primary_key=True)
    modality = Column(Text, nullable=False)
    sha256 = Column(BYTEA, nullable=False, unique=True)
    inserted_at = Column(DateTime(timezone=True), server_default=func.now())
    tags = Column(ARRAY(Text), nullable=False, server_default='{}')
    lang = Column(Text)
    model_hash = Column(Text)
    vector_ids = Column(ARRAY(Text), nullable=False, server_default='{}')
    embed_status = Column(Text, nullable=False, server_default='RAW')
    byte_length = Column(Integer)
    mime_type = Column(Text)


class MailMessage(Base):
    __tablename__ = 'mail_message'

    id = Column(BigInteger, primary_key=True)
    source_id = Column(BigInteger, ForeignKey('source_item.id', ondelete='CASCADE'), nullable=False)
    message_id = Column(Text, unique=True)
    subject = Column(Text)
    sender = Column(Text)
    recipients = Column(ARRAY(Text))
    sent_at = Column(DateTime(timezone=True))
    body_raw = Column(Text)
    attachments = Column(JSONB)
    tsv = Column(TSVECTOR)


class ChatMessage(Base):
    __tablename__ = 'chat_message'

    id = Column(BigInteger, primary_key=True)
    source_id = Column(BigInteger, ForeignKey('source_item.id', ondelete='CASCADE'), nullable=False)
    platform = Column(Text)
    channel_id = Column(Text)
    author = Column(Text)
    sent_at = Column(DateTime(timezone=True))
    body_raw = Column(Text)


class GitCommit(Base):
    __tablename__ = 'git_commit'

    id = Column(BigInteger, primary_key=True)
    source_id = Column(BigInteger, ForeignKey('source_item.id', ondelete='CASCADE'), nullable=False)
    repo_path = Column(Text)
    commit_sha = Column(Text, unique=True)
    author_name = Column(Text)
    author_email = Column(Text)
    author_date = Column(DateTime(timezone=True))
    msg_raw = Column(Text)
    diff_summary = Column(Text)
    files_changed = Column(ARRAY(Text))


class Photo(Base):
    __tablename__ = 'photo'

    id = Column(BigInteger, primary_key=True)
    source_id = Column(BigInteger, ForeignKey('source_item.id', ondelete='CASCADE'), nullable=False)
    file_path = Column(Text)
    exif_taken_at = Column(DateTime(timezone=True))
    exif_lat = Column(Numeric(9, 6))   # matches NUMERIC(9,6) in db/schema.sql
    exif_lon = Column(Numeric(9, 6))
    camera_make = Column(Text)
    camera_model = Column(Text)


class BookDoc(Base):
    __tablename__ = 'book_doc'

    id = Column(BigInteger, primary_key=True)
    source_id = Column(BigInteger, ForeignKey('source_item.id', ondelete='CASCADE'), nullable=False)
    title = Column(Text)
    author = Column(Text)
    chapter = Column(Text)
    published = Column(Date)           # DATE in db/schema.sql


class BlogPost(Base):
    __tablename__ = 'blog_post'

    id = Column(BigInteger, primary_key=True)
    source_id = Column(BigInteger, ForeignKey('source_item.id', ondelete='CASCADE'), nullable=False)
    url = Column(Text, unique=True)
    title = Column(Text)
    published = Column(DateTime(timezone=True))


class MiscDoc(Base):
    __tablename__ = 'misc_doc'

    id = Column(BigInteger, primary_key=True)
    source_id = Column(BigInteger, ForeignKey('source_item.id', ondelete='CASCADE'), nullable=False)
    path = Column(Text)
    mime_type = Column(Text)


class RssFeed(Base):
    __tablename__ = 'rss_feeds'

    id = Column(BigInteger, primary_key=True)
    url = Column(Text, nullable=False, unique=True)
    title = Column(Text)
    description = Column(Text)
    tags = Column(ARRAY(Text), nullable=False, server_default='{}')
    last_checked_at = Column(DateTime(timezone=True))
    active = Column(Boolean, nullable=False, server_default='true')
    created_at = Column(DateTime(timezone=True), nullable=False, server_default=func.now())
    updated_at = Column(DateTime(timezone=True), nullable=False, server_default=func.now())

src/memory/workers/__init__.py (new file, empty)

src/memory/workers/celery_app.py (new file, 33 lines)
@@ -0,0 +1,33 @@
import os
from celery import Celery


def rabbit_url() -> str:
    user = os.getenv("RABBITMQ_USER", "guest")
    password = os.getenv("RABBITMQ_PASSWORD", "guest")
    return f"amqp://{user}:{password}@rabbitmq:5672//"


app = Celery("memory",
             broker=rabbit_url(),
             backend=os.getenv("CELERY_RESULT_BACKEND",
                               "db+postgresql://kb:kb@postgres/kb"))


app.autodiscover_tasks(["memory.workers.tasks"])


app.conf.update(
    task_acks_late=True,
    task_reject_on_worker_lost=True,
    worker_prefetch_multiplier=1,
    task_routes={
        # Task routing configuration. These patterns match task *names*,
        # so tasks must be named after their module path (see tasks/*.py).
        "memory.workers.tasks.text.*": {"queue": "medium_embed"},
        "memory.workers.tasks.photo.*": {"queue": "photo_embed"},
        "memory.workers.tasks.ocr.*": {"queue": "low_ocr"},
        "memory.workers.tasks.git.*": {"queue": "git_summary"},
        "memory.workers.tasks.rss.*": {"queue": "rss"},
        "memory.workers.tasks.docs.*": {"queue": "docs"},
    },
)

src/memory/workers/tasks/__init__.py (new file, 4 lines)
@@ -0,0 +1,4 @@
"""
Import sub-modules so Celery can register their @app.task decorators.
"""
from memory.workers.tasks import text, photo, ocr, git, rss, docs  # noqa

src/memory/workers/tasks/docs.py (new file, 5 lines)
@@ -0,0 +1,5 @@
from memory.workers.celery_app import app

@app.task(name="memory.workers.tasks.docs.ping")
def ping():
    return "pong"

src/memory/workers/tasks/git.py (new file, 5 lines)
@@ -0,0 +1,5 @@
from memory.workers.celery_app import app

@app.task(name="memory.workers.tasks.git.ping")
def ping():
    return "pong"

src/memory/workers/tasks/ocr.py (new file, 5 lines)
@@ -0,0 +1,5 @@
from memory.workers.celery_app import app

@app.task(name="memory.workers.tasks.ocr.ping")
def ping():
    return "pong"

src/memory/workers/tasks/photo.py (new file, 5 lines)
@@ -0,0 +1,5 @@
from memory.workers.celery_app import app

@app.task(name="memory.workers.tasks.photo.ping")
def ping():
    return "pong"

src/memory/workers/tasks/rss.py (new file, 6 lines)
@@ -0,0 +1,6 @@
from memory.workers.celery_app import app


@app.task(name="memory.workers.tasks.rss.ping")
def ping():
    return "pong"

src/memory/workers/tasks/text.py (new file, 5 lines)
@@ -0,0 +1,5 @@
from memory.workers.celery_app import app

@app.task(name="memory.workers.tasks.text.ping")
def ping():
    return "pong"

tests/__init__.py (new file, 1 empty line)
@@ -0,0 +1 @@