commit a003ada9b75c729204f4650bd69b452487f04217
Author: Daniel O'Connell
Date:   Sun Apr 27 14:31:53 2025 +0200

    initial structure

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..28c5b36
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
.env
.DS_Store
secrets/
acme.json
__pycache__/
\ No newline at end of file
diff --git a/db/schema.sql b/db/schema.sql
new file mode 100644
index 0000000..961a776
--- /dev/null
+++ b/db/schema.sql
@@ -0,0 +1,229 @@
/*========================================================================
  Knowledge-Base schema – first-run script
  ---------------------------------------------------------------
  • PostgreSQL 15+
  • Creates every table, index, trigger and helper in one pass
  • Single first-run script – no follow-up migrations required
  • Enables pgcrypto for UUID helpers (safe to re-run)
 ========================================================================*/

-------------------------------------------------------------------------------
-- 0. EXTENSIONS
-------------------------------------------------------------------------------
CREATE EXTENSION IF NOT EXISTS pgcrypto;   -- gen_random_uuid(), crypt()

-------------------------------------------------------------------------------
-- 1. CANONICAL ARTEFACT TABLE (everything points here)
-------------------------------------------------------------------------------
CREATE TABLE source_item (
    id            BIGSERIAL PRIMARY KEY,
    modality      TEXT        NOT NULL,              -- 'mail'|'chat'|...
    sha256        BYTEA       UNIQUE NOT NULL,       -- 32-byte blob
    inserted_at   TIMESTAMPTZ DEFAULT NOW(),
    tags          TEXT[]      NOT NULL DEFAULT '{}', -- flexible labels
    lang          TEXT,                              -- ISO-639-1 or NULL
    model_hash    TEXT,                              -- embedding model ver.
    vector_ids    TEXT[]      NOT NULL DEFAULT '{}', -- 0-N Qdrant IDs
    embed_status  TEXT        NOT NULL DEFAULT 'RAW'
                  CHECK (embed_status IN ('RAW','QUEUED','STORED','FAILED')),
    byte_length   INTEGER,                           -- original size
    mime_type     TEXT
);

CREATE INDEX source_modality_idx ON source_item (modality);
CREATE INDEX source_status_idx   ON source_item (embed_status);
CREATE INDEX source_tags_idx     ON source_item USING GIN (tags);

-- 1.a Trigger – vector_ids must be present when status = STORED
CREATE OR REPLACE FUNCTION trg_vector_ids_not_empty()
RETURNS TRIGGER LANGUAGE plpgsql AS $$
BEGIN
    IF NEW.embed_status = 'STORED'
       AND (NEW.vector_ids IS NULL OR array_length(NEW.vector_ids,1) = 0) THEN
        RAISE EXCEPTION
          USING MESSAGE = 'vector_ids must not be empty when embed_status = STORED';
    END IF;
    RETURN NEW;
END;
$$;
-- fire on INSERT as well as UPDATE so a row can never be created
-- in the STORED state without vectors
CREATE TRIGGER check_vector_ids
BEFORE INSERT OR UPDATE ON source_item
FOR EACH ROW EXECUTE FUNCTION trg_vector_ids_not_empty();

-------------------------------------------------------------------------------
-- 2. 
MAIL MESSAGES +------------------------------------------------------------------------------- +CREATE TABLE mail_message ( + id BIGSERIAL PRIMARY KEY, + source_id BIGINT NOT NULL REFERENCES source_item ON DELETE CASCADE, + message_id TEXT UNIQUE, + subject TEXT, + sender TEXT, + recipients TEXT[], + sent_at TIMESTAMPTZ, + body_raw TEXT, + attachments JSONB +); + +CREATE INDEX mail_sent_idx ON mail_message (sent_at); +CREATE INDEX mail_recipients_idx ON mail_message USING GIN (recipients); + +ALTER TABLE mail_message + ADD COLUMN tsv tsvector + GENERATED ALWAYS AS ( + to_tsvector('english', + coalesce(subject,'') || ' ' || coalesce(body_raw,''))) + STORED; +CREATE INDEX mail_tsv_idx ON mail_message USING GIN (tsv); + +------------------------------------------------------------------------------- +-- 3. CHAT (Slack / Discord) +------------------------------------------------------------------------------- +CREATE TABLE chat_message ( + id BIGSERIAL PRIMARY KEY, + source_id BIGINT NOT NULL REFERENCES source_item ON DELETE CASCADE, + platform TEXT CHECK (platform IN ('slack','discord')), + channel_id TEXT, + author TEXT, + sent_at TIMESTAMPTZ, + body_raw TEXT +); +CREATE INDEX chat_channel_idx ON chat_message (platform, channel_id); + +------------------------------------------------------------------------------- +-- 4. GIT COMMITS (local repos) +------------------------------------------------------------------------------- +CREATE TABLE git_commit ( + id BIGSERIAL PRIMARY KEY, + source_id BIGINT NOT NULL REFERENCES source_item ON DELETE CASCADE, + repo_path TEXT, + commit_sha TEXT UNIQUE, + author_name TEXT, + author_email TEXT, + author_date TIMESTAMPTZ, + msg_raw TEXT, + diff_summary TEXT, + files_changed TEXT[] +); +CREATE INDEX git_files_idx ON git_commit USING GIN (files_changed); +CREATE INDEX git_date_idx ON git_commit (author_date); + +------------------------------------------------------------------------------- +-- 5. PHOTOS +------------------------------------------------------------------------------- +CREATE TABLE photo ( + id BIGSERIAL PRIMARY KEY, + source_id BIGINT NOT NULL REFERENCES source_item ON DELETE CASCADE, + file_path TEXT, + exif_taken_at TIMESTAMPTZ, + exif_lat NUMERIC(9,6), + exif_lon NUMERIC(9,6), + camera_make TEXT, + camera_model TEXT +); +CREATE INDEX photo_taken_idx ON photo (exif_taken_at); + +------------------------------------------------------------------------------- +-- 6. 
BOOKS, BLOG POSTS, MISC DOCS
-------------------------------------------------------------------------------
CREATE TABLE book_doc (
    id          BIGSERIAL PRIMARY KEY,
    source_id   BIGINT NOT NULL REFERENCES source_item ON DELETE CASCADE,
    title       TEXT,
    author      TEXT,
    chapter     TEXT,
    published   DATE
);

CREATE TABLE blog_post (
    id          BIGSERIAL PRIMARY KEY,
    source_id   BIGINT NOT NULL REFERENCES source_item ON DELETE CASCADE,
    url         TEXT UNIQUE,
    title       TEXT,
    published   TIMESTAMPTZ
);

CREATE TABLE misc_doc (
    id          BIGSERIAL PRIMARY KEY,
    source_id   BIGINT NOT NULL REFERENCES source_item ON DELETE CASCADE,
    path        TEXT,
    mime_type   TEXT
);

-------------------------------------------------------------------------------
-- 6.5 RSS FEEDS
-------------------------------------------------------------------------------
CREATE TABLE rss_feeds (
    id              BIGSERIAL PRIMARY KEY,
    url             TEXT UNIQUE NOT NULL,
    title           TEXT,
    description     TEXT,
    tags            TEXT[] NOT NULL DEFAULT '{}',
    last_checked_at TIMESTAMPTZ,
    active          BOOLEAN NOT NULL DEFAULT TRUE,
    created_at      TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at      TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

CREATE INDEX rss_feeds_active_idx ON rss_feeds (active, last_checked_at);
CREATE INDEX rss_feeds_tags_idx   ON rss_feeds USING GIN (tags);

-------------------------------------------------------------------------------
-- 7. GITHUB ITEMS (issues, PRs, comments, project cards)
-------------------------------------------------------------------------------
CREATE TYPE gh_item_kind AS ENUM ('issue','pr','comment','project_card');

CREATE TABLE github_item (
    id             BIGSERIAL PRIMARY KEY,
    source_id      BIGINT NOT NULL REFERENCES source_item ON DELETE CASCADE,

    kind           gh_item_kind NOT NULL,
    repo_path      TEXT NOT NULL,             -- "owner/repo"
    number         INTEGER,                   -- issue/PR number (NULL for commit comment)
    parent_number  INTEGER,                   -- comment → its issue/PR
    commit_sha     TEXT,                      -- for commit comments
    state          TEXT,                      -- 'open'|'closed'|'merged'
    title          TEXT,
    body_raw       TEXT,
    labels         TEXT[],
    author         TEXT,
    created_at     TIMESTAMPTZ,
    closed_at      TIMESTAMPTZ,
    merged_at      TIMESTAMPTZ,
    diff_summary   TEXT,                      -- PR only

    payload        JSONB                      -- extra GitHub fields
);

CREATE INDEX gh_repo_kind_idx    ON github_item (repo_path, kind);
CREATE INDEX gh_issue_lookup_idx ON github_item (repo_path, kind, number);
CREATE INDEX gh_labels_idx       ON github_item USING GIN (labels);

CREATE INDEX gh_tsv_idx ON github_item
USING GIN (to_tsvector('english',
           coalesce(title,'') || ' ' || coalesce(body_raw,'')))
WHERE kind IN ('issue','pr');

-------------------------------------------------------------------------------
-- 8. HELPER FUNCTION – add tags
-------------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION add_tags(p_source BIGINT, p_tags TEXT[])
RETURNS VOID LANGUAGE SQL AS $$
UPDATE source_item
   SET tags =
       (SELECT ARRAY(SELECT DISTINCT unnest(tags || p_tags)))
 WHERE id = p_source;
$$;

-------------------------------------------------------------------------------
-- 9. 
(optional) PARTITION STUBS – create per-year partitions later
-------------------------------------------------------------------------------
/*
-- example:
CREATE TABLE mail_message_2026 PARTITION OF mail_message
    FOR VALUES FROM ('2026-01-01') TO ('2027-01-01');
*/

-- =========================================================================
--  Schema creation complete
-- =========================================================================
\ No newline at end of file
diff --git a/design/plan.md b/design/plan.md
new file mode 100644
index 0000000..8c6bdfc
--- /dev/null
+++ b/design/plan.md
@@ -0,0 +1,401 @@
# Personal Multimodal Knowledge-Base — Design Document v1.0
*(self-hosted, privacy-first)*

---

## 1 Purpose

Build a single private system that lets **one user** ask natural-language questions (text or image) and instantly search:

* E-mail (IMAP or local mbox)
* Slack & Discord messages
* Git commit history
* Photos (≈ 200 GB)
* Books (EPUB/PDF), important blog posts / RSS items
* Misc. documents & meeting transcripts

…and receive answers that combine the most relevant passages, pictures and commits.

---

## 2 High-level Architecture

```
             ┌──────────────────────────┐
Internet →   │   Ingestion Workers      │
             │   (Celery queues)        │
             │   mail / chat / git / ...│
             └─────┬──────────┬─────────┘
                   │TEXT      │IMAGE/ATT
                   ▼          ▼
            ┌──────────┐  ┌──────────┐
            │Embedding │  │OCR /     │
            │Workers   │  │Vision    │
            └────┬─────┘  └────┬─────┘
                 │ vectors     │ captions/tags
                 ▼             ▼
    Postgres 15 (canonical) ◄───► Qdrant 1.9 (vectors)
    • raw bodies / metadata       • per-modality collections
    • tags[] array, GIN index     • payload filter index
    • LISTEN/NOTIFY queue
                 ▲                     ▲
                 │                     │
                 └──────────┬──────────┘
                            │
                    FastAPI “/chat”
                   (router + merge)
                  + LangChain agent
                  + GPT-4o or local LLaVA
```

*Everything runs in Docker-Compose on a low-power x86 mini-PC (NUC 11/12; 16 GB RAM, 1 TB NVMe).*

---

# Personal Multimodal Knowledge-Base — **Design Document v1.1**

*(self-hosted, privacy-first; incorporates external feedback except the “LLaVA-speed” concern, which is intentionally ignored)*

---

## 1 Purpose

Provide a **single-user** system that answers natural-language questions about the owner’s entire digital corpus—e-mails, chats, code history, photos, books, blog posts, RSS items and ad-hoc documents—while keeping all data fully under personal control. 
+ +--- + +## 2 Target Workload & Service-Levels + +| Metric | Year-1 | 5-year | +|--------|--------|--------| +| Text artefacts | ≈ 5 M | ≈ 25 M | +| Photos | ≈ 200 k (≈ 200 GB) | ≈ 600 k (≈ 600 GB) | +| Concurrency | 1 interactive seat + background jobs | +| **p95 answer latency** | ≤ 2 s (GPT-4o) | +| Uptime goal | “Home-lab” best-effort, but automatic recovery from single-component failures | + +--- + +## 3 Hardware Specification ‼ BREAKING + +| Component | Spec | Notes | +|-----------|------|-------| +| CPU | 8-core / 16-thread (NUC 13 Pro i7 or similar) | +| **RAM** | **32 GB ECC** | +| **GPU** | **Low-profile RTX A2000 (6 GB)** — accelerates CLIP & local LLaVA | +| Storage | 2 TB NVMe (data) + 2 TB SATA SSD (offline backup/ZFS snapshot target) | +| Power | ≈ 10 W idle, 55 W peak | + +--- + +## 4 Software Stack + +| Layer | Tech | +|-------|------| +| OS | Ubuntu 22.04 LTS (automatic security updates) | +| Container runtime | Docker 24 + docker-compose v2 | +| **Message broker** | RabbitMQ 3.13 (priority queues, DLQ) | +| Database | PostgreSQL 15 | +| Vector DB | Qdrant 1.9 | +| Task engine | Celery 5 (broker = RabbitMQ, result-backend = Postgres) | +| Web/API | FastAPI + Uvicorn | +| Back-end LLMs | GPT-4o (API) **and** optional on-device LLaVA-1.6-Q4 (GPU) | +| Embeddings | OpenAI *text-embedding-3-small* (1536 d) • OpenCLIP ViT-B/32 (512 d) | + +--- + +## 5 Data Sources & Ingestion Queues + +| Source | Trigger | Parser | Default **tags[]** | +|--------|---------|--------|--------------------| +| **E-mail IMAP** | UID poll 10 min | `imap_tools` | `work` if address ends `@corp.com` | +| **Slack** | Socket-mode WS | `slack_sdk` | `work` on `#proj-*` | +| **Discord** | Gateway WS | `discord.py` | `personal` | +| **Git** | `post-receive` hook / hourly fetch | `GitPython` + LLM diff summary | `work` if remote host in allow-list | +| **Photos** | `watchdog` folder | `Pillow`, EXIF; CLIP embed; FaceNet | `personal` unless GPS in office polygon | +| **Books (EPUB/PDF)** | Nightly folder scan | `ebooklib`, `pdfminer`, OCR | `reference` | +| **Blog / RSS** | `feedparser` 30 min | `trafilatura` | `reference` | +| **Misc docs / transcripts** | `watchdog` inbox | PDF→OCR, DOCX→txt, VTT→txt | deduced from path | + +--- + +## 6 Data Model + +### 6.1 PostgreSQL (tables share columns) + +```sql +id bigserial primary key, +sha256 bytea unique, +inserted_at timestamptz default now(), +tags text[] not null default '{}', -- flexible labelling +lang text, -- detected language +body_raw text, -- TOAST/LZ4 +vector_ids text[], -- 0-N vectors in Qdrant +model_hash text -- hash of embedding model +``` + +*GIN index on `tags`; range or JSONB indexes where relevant.* + +### 6.2 Qdrant (collections) + +| Collection | Model | Dim | +|------------|-------|-----| +| `mail`, `chat`, `git`, `book`, `blog`, `doc` | *text-embedding-3-small* | 1536 | +| `photo` | OpenCLIP ViT-B/32 | 512 | + +Payload fields: `tags`, per-domain metadata (EXIF, author, files_changed[] …). + +--- + +## 7 Task Queues & Concurrency + +| Celery queue | Priority | Concurrency | Typical load | +|--------------|----------|-------------|--------------| +| `interactive` | 9 | auto (1 per core) | query embedding + GPT-4o calls | +| `medium_embed` | 5 | 4 | mail/chat embeddings | +| `low_ocr` | 2 | **≤ physical cores – 2** | PDF/image OCR | +| `photo_embed_gpu` | 5 | GPU | CLIP image vectors | +| `git_summary` | 4 | 2 | LLM diff summaries | +| All queues have DLQ → `failed_tasks` exchange (RabbitMQ). 
---

## 8 Vector Consistency & Repair

* **Up-front write:** worker inserts into Postgres, then Qdrant; the returned `vector_id` is stored in `vector_ids[]`.
* **Audit cron (5 min):**
  * Find rows where `vector_ids = '{}'` or with `model_hash ≠ CURRENT_HASH`.
  * Re-enqueue to the appropriate embed queue.
* **Qdrant-centric diff (hourly):** dump collection IDs → compare against Postgres; orphans are deleted, missing vectors are re-enqueued.
* **Disaster re-build:** documented script streams `id,chunk_text` to the embed queue (rate-limited).

---

## 9 Embedding-Model Versioning

* Compute `MODEL_HASH = sha256(model_name + version + weights_SHA)` at worker start.
* Model change → hashes differ → audit cron flags rows → background re-embed queue.
* Router refuses to mix hashes unless `ALLOW_MIXED_MODE=1`.

---

## 10 Security Hardening

1. **JWT auth** on all API routes (HS256 secret in Docker secret store).
2. **Rate limiter** (`slowapi`): 60 req / min / IP.
3. **Filesystem isolation**
   * Containers run as UID 1000, read-only bind mounts for `/photos`, `/books`.
4. **TLS everywhere** (Traefik + Let’s Encrypt on LAN or Tailscale certs).
5. **Input sanitisation**: Markdown-escape bodies; regex filter for SSNs/credit-card patterns before LLM prompt.
6. **Resource quotas** in compose (`mem_limit`, `pids_limit`).

---

## 11 Backup & Restore

| Layer | Tool | Frequency | Storage cost (Glacier DA) |
|-------|------|-----------|---------------------------|
| Postgres basebackup + WAL | `pgBackRest` | nightly | included in dataset |
| Qdrant | `qdrant-backup` tar of collection dir | nightly | vectors + graph ≈ 20 GB year-5 |
| Files / attachments | Restic dedup | nightly | 400 GB → ~€1.2 / mo |

Snapshots are pruned on a **grandfather-father-son** schedule (`7-4-6`: 7 daily, 4 weekly, 6 monthly).

Restore script: ① create fresh volumes, ② `pgbackrest restore`, ③ `qdrant-restore`, ④ run audit cron to verify.

---

## 12 Monitoring & Alerting

* **Prometheus exporters**
  * node-exporter, postgres-exporter, rabbitmq-exporter, qdrant-exporter, cadvisor.
* **Grafana dashboards**: CPU, RAM, queue depth, DLQ count, GPT-4o latency.
* **Alertmanager rules**
  * `vector_audit_missing > 500` → warn
  * `node_filesystem_free_percent < 15` → critical
  * `rabbitmq_queue_messages{queue="failed_tasks"} > 0` → critical
  * `pg_up == 0` → critical

---

## 13 Query Flow

1. **Embed** user text with *text-embedding-3* and CLIP-text (one call each).
2. **Determine scope** from conversation memory (`tags = 'work'` etc.).
3. **Async search** each relevant collection (max 3 per batch) with payload filter.
4. **Merge** top-k by score.
5. Build **evidence JSON** (snippets, thumbnails, commit summaries).
6. **LLM**
   * default: GPT-4o (vision) via API
   * offline mode: local LLaVA-1.6 Q4 on GPU
7. Stream answer + thumbnails.

*Expected p95 latency on spec hardware: **~2 s** (cloud) | **~1.4 s** (local LLaVA).*

---

## 14 Internationalisation

* Tesseract language packs specified via `OCR_LANGS=eng+pol+deu`.
* `langdetect` sets the `lang` column; the router boosts same-language chunks (see the sketch below).
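A sketch of the ingest-time detection step this implies, assuming the `langdetect` package; the helper name is illustrative:

```python
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

def detect_language(text: str) -> str | None:
    """Return an ISO-639-1 code ('en', 'pl', 'de', ...) for the lang column."""
    try:
        return detect(text)
    except LangDetectException:
        return None  # too short or ambiguous: leave lang NULL
```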
+ +--- + +## 15 Road-map + +| Phase | Milestones | +|-------|------------| +| **0** | Hardware build, RabbitMQ, base compose up | +| **1** | Mail, chat, photo, git ingestion & audit loop | +| **2** | Backup scripts, security hardening, monitoring | +| **3** | Books/RSS/misc docs + international OCR | +| **4** | Tag-based multi-user RLS (optional) | + +--- + +*End of Design Document v1.1*x + • LISTEN/NOTIFY queue + ▲ ▲ + │ │ + └────────────┬─────────────────────┘ + │ + FastAPI “/chat” + (router + merge) + + LangChain agent + + GPT-4o or local LLaVA +``` + +*Everything runs in Docker-Compose on a low-power x86 mini-PC (NUC 11/12; 16 GB RAM, 1 TB NVMe).* + +--- + +## 3 Data Sources & Ingestion + +| Source | Trigger | Parser / Notes | Stored **tags** (default rules) | +|--------|---------|----------------|---------------------------------| +| **E-mail** (IMAP, mbox) | UID poll 10 min | `imap_tools`, strip quotes | `work` if address ends “@corp.com” | +| **Slack** | Socket-mode WS | `slack_sdk`, flatten blocks | `work` if channel `#proj-*` | +| **Discord** | Gateway WS | `discord.py`, role IDs | `personal` else | +| **Git commits** | `post-receive` hook or hourly fetch | `GitPython` → diff; 3-sentence summary via LLM | `work` if remote in `github.com/corp` | +| **Photos** | `watchdog` on folder | `Pillow`, EXIF; CLIP embed; FaceNet & optional YOLO tagger | `personal` unless GPS inside office | +| **Books** (EPUB/PDF) | Nightly scan of `/books` | `ebooklib` / `pdfminer` (+OCR) | `reference` | +| **Blog / RSS** | `feedparser` every 30 min | `trafilatura` HTML clean | `reference` | +| **Misc. docs / transcripts** | `watchdog` on `/kb-inbox` | PDF->OCR, DOCX→txt, VTT/SRT stitch | inferred from path (`/work/` etc.) | + +--- + +## 4 Storage Model + +### 4.1 PostgreSQL (system-of-record) + +* Base tables: `mail_msg`, `chat_msg`, `git_commit`, `photo`, `book_doc`, `blog_post`, `misc_doc`, `attachment`. +* Common columns: `id bigserial`, `sha256 bytea`, `inserted_at timestamptz`, `tags text[] NOT NULL DEFAULT '{}'`, `vector_ids text[]`. +* All large bodies are **TOAST/LZ4** compressed; photos/attachments > 5 MB stay on disk with a path pointer. +* GIN indexes on `tags` for millisecond filtering. +* LISTEN/NOTIFY drives Celery (no Redis needed, but Redis used by default). + +### 4.2 Qdrant (similarity index) + +| Collection | Model | Dim | Distance | Extra payload | +|------------|-------|-----|----------|---------------| +| `mail` | `text-embedding-3-small` | 1536 | Cosine | `tags`, `folder`, `from` | +| `chat` | same | 1536 | Cosine | `channel_id`, `platform` | +| `git` | same | 1536 | Cosine | `files_changed[]`, `author`, `tags` | +| `photo` | OpenCLIP ViT-B/32 | 512 | Cosine | `exif_date`, `face_id`, `tags` | +| `book`, `blog`, `doc` | same | 1536 | Cosine | `title`, `source_url`, `tags` | + +--- + +## 5 Workers & Queues + +| Queue | Concurrency | Task | Key libs | +|-------|-------------|------|----------| +| `text` | 4 CPU | Chunk + embed text | OpenAI Python SDK | +| `image` | 2 CPU / GPU | Embed photo (CLIP) | `open_clip_torch` | +| `ocr` | 8 CPU | OCR PDF/image | `ocrmypdf`, `tesseract-ocr` | +| `git` | 2 CPU | Diff-summary → embed | GPT-4o mini or Φ-3-mini | +| `rss` | 1 CPU | Fetch feed, parse article | `feedparser`, `trafilatura` | +| `docs` | 2 CPU | Misc file parsing | `pdfminer`, `python-docx` | + +Every queue auto-retries 3× with exponential back-off. + +--- + +## 6 Tagging Framework + +* YAML rule file; fields `sender_regex`, `path_regex`, `channel_regex`, `gps_polygon`, `add_tags[]`. 
* Workers call `apply_tags()` before inserting into Postgres/Qdrant.
* CLI utility `retag add/remove (--where …)` for bulk fixes.
* Tags are free-form strings; new tags require **no schema or index change** — Qdrant builds a bitmap on first use.

---

## 7 Query & Chat Flow

1. **Router** embeds user text with
   * CLIP-text → hits `photo`
   * text-embed-3 → hits all text collections.
2. Applies a user- or conversation-scoped filter, e.g. `{"tags":{"value":"work"}}`.
3. Parallel search (async) → merge top-k by score.
4. Build “evidence bundle” (snippets, thumbs, commit msgs).
5. Feed bundle + question to LLM:
   * cloud GPT-4o (vision) **or**
   * local LLaVA-1.6 + captions.
6. Stream answer & thumbnails back.

Expected latency (NUC, GPT-4o): **≈ 1.3 s p95**.

---

## 8 Back-ups & DR

| Layer | Method | Retention | Cost |
|-------|--------|-----------|------|
| NVMe dataset | Restic dedup ⇒ **S3 Glacier Deep Archive** | 7 daily / 4 weekly / 6 monthly | First snapshot 250 GB → €0.9 / mo; delta ≈ €0.02 / mo |
| Local roll-back | ZFS hourly snapshots (compressed) | 7 days | disk-only |
| Restore test | Quarterly scripted restore to `/tmp/restore-test` | — | — |

---

## 9 Security

* Full-disk LUKS; if on AWS use encrypted EBS + **customer-managed KMS key**.
* Instance in private subnet; access via Tailscale SSH or AWS SSM.
* Docker containers run as non-root; seccomp default profile.
* TLS termination in Traefik with auto-renewing Let’s Encrypt cert on LAN.

---

## 10 Hardware & Performance

| Component | Spec | Head-room |
|-----------|------|-----------|
| Mini-PC | 4-core i5 (11th gen) / 16 GB RAM | p95 memory < 9 GB |
| Storage | 1 TB NVMe + ext. 1 TB SATA for backups | 5-year growth ≤ 400 GB |
| Power | 6 W idle → €1.5 / mo | — |
| GPU (optional) | Used RTX 2060 / T600 | Cuts the first full photo-embed pass to < 1 h |

---

## 11 LLM & Model Abstraction

```python
from abc import ABC, abstractmethod

class EmbedProvider(ABC):
    @abstractmethod
    def embed(self, text: str) -> list[float]: ...

provider = OpenAIProvider(model="text-embedding-3-small")
# swap later:
# provider = OllamaProvider(model="nomic-embed-text")

# injection via environment
EMBED_BACKEND = "openai"   # or "ollama"
```

Same interface for the diff-summariser and chat-LLM; switching is one `docker-compose.yml` env var.

---

## 12 Monitoring & Ops

* **Prometheus + Grafana**: node load, Postgres WAL lag, queue depth.
* **Watchtower** auto-updates images weekly (except Postgres & Qdrant).
* Alertmanager e-mails if free disk < 15 % or any Celery worker dies (a probe sketch follows below).
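One possible shape for the worker-liveness check behind that alert, using Celery’s inspect API; the expected node names are assumptions (they follow the `<queues>@<host>` pattern from `entry.sh`), not an existing script:

```python
import sys

from memory.workers.celery_app import app

# One worker per queue; adjust to the deployment's actual node names.
EXPECTED = {"medium_embed", "photo_embed", "low_ocr", "git_summary", "rss", "docs"}

def main() -> int:
    # ping() returns {"<name>@<host>": {"ok": "pong"}, ...} for live workers
    replies = app.control.inspect(timeout=5.0).ping() or {}
    alive = {node.split("@", 1)[0] for node in replies}
    missing = EXPECTED - alive
    if missing:
        print(f"workers down: {sorted(missing)}", file=sys.stderr)
        return 1  # non-zero exit triggers the alert
    return 0

if __name__ == "__main__":
    sys.exit(main())
```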
+ +--- + +## 13 Roadmap / Open Items + +| Phase | Deliverable | +|-------|-------------| +| **0** (done) | Design document v1.0 | +| **1** | Dockerfiles & compose stack; mail + chat + photo ingestion | +| **2** | Git summariser + OCR worker; tag rules config | +| **3** | Books, RSS, misc docs workers | +| **4** | Live chat UI & LLaVA offline option | +| **5** | Multi-user RLS & optional code-search add-on | + +--- + +*Document ends — save for future implementation.* \ No newline at end of file diff --git a/docker-compose.yaml b/docker-compose.yaml new file mode 100644 index 0000000..d59bd76 --- /dev/null +++ b/docker-compose.yaml @@ -0,0 +1,239 @@ +version: "3.9" + +# --------------------------------------------------------------------- networks +networks: + kbnet: # internal overlay – NOT exposed + driver: bridge + +# --------------------------------------------------------------------- secrets +secrets: + postgres_password: {file: ./secrets/postgres_password.txt} + jwt_secret: {file: ./secrets/jwt_secret.txt} + openai_key: {file: ./secrets/openai_key.txt} + +# --------------------------------------------------------------------- volumes +volumes: + db_data: {} # Postgres + qdrant_data: {} # Qdrant + rabbitmq_data: {} # RabbitMQ + +# ------------------------------ X-templates ---------------------------- +x-common-env: &env + RABBITMQ_USER: kb + TZ: "Etc/UTC" + + +x-worker-base: &worker-base + build: + context: . + dockerfile: docker/workers/Dockerfile + restart: unless-stopped + networks: [kbnet] + security_opt: ["no-new-privileges=true"] + depends_on: [postgres, rabbitmq, qdrant] + env_file: [.env] + environment: &worker-env + <<: *env + POSTGRES_PASSWORD_FILE: /run/secrets/postgres_password + # DSNs are built in worker entrypoint from user + pw files + QDRANT_URL: http://qdrant:6333 + OPENAI_API_KEY_FILE: /run/secrets/openai_key + secrets: [postgres_password, openai_key] + read_only: true + tmpfs: [/tmp,/var/tmp] + cap_drop: [ALL] + logging: + options: {max-size: "10m", max-file: "3"} + +# ================================ SERVICES ============================ + +services: +# ----------------------------------------------------------------- data layer + postgres: + image: postgres:15 + restart: unless-stopped + networks: [kbnet] + environment: + <<: *env + POSTGRES_USER: kb + POSTGRES_PASSWORD_FILE: /run/secrets/postgres_password + POSTGRES_DB: kb + secrets: [postgres_password] + volumes: + - db_data:/var/lib/postgresql/data:rw + - ./db:/docker-entrypoint-initdb.d:ro + healthcheck: + test: ["CMD-SHELL", "pg_isready -U kb"] + interval: 10s + timeout: 5s + retries: 5 + mem_limit: 4g + cpus: "1.5" + security_opt: ["no-new-privileges=true"] + + rabbitmq: + image: rabbitmq:3.13-management + restart: unless-stopped + networks: [kbnet] + environment: + <<: *env + RABBITMQ_DEFAULT_USER: "kb" + RABBITMQ_DEFAULT_PASS: "${RABBITMQ_PASSWORD}" + volumes: + - rabbitmq_data:/var/lib/rabbitmq:rw + healthcheck: + test: ["CMD", "rabbitmq-diagnostics", "ping"] + interval: 15s + timeout: 5s + retries: 5 + mem_limit: 512m + cpus: "0.5" + security_opt: ["no-new-privileges=true"] + ports: # UI only on localhost + - "127.0.0.1:15672:15672" + + qdrant: + image: qdrant/qdrant:v1.14.0 + restart: unless-stopped + networks: [kbnet] + volumes: + - qdrant_data:/qdrant/storage:rw + tmpfs: + - /tmp + - /var/tmp + - /qdrant/snapshots:rw + healthcheck: + test: ["CMD", "wget", "-q", "-T", "2", "-O", "-", "localhost:6333/ready"] + interval: 15s + timeout: 5s + retries: 5 + mem_limit: 4g + cpus: "2" + security_opt: 
["no-new-privileges=true"] + cap_drop: [ALL] + +# ------------------------------------------------------------ API / gateway + # api: + # build: + # context: . + # dockerfile: docker/api/Dockerfile + # restart: unless-stopped + # networks: [kbnet] + # depends_on: [postgres, rabbitmq, qdrant] + # environment: + # <<: *env + # JWT_SECRET_FILE: /run/secrets/jwt_secret + # OPENAI_API_KEY_FILE: /run/secrets/openai_key + # POSTGRES_PASSWORD_FILE: /run/secrets/postgres_password + # QDRANT_URL: http://qdrant:6333 + # secrets: [jwt_secret, openai_key, postgres_password] + # healthcheck: + # test: ["CMD-SHELL", "curl -fs http://localhost:8000/health || exit 1"] + # interval: 15s + # timeout: 5s + # retries: 5 + # mem_limit: 768m + # cpus: "1" + # labels: + # - "traefik.enable=true" + # - "traefik.http.routers.kb.rule=Host(`${TRAEFIK_DOMAIN}`)" + # - "traefik.http.routers.kb.entrypoints=websecure" + # - "traefik.http.services.kb.loadbalancer.server.port=8000" + + traefik: + image: traefik:v3.0 + restart: unless-stopped + networks: [kbnet] + command: + - "--providers.docker=true" + - "--providers.docker.network=kbnet" + - "--entrypoints.web.address=:80" + - "--entrypoints.websecure.address=:443" + # - "--certificatesresolvers.le.acme.httpchallenge=true" + # - "--certificatesresolvers.le.acme.httpchallenge.entrypoint=web" + # - "--certificatesresolvers.le.acme.email=${LE_EMAIL}" + # - "--certificatesresolvers.le.acme.storage=/acme.json" + - "--log.level=INFO" + ports: + - "80:80" + - "443:443" + volumes: + - /var/run/docker.sock:/var/run/docker.sock:ro + # - ./acme.json:/acme.json:rw + +# ------------------------------------------------------------ Celery workers + worker-text: + <<: *worker-base + environment: + <<: *worker-env + QUEUES: "medium_embed" + deploy: {resources: {limits: {cpus: "2", memory: 3g}}} + + worker-photo: + <<: *worker-base + environment: + <<: *worker-env + QUEUES: "photo_embed" + deploy: {resources: {limits: {cpus: "4", memory: 4g}}} + + worker-ocr: + <<: *worker-base + environment: + <<: *worker-env + QUEUES: "low_ocr" + deploy: {resources: {limits: {cpus: "4", memory: 4g}}} + + worker-git: + <<: *worker-base + environment: + <<: *worker-env + QUEUES: "git_summary" + deploy: {resources: {limits: {cpus: "1", memory: 1g}}} + + worker-rss: + <<: *worker-base + environment: + <<: *worker-env + QUEUES: "rss" + deploy: {resources: {limits: {cpus: "0.5", memory: 512m}}} + + worker-docs: + <<: *worker-base + environment: + <<: *worker-env + QUEUES: "docs" + deploy: {resources: {limits: {cpus: "1", memory: 1g}}} + +# ------------------------------------------------------------ watchtower (auto-update) + watchtower: + image: containrrr/watchtower + restart: unless-stopped + command: ["--schedule", "0 0 4 * * *", "--cleanup"] + volumes: ["/var/run/docker.sock:/var/run/docker.sock:ro"] + networks: [kbnet] + +# ------------------------------------------------------------------- profiles: observability (opt-in) +# services: +# prometheus: +# image: prom/prometheus:v2.52 +# profiles: ["obs"] +# networks: [kbnet] +# volumes: [./observability/prometheus.yml:/etc/prometheus/prometheus.yml:ro] +# restart: unless-stopped +# ports: ["127.0.0.1:9090:9090"] + +# grafana: +# image: grafana/grafana:10 +# profiles: ["obs"] +# networks: [kbnet] +# volumes: [./observability/grafana:/var/lib/grafana] +# restart: unless-stopped +# environment: +# GF_SECURITY_ADMIN_USER: admin +# GF_SECURITY_ADMIN_PASSWORD_FILE: /run/secrets/grafana_pw +# secrets: [grafana_pw] +# ports: ["127.0.0.1:3000:3000"] + +# secrets: 
# extra secret for Grafana, not needed otherwise
#   grafana_pw:
#     file: ./secrets/grafana_pw.txt
diff --git a/docker/api/Dockerfile b/docker/api/Dockerfile
new file mode 100644
index 0000000..7535a5a
--- /dev/null
+++ b/docker/api/Dockerfile
@@ -0,0 +1,24 @@
FROM python:3.10-slim

WORKDIR /app

# Copy requirements files and setup
COPY requirements-*.txt ./
COPY setup.py ./
COPY src/ ./src/

# Install the package with API dependencies
RUN pip install -e ".[api]"

# Run as non-root user
RUN useradd -m appuser
USER appuser

# Set environment variables
ENV PORT=8000
ENV PYTHONPATH="/app"

EXPOSE 8000

# Run the API
CMD ["uvicorn", "memory.api.app:app", "--host", "0.0.0.0", "--port", "8000"]
\ No newline at end of file
diff --git a/docker/workers/Dockerfile b/docker/workers/Dockerfile
new file mode 100644
index 0000000..b0e9cae
--- /dev/null
+++ b/docker/workers/Dockerfile
@@ -0,0 +1,28 @@
FROM python:3.11-slim

WORKDIR /app

# Copy requirements files and setup
COPY requirements-*.txt ./
COPY setup.py ./
COPY src/ ./src/

# Install dependencies
RUN apt-get update && apt-get install -y \
    libpq-dev gcc && \
    pip install -e ".[workers]" && \
    apt-get purge -y gcc && apt-get autoremove -y && rm -rf /var/lib/apt/lists/*

# Create and copy entrypoint script
COPY docker/workers/entry.sh ./entry.sh
RUN chmod +x entry.sh

# Create user and set permissions
RUN useradd -m kb && chown -R kb /app
USER kb

# Default queues to process
ENV QUEUES="medium_embed,photo_embed,low_ocr,git_summary,rss,docs"
ENV PYTHONPATH="/app"

ENTRYPOINT ["./entry.sh"]
\ No newline at end of file
diff --git a/docker/workers/entry.sh b/docker/workers/entry.sh
new file mode 100644
index 0000000..778af8b
--- /dev/null
+++ b/docker/workers/entry.sh
@@ -0,0 +1,14 @@
#!/usr/bin/env bash
set -euo pipefail

QUEUES=${QUEUES:-default}
CONCURRENCY=${CONCURRENCY:-2}
LOGLEVEL=${LOGLEVEL:-INFO}

# Materialise file-based secrets (docker-compose passes *_FILE paths that
# point at /run/secrets/*) so the Python code can read plain env vars.
if [[ -n "${POSTGRES_PASSWORD_FILE:-}" && -f "${POSTGRES_PASSWORD_FILE}" ]]; then
    export POSTGRES_PASSWORD="$(<"${POSTGRES_PASSWORD_FILE}")"
fi
if [[ -n "${OPENAI_API_KEY_FILE:-}" && -f "${OPENAI_API_KEY_FILE}" ]]; then
    export OPENAI_API_KEY="$(<"${OPENAI_API_KEY_FILE}")"
fi

# Use the first queue name as the Celery node prefix
HOSTNAME="${QUEUES%%,*}@$(hostname)"

exec celery -A memory.workers.celery_app worker \
    -Q "${QUEUES}" \
    --concurrency="${CONCURRENCY}" \
    --hostname="${HOSTNAME}" \
    --loglevel="${LOGLEVEL}"
\ No newline at end of file
diff --git a/requirements-api.txt b/requirements-api.txt
new file mode 100644
index 0000000..e7884d4
--- /dev/null
+++ b/requirements-api.txt
@@ -0,0 +1,4 @@
fastapi==0.112.2
uvicorn==0.29.0
python-jose==3.3.0
python-multipart==0.0.9
\ No newline at end of file
diff --git a/requirements-common.txt b/requirements-common.txt
new file mode 100644
index 0000000..965c2ba
--- /dev/null
+++ b/requirements-common.txt
@@ -0,0 +1,3 @@
sqlalchemy==2.0.30
psycopg2-binary==2.9.9
pydantic==2.7.1
\ No newline at end of file
diff --git a/requirements-workers.txt b/requirements-workers.txt
new file mode 100644
index 0000000..9423d63
--- /dev/null
+++ b/requirements-workers.txt
@@ -0,0 +1,4 @@
celery==5.3.6
openai==1.25.0
pillow==10.3.0
qdrant-client==1.9.0
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..7965157
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,31 @@
import pathlib
from setuptools import setup, find_namespace_packages


def read_requirements(filename: str) -> list[str]:
    """Read requirements from file, ignoring comments and -r directives."""
    filename = pathlib.Path(filename)
    return [
        line.strip()
        for line in filename.read_text().splitlines()
        if line.strip() and not line.strip().startswith(('#', '-r'))
    ]


# Read requirements files
common_requires = read_requirements('requirements-common.txt') 
api_requires = read_requirements('requirements-api.txt')
workers_requires = read_requirements('requirements-workers.txt')

setup(
    name="memory",
    version="0.1.0",
    package_dir={"": "src"},
    packages=find_namespace_packages(where="src"),
    python_requires=">=3.10",
    extras_require={
        "api": api_requires + common_requires,
        "workers": workers_requires + common_requires,
        "common": common_requires,
    },
)
\ No newline at end of file
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..2ca6f8e
--- /dev/null
+++ b/src/__init__.py
@@ -0,0 +1,5 @@
"""
Memory knowledge-base application.
"""

__version__ = "0.1.0"
diff --git a/src/memory/__init__.py b/src/memory/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/src/memory/__init__.py
@@ -0,0 +1 @@

diff --git a/src/memory/api/__init__.py b/src/memory/api/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/memory/api/app.py b/src/memory/api/app.py
new file mode 100644
index 0000000..7c286b7
--- /dev/null
+++ b/src/memory/api/app.py
@@ -0,0 +1,50 @@
"""
FastAPI application for the knowledge base.
"""
from fastapi import FastAPI, Depends, HTTPException
from sqlalchemy.orm import Session

from memory.common.db import get_scoped_session
from memory.common.db.models import SourceItem


app = FastAPI(title="Knowledge Base API")


def get_db():
    """Database session dependency"""
    db = get_scoped_session()
    try:
        yield db
    finally:
        db.close()


@app.get("/health")
def health_check():
    """Simple health check endpoint"""
    return {"status": "healthy"}


@app.get("/sources")
def list_sources(
    tag: str | None = None,
    limit: int = 100,
    db: Session = Depends(get_db)
):
    """List source items, optionally filtered by tag"""
    query = db.query(SourceItem)

    if tag:
        query = query.filter(SourceItem.tags.contains([tag]))

    return query.limit(limit).all()


@app.get("/sources/{source_id}")
def get_source(source_id: int, db: Session = Depends(get_db)):
    """Get a specific source by ID"""
    source = db.query(SourceItem).filter(SourceItem.id == source_id).first()
    if not source:
        raise HTTPException(status_code=404, detail="Source not found")
    return source
\ No newline at end of file
diff --git a/src/memory/common/__init__.py b/src/memory/common/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/memory/common/db/__init__.py b/src/memory/common/db/__init__.py
new file mode 100644
index 0000000..7d49f32
--- /dev/null
+++ b/src/memory/common/db/__init__.py
@@ -0,0 +1,12 @@
"""
Database utilities package.
"""
from memory.common.db.models import Base
from memory.common.db.connection import get_engine, get_session_factory, get_scoped_session

__all__ = [
    "Base",
    "get_engine",
    "get_session_factory",
    "get_scoped_session",
]
\ No newline at end of file
diff --git a/src/memory/common/db/connection.py b/src/memory/common/db/connection.py
new file mode 100644
index 0000000..b21ea44
--- /dev/null
+++ b/src/memory/common/db/connection.py
@@ -0,0 +1,32 @@
"""
Database connection utilities. 
+""" +import os + +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker, scoped_session + + +def get_engine(): + """Create SQLAlchemy engine from environment variables""" + user = os.getenv("POSTGRES_USER", "kb") + password = os.getenv("POSTGRES_PASSWORD", "kb") + host = os.getenv("POSTGRES_HOST", "postgres") + port = os.getenv("POSTGRES_PORT", "5432") + db = os.getenv("POSTGRES_DB", "kb") + + return create_engine(f"postgresql://{user}:{password}@{host}:{port}/{db}") + + +def get_session_factory(): + """Create a session factory for SQLAlchemy sessions""" + engine = get_engine() + session_factory = sessionmaker(bind=engine) + return session_factory + + +def get_scoped_session(): + """Create a thread-local scoped session factory""" + engine = get_engine() + session_factory = sessionmaker(bind=engine) + return scoped_session(session_factory) \ No newline at end of file diff --git a/src/memory/common/db/models.py b/src/memory/common/db/models.py new file mode 100644 index 0000000..20af8bb --- /dev/null +++ b/src/memory/common/db/models.py @@ -0,0 +1,127 @@ +""" +Database models for the knowledge base system. +""" +from sqlalchemy import ( + Column, ForeignKey, Integer, BigInteger, Text, DateTime, Boolean, Float, + ARRAY, func +) +from sqlalchemy.dialects.postgresql import BYTEA, JSONB, TSVECTOR +from sqlalchemy.ext.declarative import declarative_base + + +Base = declarative_base() + + +class SourceItem(Base): + __tablename__ = 'source_item' + + id = Column(BigInteger, primary_key=True) + modality = Column(Text, nullable=False) + sha256 = Column(BYTEA, nullable=False, unique=True) + inserted_at = Column(DateTime(timezone=True), server_default=func.now()) + tags = Column(ARRAY(Text), nullable=False, server_default='{}') + lang = Column(Text) + model_hash = Column(Text) + vector_ids = Column(ARRAY(Text), nullable=False, server_default='{}') + embed_status = Column(Text, nullable=False, server_default='RAW') + byte_length = Column(Integer) + mime_type = Column(Text) + + +class MailMessage(Base): + __tablename__ = 'mail_message' + + id = Column(BigInteger, primary_key=True) + source_id = Column(BigInteger, ForeignKey('source_item.id', ondelete='CASCADE'), nullable=False) + message_id = Column(Text, unique=True) + subject = Column(Text) + sender = Column(Text) + recipients = Column(ARRAY(Text)) + sent_at = Column(DateTime(timezone=True)) + body_raw = Column(Text) + attachments = Column(JSONB) + tsv = Column(TSVECTOR) + + +class ChatMessage(Base): + __tablename__ = 'chat_message' + + id = Column(BigInteger, primary_key=True) + source_id = Column(BigInteger, ForeignKey('source_item.id', ondelete='CASCADE'), nullable=False) + platform = Column(Text) + channel_id = Column(Text) + author = Column(Text) + sent_at = Column(DateTime(timezone=True)) + body_raw = Column(Text) + + +class GitCommit(Base): + __tablename__ = 'git_commit' + + id = Column(BigInteger, primary_key=True) + source_id = Column(BigInteger, ForeignKey('source_item.id', ondelete='CASCADE'), nullable=False) + repo_path = Column(Text) + commit_sha = Column(Text, unique=True) + author_name = Column(Text) + author_email = Column(Text) + author_date = Column(DateTime(timezone=True)) + msg_raw = Column(Text) + diff_summary = Column(Text) + files_changed = Column(ARRAY(Text)) + + +class Photo(Base): + __tablename__ = 'photo' + + id = Column(BigInteger, primary_key=True) + source_id = Column(BigInteger, ForeignKey('source_item.id', ondelete='CASCADE'), nullable=False) + file_path = Column(Text) + exif_taken_at = 
Column(DateTime(timezone=True))
    # NUMERIC(9,6) in the schema; Numeric preserves full EXIF GPS precision
    exif_lat = Column(Numeric(9, 6))
    exif_lon = Column(Numeric(9, 6))
    camera_make = Column(Text)
    camera_model = Column(Text)


class BookDoc(Base):
    __tablename__ = 'book_doc'

    id = Column(BigInteger, primary_key=True)
    source_id = Column(BigInteger, ForeignKey('source_item.id', ondelete='CASCADE'), nullable=False)
    title = Column(Text)
    author = Column(Text)
    chapter = Column(Text)
    published = Column(Date)  # DATE in the schema


class BlogPost(Base):
    __tablename__ = 'blog_post'

    id = Column(BigInteger, primary_key=True)
    source_id = Column(BigInteger, ForeignKey('source_item.id', ondelete='CASCADE'), nullable=False)
    url = Column(Text, unique=True)
    title = Column(Text)
    published = Column(DateTime(timezone=True))


class MiscDoc(Base):
    __tablename__ = 'misc_doc'

    id = Column(BigInteger, primary_key=True)
    source_id = Column(BigInteger, ForeignKey('source_item.id', ondelete='CASCADE'), nullable=False)
    path = Column(Text)
    mime_type = Column(Text)


class RssFeed(Base):
    __tablename__ = 'rss_feeds'

    id = Column(BigInteger, primary_key=True)
    url = Column(Text, nullable=False, unique=True)
    title = Column(Text)
    description = Column(Text)
    tags = Column(ARRAY(Text), nullable=False, server_default='{}')
    last_checked_at = Column(DateTime(timezone=True))
    active = Column(Boolean, nullable=False, server_default='true')
    created_at = Column(DateTime(timezone=True), nullable=False, server_default=func.now())
    updated_at = Column(DateTime(timezone=True), nullable=False, server_default=func.now())
\ No newline at end of file
diff --git a/src/memory/workers/__init__.py b/src/memory/workers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/memory/workers/celery_app.py b/src/memory/workers/celery_app.py
new file mode 100644
index 0000000..901f298
--- /dev/null
+++ b/src/memory/workers/celery_app.py
@@ -0,0 +1,33 @@
import os
from celery import Celery


def rabbit_url() -> str:
    user = os.getenv("RABBITMQ_USER", "guest")
    password = os.getenv("RABBITMQ_PASSWORD", "guest")
    return f"amqp://{user}:{password}@rabbitmq:5672//"


app = Celery("memory",
             broker=rabbit_url(),
             backend=os.getenv("CELERY_RESULT_BACKEND",
                               "db+postgresql://kb:kb@postgres/kb"))


app.autodiscover_tasks(["memory.workers.tasks"])


app.conf.update(
    task_acks_late=True,
    task_reject_on_worker_lost=True,
    worker_prefetch_multiplier=1,
    task_routes={
        # Routes match on task *name*; the task modules below rely on the
        # default module-based names (memory.workers.tasks.<module>.<func>)
        # so these patterns actually apply.
        "memory.workers.tasks.text.*": {"queue": "medium_embed"},
        "memory.workers.tasks.photo.*": {"queue": "photo_embed"},
        "memory.workers.tasks.ocr.*": {"queue": "low_ocr"},
        "memory.workers.tasks.git.*": {"queue": "git_summary"},
        "memory.workers.tasks.rss.*": {"queue": "rss"},
        "memory.workers.tasks.docs.*": {"queue": "docs"},
    },
)
diff --git a/src/memory/workers/tasks/__init__.py b/src/memory/workers/tasks/__init__.py
new file mode 100644
index 0000000..40f1244
--- /dev/null
+++ b/src/memory/workers/tasks/__init__.py
@@ -0,0 +1,4 @@
"""
Import sub-modules so Celery can register their @app.task decorators. 
+""" +from memory.workers.tasks import text, photo, ocr, git, rss, docs # noqa \ No newline at end of file diff --git a/src/memory/workers/tasks/docs.py b/src/memory/workers/tasks/docs.py new file mode 100644 index 0000000..32cf5ea --- /dev/null +++ b/src/memory/workers/tasks/docs.py @@ -0,0 +1,5 @@ +from memory.workers.celery_app import app + +@app.task(name="kb.text.ping") +def ping(): + return "pong" \ No newline at end of file diff --git a/src/memory/workers/tasks/git.py b/src/memory/workers/tasks/git.py new file mode 100644 index 0000000..32cf5ea --- /dev/null +++ b/src/memory/workers/tasks/git.py @@ -0,0 +1,5 @@ +from memory.workers.celery_app import app + +@app.task(name="kb.text.ping") +def ping(): + return "pong" \ No newline at end of file diff --git a/src/memory/workers/tasks/ocr.py b/src/memory/workers/tasks/ocr.py new file mode 100644 index 0000000..32cf5ea --- /dev/null +++ b/src/memory/workers/tasks/ocr.py @@ -0,0 +1,5 @@ +from memory.workers.celery_app import app + +@app.task(name="kb.text.ping") +def ping(): + return "pong" \ No newline at end of file diff --git a/src/memory/workers/tasks/photo.py b/src/memory/workers/tasks/photo.py new file mode 100644 index 0000000..32cf5ea --- /dev/null +++ b/src/memory/workers/tasks/photo.py @@ -0,0 +1,5 @@ +from memory.workers.celery_app import app + +@app.task(name="kb.text.ping") +def ping(): + return "pong" \ No newline at end of file diff --git a/src/memory/workers/tasks/rss.py b/src/memory/workers/tasks/rss.py new file mode 100644 index 0000000..dfa37ce --- /dev/null +++ b/src/memory/workers/tasks/rss.py @@ -0,0 +1,6 @@ +from memory.workers.celery_app import app + + +@app.task(name="kb.text.ping") +def ping(): + return "pong" \ No newline at end of file diff --git a/src/memory/workers/tasks/text.py b/src/memory/workers/tasks/text.py new file mode 100644 index 0000000..0a34ff8 --- /dev/null +++ b/src/memory/workers/tasks/text.py @@ -0,0 +1,5 @@ +from memory.workers.celery_app import app + +@app.task(name="memory.text.ping") +def ping(): + return "pong" \ No newline at end of file diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +