memory/db/schema.sql
2025-04-27 14:31:53 +02:00

229 lines
9.1 KiB
PL/PgSQL
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*========================================================================
Knowledge-Base schema first-run script
---------------------------------------------------------------
• PostgreSQL 15+
• Creates every table, index, trigger and helper in one pass
• No ALTER statements or later migrations required
• Enable pgcrypto for UUID helpers (safe to re-run)
========================================================================*/
-------------------------------------------------------------------------------
-- 0. EXTENSIONS
-------------------------------------------------------------------------------
CREATE EXTENSION IF NOT EXISTS pgcrypto; -- gen_random_uuid(), crypt()
-------------------------------------------------------------------------------
-- 1. CANONICAL ARTEFACT TABLE (everything points here)
-------------------------------------------------------------------------------
CREATE TABLE source_item (
id BIGSERIAL PRIMARY KEY,
modality TEXT NOT NULL, -- 'mail'|'chat'|...
sha256 BYTEA UNIQUE NOT NULL, -- 32-byte blob
inserted_at TIMESTAMPTZ DEFAULT NOW(),
tags TEXT[] NOT NULL DEFAULT '{}', -- flexible labels
lang TEXT, -- ISO-639-1 or NULL
model_hash TEXT, -- embedding model ver.
vector_ids TEXT[] NOT NULL DEFAULT '{}', -- 0-N Qdrant IDs
embed_status TEXT NOT NULL DEFAULT 'RAW'
CHECK (embed_status IN ('RAW','QUEUED','STORED','FAILED')),
byte_length INTEGER, -- original size
mime_type TEXT
);
CREATE INDEX source_modality_idx ON source_item (modality);
CREATE INDEX source_status_idx ON source_item (embed_status);
CREATE INDEX source_tags_idx ON source_item USING GIN (tags);
-- 1.a Trigger vector_ids must be present when status = STORED
CREATE OR REPLACE FUNCTION trg_vector_ids_not_empty()
RETURNS TRIGGER LANGUAGE plpgsql AS $$
BEGIN
IF NEW.embed_status = 'STORED'
AND (NEW.vector_ids IS NULL OR array_length(NEW.vector_ids,1) = 0) THEN
RAISE EXCEPTION
USING MESSAGE = 'vector_ids must not be empty when embed_status = STORED';
END IF;
RETURN NEW;
END;
$$;
CREATE TRIGGER check_vector_ids
BEFORE UPDATE ON source_item
FOR EACH ROW EXECUTE FUNCTION trg_vector_ids_not_empty();
-------------------------------------------------------------------------------
-- 2. MAIL MESSAGES
-------------------------------------------------------------------------------
CREATE TABLE mail_message (
id BIGSERIAL PRIMARY KEY,
source_id BIGINT NOT NULL REFERENCES source_item ON DELETE CASCADE,
message_id TEXT UNIQUE,
subject TEXT,
sender TEXT,
recipients TEXT[],
sent_at TIMESTAMPTZ,
body_raw TEXT,
attachments JSONB
);
CREATE INDEX mail_sent_idx ON mail_message (sent_at);
CREATE INDEX mail_recipients_idx ON mail_message USING GIN (recipients);
ALTER TABLE mail_message
ADD COLUMN tsv tsvector
GENERATED ALWAYS AS (
to_tsvector('english',
coalesce(subject,'') || ' ' || coalesce(body_raw,'')))
STORED;
CREATE INDEX mail_tsv_idx ON mail_message USING GIN (tsv);
-------------------------------------------------------------------------------
-- 3. CHAT (Slack / Discord)
-------------------------------------------------------------------------------
CREATE TABLE chat_message (
id BIGSERIAL PRIMARY KEY,
source_id BIGINT NOT NULL REFERENCES source_item ON DELETE CASCADE,
platform TEXT CHECK (platform IN ('slack','discord')),
channel_id TEXT,
author TEXT,
sent_at TIMESTAMPTZ,
body_raw TEXT
);
CREATE INDEX chat_channel_idx ON chat_message (platform, channel_id);
-------------------------------------------------------------------------------
-- 4. GIT COMMITS (local repos)
-------------------------------------------------------------------------------
CREATE TABLE git_commit (
id BIGSERIAL PRIMARY KEY,
source_id BIGINT NOT NULL REFERENCES source_item ON DELETE CASCADE,
repo_path TEXT,
commit_sha TEXT UNIQUE,
author_name TEXT,
author_email TEXT,
author_date TIMESTAMPTZ,
msg_raw TEXT,
diff_summary TEXT,
files_changed TEXT[]
);
CREATE INDEX git_files_idx ON git_commit USING GIN (files_changed);
CREATE INDEX git_date_idx ON git_commit (author_date);
-------------------------------------------------------------------------------
-- 5. PHOTOS
-------------------------------------------------------------------------------
CREATE TABLE photo (
id BIGSERIAL PRIMARY KEY,
source_id BIGINT NOT NULL REFERENCES source_item ON DELETE CASCADE,
file_path TEXT,
exif_taken_at TIMESTAMPTZ,
exif_lat NUMERIC(9,6),
exif_lon NUMERIC(9,6),
camera_make TEXT,
camera_model TEXT
);
CREATE INDEX photo_taken_idx ON photo (exif_taken_at);
-------------------------------------------------------------------------------
-- 6. BOOKS, BLOG POSTS, MISC DOCS
-------------------------------------------------------------------------------
CREATE TABLE book_doc (
id BIGSERIAL PRIMARY KEY,
source_id BIGINT NOT NULL REFERENCES source_item ON DELETE CASCADE,
title TEXT,
author TEXT,
chapter TEXT,
published DATE
);
CREATE TABLE blog_post (
id BIGSERIAL PRIMARY KEY,
source_id BIGINT NOT NULL REFERENCES source_item ON DELETE CASCADE,
url TEXT UNIQUE,
title TEXT,
published TIMESTAMPTZ
);
CREATE TABLE misc_doc (
id BIGSERIAL PRIMARY KEY,
source_id BIGINT NOT NULL REFERENCES source_item ON DELETE CASCADE,
path TEXT,
mime_type TEXT
);
-------------------------------------------------------------------------------
-- 6.5 RSS FEEDS
-------------------------------------------------------------------------------
CREATE TABLE rss_feeds (
id BIGSERIAL PRIMARY KEY,
url TEXT UNIQUE NOT NULL,
title TEXT,
description TEXT,
tags TEXT[] NOT NULL DEFAULT '{}',
last_checked_at TIMESTAMPTZ,
active BOOLEAN NOT NULL DEFAULT TRUE,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE INDEX rss_feeds_active_idx ON rss_feeds (active, last_checked_at);
CREATE INDEX rss_feeds_tags_idx ON rss_feeds USING GIN (tags);
-------------------------------------------------------------------------------
-- 7. GITHUB ITEMS (issues, PRs, comments, project cards)
-------------------------------------------------------------------------------
CREATE TYPE gh_item_kind AS ENUM ('issue','pr','comment','project_card');
CREATE TABLE github_item (
id BIGSERIAL PRIMARY KEY,
source_id BIGINT NOT NULL REFERENCES source_item ON DELETE CASCADE,
kind gh_item_kind NOT NULL,
repo_path TEXT NOT NULL, -- "owner/repo"
number INTEGER, -- issue/PR number (NULL for commit comment)
parent_number INTEGER, -- comment → its issue/PR
commit_sha TEXT, -- for commit comments
state TEXT, -- 'open'|'closed'|'merged'
title TEXT,
body_raw TEXT,
labels TEXT[],
author TEXT,
created_at TIMESTAMPTZ,
closed_at TIMESTAMPTZ,
merged_at TIMESTAMPTZ,
diff_summary TEXT, -- PR only
payload JSONB -- extra GitHub fields
);
CREATE INDEX gh_repo_kind_idx ON github_item (repo_path, kind);
CREATE INDEX gh_issue_lookup_idx ON github_item (repo_path, kind, number);
CREATE INDEX gh_labels_idx ON github_item USING GIN (labels);
CREATE INDEX gh_tsv_idx ON github_item
WHERE kind IN ('issue','pr')
USING GIN (to_tsvector('english',
coalesce(title,'') || ' ' || coalesce(body_raw,'')));
-------------------------------------------------------------------------------
-- 8. HELPER FUNCTION add tags
-------------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION add_tags(p_source BIGINT, p_tags TEXT[])
RETURNS VOID LANGUAGE SQL AS $$
UPDATE source_item
SET tags =
(SELECT ARRAY(SELECT DISTINCT unnest(tags || p_tags)))
WHERE id = p_source;
$$;
-------------------------------------------------------------------------------
-- 9. (optional) PARTITION STUBS create per-year partitions later
-------------------------------------------------------------------------------
/*
-- example:
CREATE TABLE mail_message_2026 PARTITION OF mail_message
FOR VALUES FROM ('2026-01-01') TO ('2027-01-01');
*/
-- =========================================================================
-- Schema creation complete
-- =========================================================================