feat: add db schema, init, and vitest config
This commit is contained in:
parent
74cfc82bdd
commit
4fa3073412
13 changed files with 248 additions and 8 deletions
24
data-pipeline/db/index.ts
Normal file
24
data-pipeline/db/index.ts
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
import path from "node:path";
|
||||
import { fileURLToPath } from "node:url";
|
||||
import Database from "better-sqlite3";
|
||||
|
||||
// ── Paths ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
// ES modules do not provide __dirname, so derive this module's directory from import.meta.url.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
||||
|
||||
// Location of the SQLite database file: "pipeline.db" in the same directory as this module.
const DB_PATH = path.join(__dirname, "pipeline.db");
|
||||
|
||||
// ── Types ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
/** Connection handle type: the instance type produced by the imported `Database` constructor. */
export type Db = InstanceType<typeof Database>;
|
||||
|
||||
// ── Open ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Opens a connection to the pipeline database and applies the per-connection
 * pragmas (WAL journalling, foreign-key enforcement).
 *
 * The caller owns the returned handle and is responsible for calling
 * `close()` on it.
 *
 * @param dbPath - Database file to open. Defaults to `DB_PATH` (pipeline.db
 *   next to this module). Tests can pass a scratch file or `":memory:"`.
 * @returns The open connection.
 * @throws Whatever better-sqlite3 throws when the file cannot be opened or a
 *   pragma fails; in the pragma case the connection is closed before rethrowing.
 */
export function openDb(dbPath: string = DB_PATH): Db {
  const db = new Database(dbPath);

  try {
    db.pragma("journal_mode = WAL");
    // SQLite leaves foreign-key enforcement off unless it is enabled per connection.
    db.pragma("foreign_keys = ON");
  } catch (err) {
    // Do not hand a half-configured connection back, and do not leak the handle.
    db.close();
    throw err;
  }

  return db;
}
|
||||
39
data-pipeline/db/init.ts
Normal file
39
data-pipeline/db/init.ts
Normal file
|
|
@ -0,0 +1,39 @@
|
|||
import fs from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import { fileURLToPath } from "node:url";
|
||||
import Database from "better-sqlite3";
|
||||
|
||||
// ── Paths ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
// ES modules do not provide __dirname, so derive this module's directory from import.meta.url.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
||||
|
||||
/** Files this script works with; both live in the same directory as this module. */
const PATHS = {
  // SQL text that initDb() reads and executes.
  schema: path.join(__dirname, "schema.sql"),
  // SQLite database file that initDb() opens.
  db: path.join(__dirname, "pipeline.db"),
};
|
||||
|
||||
// ── Init ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Initialises pipeline.db by executing the contents of schema.sql against it.
 *
 * Reads the schema file, opens the database at `PATHS.db`, enables WAL
 * journalling and foreign-key enforcement, runs the schema, and closes the
 * connection. The connection is closed even when a pragma or the schema
 * fails, so a failed run does not leave the file handle open.
 *
 * @returns Resolves once the schema has been applied and the connection closed.
 * @throws If schema.sql cannot be read, the database cannot be opened, or any
 *   pragma / schema statement fails. The success message is not logged then.
 */
export async function initDb(): Promise<void> {
  const schema = await fs.readFile(PATHS.schema, "utf-8");
  const db = new Database(PATHS.db);

  try {
    db.pragma("journal_mode = WAL");
    db.pragma("foreign_keys = ON");
    db.exec(schema);
  } finally {
    // Always release the handle, including on the error path.
    db.close();
  }

  console.log(` pipeline.db initialised → ${PATHS.db}`);
}
|
||||
|
||||
// ── Main ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
/** Script entry point: prints a start banner, then runs initDb(). */
async function main(): Promise<void> {
  const banner = "Initialising pipeline.db...";
  console.log(banner);

  await initDb();
}
|
||||
|
||||
// Kick off the script. Any rejection from main() is printed and turned into a
// non-zero exit code.
// NOTE(review): this call sits at module top level, so it also runs when the
// module is imported only to reuse initDb() — confirm that is intended.
main().catch((error: unknown) => {
  console.error(error);
  process.exit(1);
});
|
||||
BIN
data-pipeline/db/pipeline.db
Normal file
BIN
data-pipeline/db/pipeline.db
Normal file
Binary file not shown.
157
data-pipeline/db/schema.sql
Normal file
157
data-pipeline/db/schema.sql
Normal file
|
|
@ -0,0 +1,157 @@
|
|||
-- ── Base data ─────────────────────────────────────────────────────────────────
|
||||
-- Imported from stage 2 JSON on first run. Never mutated after import.
|
||||
|
||||
-- One row per synset. source_id is the key the synset-level tables reference.
-- pos is required (presumably the part of speech — confirm against the importer).
CREATE TABLE IF NOT EXISTS synsets (
  source_id TEXT,
  pos       TEXT NOT NULL,
  PRIMARY KEY (source_id)
);
|
||||
|
||||
-- A word attached to a synset in a given language.
-- The UNIQUE constraint rejects a repeated (source_id, language, word) triple.
CREATE TABLE IF NOT EXISTS translations (
  id        INTEGER PRIMARY KEY,
  source_id TEXT    NOT NULL,
  language  TEXT    NOT NULL,
  word      TEXT    NOT NULL,
  UNIQUE (source_id, language, word),
  FOREIGN KEY (source_id) REFERENCES synsets (source_id)
);
|
||||
|
||||
-- Gloss text for a synset in a given language.
-- No uniqueness constraint: the schema permits several glosses per synset and language.
CREATE TABLE IF NOT EXISTS glosses (
  id        INTEGER PRIMARY KEY,
  source_id TEXT    NOT NULL,
  language  TEXT    NOT NULL,
  text      TEXT    NOT NULL,
  FOREIGN KEY (source_id) REFERENCES synsets (source_id)
);
|
||||
|
||||
-- Example text for a synset in a given language, with a required source label.
-- No uniqueness constraint: the schema permits several examples per synset and language.
CREATE TABLE IF NOT EXISTS examples (
  id        INTEGER PRIMARY KEY,
  source_id TEXT    NOT NULL,
  language  TEXT    NOT NULL,
  text      TEXT    NOT NULL,
  source    TEXT    NOT NULL,
  FOREIGN KEY (source_id) REFERENCES synsets (source_id)
);
|
||||
|
||||
-- A CEFR level recorded against a translation.
-- translation_id is UNIQUE, so each translation has at most one row here.
CREATE TABLE IF NOT EXISTS cefr_source_votes (
  id             INTEGER PRIMARY KEY,
  translation_id INTEGER NOT NULL UNIQUE,
  cefr_level     TEXT    NOT NULL,
  FOREIGN KEY (translation_id) REFERENCES translations (id)
);
|
||||
|
||||
-- ── Status tracking ───────────────────────────────────────────────────────────
|
||||
-- One row per synset per model per stage. Drives resumability.
|
||||
-- stage: round1 | round2 | tiebreak
|
||||
-- status: pending | complete | needs_review | flagged
|
||||
|
||||
-- Progress record keyed by (source_id, model_name, stage); the UNIQUE constraint
-- allows only one row per combination.
-- created_at and updated_at both default to datetime('now') at insert time.
-- This schema defines no trigger that refreshes updated_at, so a writer that
-- updates a row has to set updated_at itself.
CREATE TABLE IF NOT EXISTS run_status (
  id         INTEGER PRIMARY KEY,
  source_id  TEXT    NOT NULL,
  model_name TEXT    NOT NULL,
  stage      TEXT    NOT NULL,
  status     TEXT    NOT NULL,
  created_at TEXT    NOT NULL DEFAULT (datetime('now')),
  updated_at TEXT    NOT NULL DEFAULT (datetime('now')),
  UNIQUE (source_id, model_name, stage),
  FOREIGN KEY (source_id) REFERENCES synsets (source_id)
);
|
||||
|
||||
-- ── Round 1 output ────────────────────────────────────────────────────────────
|
||||
-- One row per translation/language per model. Written atomically per record.
|
||||
-- Unique constraints enforce one model one vote.
|
||||
|
||||
-- CEFR level a model assigned to a translation.
-- UNIQUE (translation_id, model_name): one vote per model per translation.
CREATE TABLE IF NOT EXISTS model_cefr_votes (
  id             INTEGER PRIMARY KEY,
  translation_id INTEGER NOT NULL,
  model_name     TEXT    NOT NULL,
  cefr_level     TEXT    NOT NULL,
  UNIQUE (translation_id, model_name),
  FOREIGN KEY (translation_id) REFERENCES translations (id)
);
|
||||
|
||||
-- Gloss text produced by a model for a synset in a given language.
-- UNIQUE (source_id, model_name, language): one gloss per model per synset and language.
CREATE TABLE IF NOT EXISTS generated_glosses (
  id         INTEGER PRIMARY KEY,
  source_id  TEXT    NOT NULL,
  model_name TEXT    NOT NULL,
  language   TEXT    NOT NULL,
  text       TEXT    NOT NULL,
  UNIQUE (source_id, model_name, language),
  FOREIGN KEY (source_id) REFERENCES synsets (source_id)
);
|
||||
|
||||
-- Example text produced by a model for a synset in a given language.
-- UNIQUE (source_id, model_name, language): one example per model per synset and language.
CREATE TABLE IF NOT EXISTS generated_examples (
  id         INTEGER PRIMARY KEY,
  source_id  TEXT    NOT NULL,
  model_name TEXT    NOT NULL,
  language   TEXT    NOT NULL,
  text       TEXT    NOT NULL,
  UNIQUE (source_id, model_name, language),
  FOREIGN KEY (source_id) REFERENCES synsets (source_id)
);
|
||||
|
||||
-- Description text produced by a model for a synset in a given language.
-- UNIQUE (source_id, model_name, language): one description per model per synset and language.
CREATE TABLE IF NOT EXISTS generated_descriptions (
  id         INTEGER PRIMARY KEY,
  source_id  TEXT    NOT NULL,
  model_name TEXT    NOT NULL,
  language   TEXT    NOT NULL,
  text       TEXT    NOT NULL,
  UNIQUE (source_id, model_name, language),
  FOREIGN KEY (source_id) REFERENCES synsets (source_id)
);
|
||||
|
||||
-- ── Round 2 output ────────────────────────────────────────────────────────────
|
||||
-- Each row represents one model voting for one candidate.
|
||||
-- The candidate with the most votes wins in merge.
|
||||
|
||||
-- A vote by model_name for one row of generated_glosses.
-- UNIQUE (gloss_id, model_name) stops a model voting twice for the same candidate.
-- NOTE(review): nothing here stops a model voting for several candidates of the
-- same synset; confirm the writer enforces that if it is required.
CREATE TABLE IF NOT EXISTS gloss_candidate_votes (
  id         INTEGER PRIMARY KEY,
  gloss_id   INTEGER NOT NULL,
  model_name TEXT    NOT NULL,
  UNIQUE (gloss_id, model_name),
  FOREIGN KEY (gloss_id) REFERENCES generated_glosses (id)
);
|
||||
|
||||
-- A vote by model_name for one row of generated_examples.
-- UNIQUE (example_id, model_name) stops a model voting twice for the same candidate.
-- NOTE(review): nothing here stops a model voting for several candidates of the
-- same synset; confirm the writer enforces that if it is required.
CREATE TABLE IF NOT EXISTS example_candidate_votes (
  id         INTEGER PRIMARY KEY,
  example_id INTEGER NOT NULL,
  model_name TEXT    NOT NULL,
  UNIQUE (example_id, model_name),
  FOREIGN KEY (example_id) REFERENCES generated_examples (id)
);
|
||||
|
||||
-- A vote by model_name for one row of generated_descriptions.
-- UNIQUE (description_id, model_name) stops a model voting twice for the same candidate.
-- NOTE(review): nothing here stops a model voting for several candidates of the
-- same synset; confirm the writer enforces that if it is required.
CREATE TABLE IF NOT EXISTS description_candidate_votes (
  id             INTEGER PRIMARY KEY,
  description_id INTEGER NOT NULL,
  model_name     TEXT    NOT NULL,
  UNIQUE (description_id, model_name),
  FOREIGN KEY (description_id) REFERENCES generated_descriptions (id)
);
|
||||
|
||||
-- ── Resolved output ───────────────────────────────────────────────────────────
|
||||
-- Written by merge. Never updated after writing.
|
||||
-- Only fully resolved records are written here — no nulls, no flags.
|
||||
-- Absence of a row means unresolved. Flagged status tracked in run_status.
|
||||
-- source: omw | cefr | model_name
|
||||
|
||||
-- Final CEFR level and difficulty for a translation; both are required.
-- translation_id is UNIQUE, so each translation resolves to at most one row.
CREATE TABLE IF NOT EXISTS resolved_translations (
  id             INTEGER PRIMARY KEY,
  translation_id INTEGER NOT NULL UNIQUE,
  cefr_level     TEXT    NOT NULL,
  difficulty     TEXT    NOT NULL,
  FOREIGN KEY (translation_id) REFERENCES translations (id)
);
|
||||
|
||||
-- Final gloss for a synset in a given language, with the origin recorded in source.
-- UNIQUE (source_id, language): at most one resolved gloss per synset and language.
CREATE TABLE IF NOT EXISTS resolved_glosses (
  id        INTEGER PRIMARY KEY,
  source_id TEXT    NOT NULL,
  language  TEXT    NOT NULL,
  text      TEXT    NOT NULL,
  source    TEXT    NOT NULL,
  UNIQUE (source_id, language),
  FOREIGN KEY (source_id) REFERENCES synsets (source_id)
);
|
||||
|
||||
-- Final example text for a synset in a given language, with the origin recorded in source.
-- Unlike resolved_glosses and resolved_descriptions, this table has no
-- UNIQUE (source_id, language), so several rows per synset and language are accepted.
-- NOTE(review): presumably deliberate (multiple examples per synset) — confirm.
CREATE TABLE IF NOT EXISTS resolved_examples (
  id        INTEGER PRIMARY KEY,
  source_id TEXT    NOT NULL,
  language  TEXT    NOT NULL,
  text      TEXT    NOT NULL,
  source    TEXT    NOT NULL,
  FOREIGN KEY (source_id) REFERENCES synsets (source_id)
);
|
||||
|
||||
-- Final description for a synset in a given language, with the origin recorded in source.
-- UNIQUE (source_id, language): at most one resolved description per synset and language.
CREATE TABLE IF NOT EXISTS resolved_descriptions (
  id        INTEGER PRIMARY KEY,
  source_id TEXT    NOT NULL,
  language  TEXT    NOT NULL,
  text      TEXT    NOT NULL,
  source    TEXT    NOT NULL,
  UNIQUE (source_id, language),
  FOREIGN KEY (source_id) REFERENCES synsets (source_id)
);
|
||||
Loading…
Add table
Add a link
Reference in a new issue