feat: add db schema, init, and vitest config

This commit is contained in:
lila 2026-05-03 17:56:29 +02:00
parent 74cfc82bdd
commit 4fa3073412
13 changed files with 248 additions and 8 deletions

24
data-pipeline/db/index.ts Normal file
View file

@ -0,0 +1,24 @@
import path from "node:path";
import { fileURLToPath } from "node:url";
import Database from "better-sqlite3";
// ── Paths ─────────────────────────────────────────────────────────────────────
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const DB_PATH = path.join(__dirname, "pipeline.db");
// ── Types ─────────────────────────────────────────────────────────────────────
export type Db = InstanceType<typeof Database>;
// ── Open ──────────────────────────────────────────────────────────────────────
/**
 * Opens a connection to the pipeline database located at `DB_PATH`.
 *
 * The connection is switched to WAL journaling and has foreign-key
 * enforcement turned on before it is handed back.
 *
 * @returns The configured connection. It is not closed here, so the caller
 *   is responsible for calling `close()` when finished.
 */
export function openDb(): Db {
  const connection = new Database(DB_PATH);
  // Applied in this order on every new connection.
  const startupPragmas = ["journal_mode = WAL", "foreign_keys = ON"];
  for (const pragma of startupPragmas) {
    connection.pragma(pragma);
  }
  return connection;
}

39
data-pipeline/db/init.ts Normal file
View file

@ -0,0 +1,39 @@
import fs from "node:fs/promises";
import path from "node:path";
import { fileURLToPath } from "node:url";
import Database from "better-sqlite3";
// ── Paths ─────────────────────────────────────────────────────────────────────
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const PATHS = {
schema: path.join(__dirname, "schema.sql"),
db: path.join(__dirname, "pipeline.db"),
};
// ── Init ──────────────────────────────────────────────────────────────────────
/**
 * Applies the SQL script at `PATHS.schema` to the database at `PATHS.db`.
 *
 * The schema file is read first, so a missing or unreadable schema fails
 * before any database connection is opened. The connection is always
 * closed, including when a pragma or the schema script throws.
 *
 * @returns A promise that resolves once the schema has been executed and
 *   the connection has been closed.
 * @throws Rejects with the underlying error if the schema file cannot be
 *   read or if the database rejects a pragma or schema statement.
 */
export async function initDb(): Promise<void> {
  const schema = await fs.readFile(PATHS.schema, "utf-8");
  const db = new Database(PATHS.db);
  try {
    db.pragma("journal_mode = WAL");
    db.pragma("foreign_keys = ON");
    db.exec(schema);
  } finally {
    // Release the connection even on failure so the handle is not leaked.
    db.close();
  }
  console.log(` pipeline.db initialised → ${PATHS.db}`);
}
// ── Main ─────────────────────────────────────────────────────────────────────
/** Script entry point: announces the run, then initialises the database. */
async function main(): Promise<void> {
  console.log("Initialising pipeline.db...");
  await initDb();
}

/** Prints the failure and terminates the process with exit code 1. */
function exitWithError(reason: unknown): never {
  console.error(reason);
  process.exit(1);
}

// Runs as soon as this module is evaluated.
main().catch(exitWithError);

Binary file not shown.

157
data-pipeline/db/schema.sql Normal file
View file

@ -0,0 +1,157 @@
-- ── Base data ─────────────────────────────────────────────────────────────────
-- Imported from stage 2 JSON on first run. Never mutated after import.
-- synsets is the root table: the other tables reference it via source_id,
-- either directly or through translations / the generated_* tables.
-- NOT NULL is spelled out on source_id because SQLite accepts NULL in a
-- non-INTEGER PRIMARY KEY column of an ordinary rowid table.
CREATE TABLE IF NOT EXISTS synsets (
  source_id TEXT NOT NULL PRIMARY KEY,
  pos TEXT NOT NULL
);
-- One row per (synset, language, word); the UNIQUE constraint rejects
-- duplicate words for the same synset and language.
CREATE TABLE IF NOT EXISTS translations (
id INTEGER PRIMARY KEY,
source_id TEXT NOT NULL REFERENCES synsets(source_id),
language TEXT NOT NULL,
word TEXT NOT NULL,
UNIQUE (source_id, language, word)
);
-- Gloss text attached to a synset in a given language.
-- No uniqueness constraint: several glosses may exist per synset/language.
CREATE TABLE IF NOT EXISTS glosses (
id INTEGER PRIMARY KEY,
source_id TEXT NOT NULL REFERENCES synsets(source_id),
language TEXT NOT NULL,
text TEXT NOT NULL
);
-- Example text attached to a synset in a given language.
-- No uniqueness constraint: several examples may exist per synset/language.
-- NOTE(review): the allowed values of `source` are not defined in this
-- file — presumably the origin of the example; confirm against the importer.
CREATE TABLE IF NOT EXISTS examples (
id INTEGER PRIMARY KEY,
source_id TEXT NOT NULL REFERENCES synsets(source_id),
language TEXT NOT NULL,
text TEXT NOT NULL,
source TEXT NOT NULL
);
-- At most one CEFR level per translation (UNIQUE on translation_id).
-- NOTE(review): cefr_level is free text here; the valid level set is not
-- enforced by the schema.
CREATE TABLE IF NOT EXISTS cefr_source_votes (
id INTEGER PRIMARY KEY,
translation_id INTEGER NOT NULL REFERENCES translations(id),
cefr_level TEXT NOT NULL,
UNIQUE (translation_id)
);
-- ── Status tracking ───────────────────────────────────────────────────────────
-- One row per synset per model per stage. Drives resumability.
-- stage: round1 | round2 | tiebreak
-- status: pending | complete | needs_review | flagged
-- The CHECK constraints enforce the two value sets listed above, so a
-- misspelled stage or status fails at write time instead of producing a row
-- that no status query matches.
-- created_at / updated_at are UTC text timestamps filled in on INSERT only;
-- writers must set updated_at themselves when they UPDATE a row.
CREATE TABLE IF NOT EXISTS run_status (
  id INTEGER PRIMARY KEY,
  source_id TEXT NOT NULL REFERENCES synsets(source_id),
  model_name TEXT NOT NULL,
  stage TEXT NOT NULL CHECK (stage IN ('round1', 'round2', 'tiebreak')),
  status TEXT NOT NULL CHECK (status IN ('pending', 'complete', 'needs_review', 'flagged')),
  created_at TEXT NOT NULL DEFAULT (datetime('now')),
  updated_at TEXT NOT NULL DEFAULT (datetime('now')),
  UNIQUE (source_id, model_name, stage)
);
-- ── Round 1 output ────────────────────────────────────────────────────────────
-- One row per translation/language per model. Written atomically per record.
-- Unique constraints enforce one model one vote.
-- CEFR level proposed by a model for a translation; one row per
-- (translation, model).
CREATE TABLE IF NOT EXISTS model_cefr_votes (
id INTEGER PRIMARY KEY,
translation_id INTEGER NOT NULL REFERENCES translations(id),
model_name TEXT NOT NULL,
cefr_level TEXT NOT NULL,
UNIQUE (translation_id, model_name)
);
-- Gloss text produced by a model; one row per (synset, model, language).
-- Rows here are referenced by gloss_candidate_votes.
CREATE TABLE IF NOT EXISTS generated_glosses (
id INTEGER PRIMARY KEY,
source_id TEXT NOT NULL REFERENCES synsets(source_id),
model_name TEXT NOT NULL,
language TEXT NOT NULL,
text TEXT NOT NULL,
UNIQUE (source_id, model_name, language)
);
-- Example text produced by a model; one row per (synset, model, language).
-- Rows here are referenced by example_candidate_votes.
CREATE TABLE IF NOT EXISTS generated_examples (
id INTEGER PRIMARY KEY,
source_id TEXT NOT NULL REFERENCES synsets(source_id),
model_name TEXT NOT NULL,
language TEXT NOT NULL,
text TEXT NOT NULL,
UNIQUE (source_id, model_name, language)
);
-- Description text produced by a model; one row per (synset, model, language).
-- Rows here are referenced by description_candidate_votes.
CREATE TABLE IF NOT EXISTS generated_descriptions (
id INTEGER PRIMARY KEY,
source_id TEXT NOT NULL REFERENCES synsets(source_id),
model_name TEXT NOT NULL,
language TEXT NOT NULL,
text TEXT NOT NULL,
UNIQUE (source_id, model_name, language)
);
-- ── Round 2 output ────────────────────────────────────────────────────────────
-- Each row represents one model voting for one candidate.
-- The candidate with the most votes wins in merge.
-- The UNIQUE constraints stop a model from voting twice for the same
-- candidate row.
-- NOTE(review): nothing here stops one model from voting for several
-- different candidates of the same synset/language — confirm whether the
-- voting script guarantees a single vote per synset.
-- Votes for rows of generated_glosses.
CREATE TABLE IF NOT EXISTS gloss_candidate_votes (
id INTEGER PRIMARY KEY,
gloss_id INTEGER NOT NULL REFERENCES generated_glosses(id),
model_name TEXT NOT NULL,
UNIQUE (gloss_id, model_name)
);
-- Votes for rows of generated_examples.
CREATE TABLE IF NOT EXISTS example_candidate_votes (
id INTEGER PRIMARY KEY,
example_id INTEGER NOT NULL REFERENCES generated_examples(id),
model_name TEXT NOT NULL,
UNIQUE (example_id, model_name)
);
-- Votes for rows of generated_descriptions.
CREATE TABLE IF NOT EXISTS description_candidate_votes (
id INTEGER PRIMARY KEY,
description_id INTEGER NOT NULL REFERENCES generated_descriptions(id),
model_name TEXT NOT NULL,
UNIQUE (description_id, model_name)
);
-- ── Resolved output ───────────────────────────────────────────────────────────
-- Written by merge. Never updated after writing.
-- Only fully resolved records are written here — no nulls, no flags.
-- Absence of a row means unresolved. Flagged status tracked in run_status.
-- source: omw | cefr | model_name
-- Final CEFR level and difficulty for a translation; at most one row per
-- translation.
-- NOTE(review): the value set of `difficulty` is not defined in this file.
CREATE TABLE IF NOT EXISTS resolved_translations (
id INTEGER PRIMARY KEY,
translation_id INTEGER NOT NULL REFERENCES translations(id),
cefr_level TEXT NOT NULL,
difficulty TEXT NOT NULL,
UNIQUE (translation_id)
);
-- Final gloss; at most one row per (synset, language).
CREATE TABLE IF NOT EXISTS resolved_glosses (
id INTEGER PRIMARY KEY,
source_id TEXT NOT NULL REFERENCES synsets(source_id),
language TEXT NOT NULL,
text TEXT NOT NULL,
source TEXT NOT NULL,
UNIQUE (source_id, language)
);
-- Final examples for a synset and language.
-- NOTE(review): unlike resolved_glosses and resolved_descriptions this table
-- has no UNIQUE (source_id, language), so several rows per synset/language
-- are accepted — confirm that multiple resolved examples are intended.
CREATE TABLE IF NOT EXISTS resolved_examples (
id INTEGER PRIMARY KEY,
source_id TEXT NOT NULL REFERENCES synsets(source_id),
language TEXT NOT NULL,
text TEXT NOT NULL,
source TEXT NOT NULL
);
-- Final description; at most one row per (synset, language).
CREATE TABLE IF NOT EXISTS resolved_descriptions (
id INTEGER PRIMARY KEY,
source_id TEXT NOT NULL REFERENCES synsets(source_id),
language TEXT NOT NULL,
text TEXT NOT NULL,
source TEXT NOT NULL,
UNIQUE (source_id, language)
);

View file

@ -3,7 +3,11 @@
"version": "1.0.0", "version": "1.0.0",
"private": true, "private": true,
"type": "module", "type": "module",
"scripts": {}, "scripts": {
"db:init": "tsx db/init.ts",
"test": "vitest run",
"test:watch": "vitest"
},
"dependencies": { "dependencies": {
"@lila/shared": "workspace:*", "@lila/shared": "workspace:*",
"better-sqlite3": "^12.9.0" "better-sqlite3": "^12.9.0"
@ -12,6 +16,7 @@
"@types/better-sqlite3": "^7.6.13", "@types/better-sqlite3": "^7.6.13",
"@types/node": "^24.12.0", "@types/node": "^24.12.0",
"tsx": "^4.21.0", "tsx": "^4.21.0",
"typescript": "^5.9.3" "typescript": "^5.9.3",
"vitest": "^4.1.0"
} }
} }

View file

@ -154,7 +154,7 @@ async function loadAnnotated(): Promise<AnnotatedRecord[]> {
for (const [l, examples] of Object.entries(record.examples)) { for (const [l, examples] of Object.entries(record.examples)) {
const lang = l as SupportedLanguageCode; const lang = l as SupportedLanguageCode;
if (!base.examples[lang]) { if (!base.examples[lang]) {
base.examples[lang] = examples as Example[]; base.examples[lang] = examples;
} }
} }
} }

View file

@ -8,5 +8,5 @@
"types": ["node"] "types": ["node"]
}, },
"references": [{ "path": "../packages/shared" }], "references": [{ "path": "../packages/shared" }],
"include": ["./**/*"] "include": ["./**/*", "vitest.config.ts"]
} }

View file

@ -0,0 +1,10 @@
import { defineConfig } from "vitest/config";
// Vitest configuration for the data-pipeline package.
export default defineConfig({
test: {
// Tests run in a plain Node environment (no DOM emulation).
environment: "node",
// Makes the test API (describe / it / expect) available without imports.
// NOTE(review): for these globals to type-check, tsconfig "types" usually
// needs "vitest/globals"; the tsconfig in this commit lists only "node" —
// confirm.
globals: true,
// Only files under tests/ whose names end in .test.ts are collected.
include: ["tests/**/*.test.ts"],
// Build output and installed packages are never scanned for tests.
exclude: ["**/dist/**", "**/node_modules/**"],
},
});

View file

@ -528,6 +528,7 @@ llama.cpp is not installed.
**Next action:** Write the round 1 generation script. **Next action:** Write the round 1 generation script.
- [ ] Write tests for stage 3
- [ ] Write round 1 script (generation) - [ ] Write round 1 script (generation)
- [ ] Write compile-candidates script - [ ] Write compile-candidates script
- [ ] Write round 2 script (voting) - [ ] Write round 2 script (voting)
@ -542,24 +543,28 @@ llama.cpp is not installed.
### Stage 4 — Merge `🔲 not started` ### Stage 4 — Merge `🔲 not started`
- [ ] Write tests for stage 4
- [ ] Write merge script - [ ] Write merge script
- [ ] Run merge → `pipeline.db` - [ ] Run merge → `pipeline.db`
- [ ] Confirm tiebreaker resolves all flagged translations - [ ] Confirm tiebreaker resolves all flagged translations
### Stage 4b — Tiebreak `🔲 not started` ### Stage 4b — Tiebreak `🔲 not started`
- [ ] Write tests for stage 4b
- [ ] Write tiebreak logic - [ ] Write tiebreak logic
- [ ] Run tiebreaker for all flagged translations - [ ] Run tiebreaker for all flagged translations
- [ ] Confirm no flagged translations remain before seeding - [ ] Confirm no flagged translations remain before seeding
### Stage 5 — Compare / QA `🔲 not started` ### Stage 5 — Compare / QA `🔲 not started`
- [ ] Write tests for stage 5
- [ ] Write compare script - [ ] Write compare script
- [ ] Run compare → `COVERAGE.md` - [ ] Run compare → `COVERAGE.md`
- [ ] Review output quality before seeding - [ ] Review output quality before seeding
### Stage 6 — Sync `🔲 not started` ### Stage 6 — Sync `🔲 not started`
- [ ] Write tests for stage 6
- [ ] Write sync script - [ ] Write sync script
- [ ] Configure `DATABASE_URL` in `.env` - [ ] Configure `DATABASE_URL` in `.env`
- [ ] Run sync → production PostgreSQL - [ ] Run sync → production PostgreSQL

View file

@ -12,7 +12,6 @@ export default defineConfig([
"node_modules/", "node_modules/",
"routeTree.gen.ts", "routeTree.gen.ts",
"scripts/**", "scripts/**",
"data-pipeline/**/*",
]), ]),
eslint.configs.recommended, eslint.configs.recommended,

View file

@ -23,7 +23,7 @@
"prettier --write" "prettier --write"
] ]
}, },
"packageManager": "pnpm@10.33.1", "packageManager": "pnpm@10.33.2",
"devDependencies": { "devDependencies": {
"@eslint/js": "^10.0.1", "@eslint/js": "^10.0.1",
"@tanstack/eslint-plugin-router": "^1.161.6", "@tanstack/eslint-plugin-router": "^1.161.6",

5
pnpm-lock.yaml generated
View file

@ -173,6 +173,9 @@ importers:
typescript: typescript:
specifier: ^5.9.3 specifier: ^5.9.3
version: 5.9.3 version: 5.9.3
vitest:
specifier: ^4.1.0
version: 4.1.0(@opentelemetry/api@1.9.1)(@types/node@24.12.0)(jsdom@29.0.1(@noble/hashes@2.2.0))(vite@8.0.1(@types/node@24.12.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.3))
packages/db: packages/db:
dependencies: dependencies:
@ -4391,7 +4394,6 @@ snapshots:
magic-string: 0.30.21 magic-string: 0.30.21
optionalDependencies: optionalDependencies:
vite: 8.0.1(@types/node@24.12.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.3) vite: 8.0.1(@types/node@24.12.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.3)
optional: true
'@vitest/mocker@4.1.0(vite@8.0.1(@types/node@25.5.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.3))': '@vitest/mocker@4.1.0(vite@8.0.1(@types/node@25.5.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.3))':
dependencies: dependencies:
@ -6136,7 +6138,6 @@ snapshots:
jsdom: 29.0.1(@noble/hashes@2.2.0) jsdom: 29.0.1(@noble/hashes@2.2.0)
transitivePeerDependencies: transitivePeerDependencies:
- msw - msw
optional: true
vitest@4.1.0(@opentelemetry/api@1.9.1)(@types/node@25.5.0)(jsdom@29.0.1(@noble/hashes@2.2.0))(vite@8.0.1(@types/node@25.5.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.3)): vitest@4.1.0(@opentelemetry/api@1.9.1)(@types/node@25.5.0)(jsdom@29.0.1(@noble/hashes@2.2.0))(vite@8.0.1(@types/node@25.5.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.3)):
dependencies: dependencies: