diff --git a/data-pipeline/db/index.ts b/data-pipeline/db/index.ts new file mode 100644 index 0000000..f0ce57d --- /dev/null +++ b/data-pipeline/db/index.ts @@ -0,0 +1,29 @@ +import path from "node:path"; +import { fileURLToPath } from "node:url"; +import Database from "better-sqlite3"; + +// ── Paths ───────────────────────────────────────────────────────────────────── + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); + +const DB_PATH = path.join(__dirname, "pipeline.db"); + +// ── Types ───────────────────────────────────────────────────────────────────── + +/** Handle type produced by constructing the better-sqlite3 `Database` class. */ +export type Db = InstanceType<typeof Database>; + +// ── Open ────────────────────────────────────────────────────────────────────── + +/** + * Opens the SQLite file at DB_PATH with WAL journaling and foreign-key enforcement + * switched on. The handle is returned open; closing it is left to the caller. + */ +export function openDb(): Db { + const db = new Database(DB_PATH); + + db.pragma("journal_mode = WAL"); + db.pragma("foreign_keys = ON"); + + return db; +} diff --git a/data-pipeline/db/init.ts b/data-pipeline/db/init.ts new file mode 100644 index 0000000..f85d213 --- /dev/null +++ b/data-pipeline/db/init.ts @@ -0,0 +1,48 @@ +import fs from "node:fs/promises"; +import path from "node:path"; +import { fileURLToPath } from "node:url"; +import Database from "better-sqlite3"; + +// ── Paths ───────────────────────────────────────────────────────────────────── + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); + +const PATHS = { + schema: path.join(__dirname, "schema.sql"), + db: path.join(__dirname, "pipeline.db"), +}; + +// ── Init ────────────────────────────────────────────────────────────────────── + +/** + * Reads schema.sql and executes it against pipeline.db, then logs the database path. + * The connection is closed even when a pragma or the schema script throws. + */ +export async function initDb(): Promise<void> { + const schema = await fs.readFile(PATHS.schema, "utf-8"); + const db = new Database(PATHS.db); + + try { + db.pragma("journal_mode = WAL"); + db.pragma("foreign_keys = ON"); + db.exec(schema); + } finally { + // Release the handle on failure too, so a broken schema does not leak the connection. + db.close(); + } + + console.log(` pipeline.db initialised → ${PATHS.db}`); +} + +// ── Main ───────────────────────────────────────────────────────────────────── + +/** Script entry point: logs a banner, then runs initDb. */ +async function main(): Promise<void> { + console.log("Initialising 
pipeline.db..."); + await initDb(); +} + +main().catch((err) => { /* NOTE(review): called at module top level, so importing initDb from this file also runs main() — confirm that is intended. */ + console.error(err); + process.exit(1); +}); diff --git a/data-pipeline/db/pipeline.db b/data-pipeline/db/pipeline.db new file mode 100644 index 0000000..f8dd7fc Binary files /dev/null and b/data-pipeline/db/pipeline.db differ diff --git a/data-pipeline/db/schema.sql b/data-pipeline/db/schema.sql new file mode 100644 index 0000000..7441bb1 --- /dev/null +++ b/data-pipeline/db/schema.sql @@ -0,0 +1,157 @@ +-- ── Base data ───────────────────────────────────────────────────────────────── +-- Imported from stage 2 JSON on first run. Never mutated after import. NOTE(review): glosses and examples declare no UNIQUE constraint (translations and cefr_source_votes do), so a repeated import could duplicate their rows — confirm the importer guards against re-runs. + +CREATE TABLE IF NOT EXISTS synsets ( + source_id TEXT PRIMARY KEY, + pos TEXT NOT NULL +); + +CREATE TABLE IF NOT EXISTS translations ( + id INTEGER PRIMARY KEY, + source_id TEXT NOT NULL REFERENCES synsets(source_id), + language TEXT NOT NULL, + word TEXT NOT NULL, + UNIQUE (source_id, language, word) +); + +CREATE TABLE IF NOT EXISTS glosses ( + id INTEGER PRIMARY KEY, + source_id TEXT NOT NULL REFERENCES synsets(source_id), + language TEXT NOT NULL, + text TEXT NOT NULL +); + +CREATE TABLE IF NOT EXISTS examples ( + id INTEGER PRIMARY KEY, + source_id TEXT NOT NULL REFERENCES synsets(source_id), + language TEXT NOT NULL, + text TEXT NOT NULL, + source TEXT NOT NULL +); + +CREATE TABLE IF NOT EXISTS cefr_source_votes ( + id INTEGER PRIMARY KEY, + translation_id INTEGER NOT NULL REFERENCES translations(id), + cefr_level TEXT NOT NULL, + UNIQUE (translation_id) +); + +-- ── Status tracking ─────────────────────────────────────────────────────────── +-- One row per synset per model per stage. Drives resumability. 
+-- stage: round1 | round2 | tiebreak +-- status: pending | complete | needs_review | flagged. NOTE(review): no CHECK constraint enforces either value list, and updated_at only receives its DEFAULT at insert — nothing in the visible schema refreshes it, so writers presumably set it themselves; confirm. + +CREATE TABLE IF NOT EXISTS run_status ( + id INTEGER PRIMARY KEY, + source_id TEXT NOT NULL REFERENCES synsets(source_id), + model_name TEXT NOT NULL, + stage TEXT NOT NULL, + status TEXT NOT NULL, + created_at TEXT NOT NULL DEFAULT (datetime('now')), + updated_at TEXT NOT NULL DEFAULT (datetime('now')), + UNIQUE (source_id, model_name, stage) +); + +-- ── Round 1 output ──────────────────────────────────────────────────────────── +-- One row per translation/language per model. Written atomically per record. +-- Unique constraints enforce one model one vote. + +CREATE TABLE IF NOT EXISTS model_cefr_votes ( + id INTEGER PRIMARY KEY, + translation_id INTEGER NOT NULL REFERENCES translations(id), + model_name TEXT NOT NULL, + cefr_level TEXT NOT NULL, + UNIQUE (translation_id, model_name) +); + +CREATE TABLE IF NOT EXISTS generated_glosses ( + id INTEGER PRIMARY KEY, + source_id TEXT NOT NULL REFERENCES synsets(source_id), + model_name TEXT NOT NULL, + language TEXT NOT NULL, + text TEXT NOT NULL, + UNIQUE (source_id, model_name, language) +); + +CREATE TABLE IF NOT EXISTS generated_examples ( + id INTEGER PRIMARY KEY, + source_id TEXT NOT NULL REFERENCES synsets(source_id), + model_name TEXT NOT NULL, + language TEXT NOT NULL, + text TEXT NOT NULL, + UNIQUE (source_id, model_name, language) +); + +CREATE TABLE IF NOT EXISTS generated_descriptions ( + id INTEGER PRIMARY KEY, + source_id TEXT NOT NULL REFERENCES synsets(source_id), + model_name TEXT NOT NULL, + language TEXT NOT NULL, + text TEXT NOT NULL, + UNIQUE (source_id, model_name, language) +); + +-- ── Round 2 output ──────────────────────────────────────────────────────────── +-- Each row represents one model voting for one candidate. +-- The candidate with the most votes wins in merge. NOTE(review): the vote tables are UNIQUE on (candidate id, model_name), which blocks a repeated vote for one candidate but not one model voting for several candidates of the same synset/language — confirm the voting script prevents that. 
+ +CREATE TABLE IF NOT EXISTS gloss_candidate_votes ( + id INTEGER PRIMARY KEY, + gloss_id INTEGER NOT NULL REFERENCES generated_glosses(id), + model_name TEXT NOT NULL, + UNIQUE (gloss_id, model_name) +); + +CREATE TABLE IF NOT EXISTS example_candidate_votes ( + id INTEGER PRIMARY KEY, + example_id INTEGER NOT NULL REFERENCES generated_examples(id), + model_name TEXT NOT NULL, + UNIQUE (example_id, model_name) +); + +CREATE TABLE IF NOT EXISTS description_candidate_votes ( + id INTEGER PRIMARY KEY, + description_id INTEGER NOT NULL REFERENCES generated_descriptions(id), + model_name TEXT NOT NULL, + UNIQUE (description_id, model_name) +); + +-- ── Resolved output ─────────────────────────────────────────────────────────── +-- Written by merge. Never updated after writing. +-- Only fully resolved records are written here — no nulls, no flags. +-- Absence of a row means unresolved. Flagged status tracked in run_status. +-- source: omw | cefr | model_name. NOTE(review): resolved_examples is the only table in this section without a UNIQUE constraint, so re-running merge could insert duplicate rows — confirm whether several examples per synset/language are intended. + +CREATE TABLE IF NOT EXISTS resolved_translations ( + id INTEGER PRIMARY KEY, + translation_id INTEGER NOT NULL REFERENCES translations(id), + cefr_level TEXT NOT NULL, + difficulty TEXT NOT NULL, + UNIQUE (translation_id) +); + +CREATE TABLE IF NOT EXISTS resolved_glosses ( + id INTEGER PRIMARY KEY, + source_id TEXT NOT NULL REFERENCES synsets(source_id), + language TEXT NOT NULL, + text TEXT NOT NULL, + source TEXT NOT NULL, + UNIQUE (source_id, language) +); + +CREATE TABLE IF NOT EXISTS resolved_examples ( + id INTEGER PRIMARY KEY, + source_id TEXT NOT NULL REFERENCES synsets(source_id), + language TEXT NOT NULL, + text TEXT NOT NULL, + source TEXT NOT NULL +); + +CREATE TABLE IF NOT EXISTS resolved_descriptions ( + id INTEGER PRIMARY KEY, + source_id TEXT NOT NULL REFERENCES synsets(source_id), + language TEXT NOT NULL, + text TEXT NOT NULL, + source TEXT NOT NULL, + UNIQUE (source_id, language) +); diff --git a/data-pipeline/package.json b/data-pipeline/package.json index 1d27a98..b985f86 100644 --- 
a/data-pipeline/package.json +++ b/data-pipeline/package.json @@ -3,7 +3,11 @@ "version": "1.0.0", "private": true, "type": "module", - "scripts": {}, + "scripts": { + "db:init": "tsx db/init.ts", + "test": "vitest run", + "test:watch": "vitest" + }, "dependencies": { "@lila/shared": "workspace:*", "better-sqlite3": "^12.9.0" @@ -12,6 +16,7 @@ "@types/better-sqlite3": "^7.6.13", "@types/node": "^24.12.0", "tsx": "^4.21.0", - "typescript": "^5.9.3" + "typescript": "^5.9.3", + "vitest": "^4.1.0" } } diff --git a/data-pipeline/test/output/sample.json b/data-pipeline/sample/output/sample.json similarity index 100% rename from data-pipeline/test/output/sample.json rename to data-pipeline/sample/output/sample.json diff --git a/data-pipeline/test/scripts/sample.ts b/data-pipeline/sample/scripts/sample.ts similarity index 99% rename from data-pipeline/test/scripts/sample.ts rename to data-pipeline/sample/scripts/sample.ts index 63ead71..9aece55 100644 --- a/data-pipeline/test/scripts/sample.ts +++ b/data-pipeline/sample/scripts/sample.ts @@ -154,7 +154,7 @@ async function loadAnnotated(): Promise { for (const [l, examples] of Object.entries(record.examples)) { const lang = l as SupportedLanguageCode; if (!base.examples[lang]) { - base.examples[lang] = examples as Example[]; + base.examples[lang] = examples; } } } diff --git a/data-pipeline/tsconfig.json b/data-pipeline/tsconfig.json index 7752b6c..5273064 100644 --- a/data-pipeline/tsconfig.json +++ b/data-pipeline/tsconfig.json @@ -8,5 +8,5 @@ "types": ["node"] }, "references": [{ "path": "../packages/shared" }], - "include": ["./**/*"] + "include": ["./**/*", "vitest.config.ts"] } diff --git a/data-pipeline/vitest.config.ts b/data-pipeline/vitest.config.ts new file mode 100644 index 0000000..bdc68ba --- /dev/null +++ b/data-pipeline/vitest.config.ts @@ -0,0 +1,10 @@ +import { defineConfig } from "vitest/config"; + +export default defineConfig({ + test: { + environment: "node", + globals: true, + include: 
["tests/**/*.test.ts"], + exclude: ["**/dist/**", "**/node_modules/**"], + }, +}); diff --git a/documentation/data-pipeline.md b/documentation/data-pipeline.md index bdb5971..543bca7 100644 --- a/documentation/data-pipeline.md +++ b/documentation/data-pipeline.md @@ -528,6 +528,7 @@ llama.cpp is not installed. **Next action:** Write the round 1 generation script. +- [ ] Write tests for stage 3 - [ ] Write round 1 script (generation) - [ ] Write compile-candidates script - [ ] Write round 2 script (voting) @@ -542,24 +543,28 @@ llama.cpp is not installed. ### Stage 4 — Merge `🔲 not started` +- [ ] Write tests for stage 4 - [ ] Write merge script - [ ] Run merge → `pipeline.db` - [ ] Confirm tiebreaker resolves all flagged translations ### Stage 4b — Tiebreak `🔲 not started` +- [ ] Write tests for stage 4b - [ ] Write tiebreak logic - [ ] Run tiebreaker for all flagged translations - [ ] Confirm no flagged translations remain before seeding ### Stage 5 — Compare / QA `🔲 not started` +- [ ] Write tests for stage 5 - [ ] Write compare script - [ ] Run compare → `COVERAGE.md` - [ ] Review output quality before seeding ### Stage 6 — Sync `🔲 not started` +- [ ] Write tests for stage 6 - [ ] Write sync script - [ ] Configure `DATABASE_URL` in `.env` - [ ] Run sync → production PostgreSQL diff --git a/eslint.config.mjs b/eslint.config.mjs index a88b6f1..386ffe9 100644 --- a/eslint.config.mjs +++ b/eslint.config.mjs @@ -12,7 +12,6 @@ export default defineConfig([ "node_modules/", "routeTree.gen.ts", "scripts/**", - "data-pipeline/**/*", ]), eslint.configs.recommended, diff --git a/package.json b/package.json index 6a30e1d..3766166 100644 --- a/package.json +++ b/package.json @@ -23,7 +23,7 @@ "prettier --write" ] }, - "packageManager": "pnpm@10.33.1", + "packageManager": "pnpm@10.33.2", "devDependencies": { "@eslint/js": "^10.0.1", "@tanstack/eslint-plugin-router": "^1.161.6", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 4453586..6d416f3 100644 --- a/pnpm-lock.yaml +++ 
b/pnpm-lock.yaml @@ -173,6 +173,9 @@ importers: typescript: specifier: ^5.9.3 version: 5.9.3 + vitest: + specifier: ^4.1.0 + version: 4.1.0(@opentelemetry/api@1.9.1)(@types/node@24.12.0)(jsdom@29.0.1(@noble/hashes@2.2.0))(vite@8.0.1(@types/node@24.12.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.3)) packages/db: dependencies: @@ -4391,7 +4394,6 @@ snapshots: magic-string: 0.30.21 optionalDependencies: vite: 8.0.1(@types/node@24.12.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.3) - optional: true '@vitest/mocker@4.1.0(vite@8.0.1(@types/node@25.5.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.3))': dependencies: @@ -6136,7 +6138,6 @@ snapshots: jsdom: 29.0.1(@noble/hashes@2.2.0) transitivePeerDependencies: - msw - optional: true vitest@4.1.0(@opentelemetry/api@1.9.1)(@types/node@25.5.0)(jsdom@29.0.1(@noble/hashes@2.2.0))(vite@8.0.1(@types/node@25.5.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.3)): dependencies: