feat: add db schema, init, and vitest config
This commit is contained in:
parent
74cfc82bdd
commit
4fa3073412
13 changed files with 248 additions and 8 deletions
24
data-pipeline/db/index.ts
Normal file
24
data-pipeline/db/index.ts
Normal file
|
|
@ -0,0 +1,24 @@
|
||||||
|
import path from "node:path";
|
||||||
|
import { fileURLToPath } from "node:url";
|
||||||
|
import Database from "better-sqlite3";
|
||||||
|
|
||||||
|
// ── Paths ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
||||||
|
|
||||||
|
const DB_PATH = path.join(__dirname, "pipeline.db");
|
||||||
|
|
||||||
|
// ── Types ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
export type Db = InstanceType<typeof Database>;
|
||||||
|
|
||||||
|
// ── Open ──────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Opens the pipeline SQLite database located at `DB_PATH`.
 *
 * Before the connection is handed back, two pragmas are applied in order:
 * `journal_mode = WAL` and `foreign_keys = ON`.
 *
 * @returns The open connection. This function does not close it.
 */
export function openDb(): Db {
  const connection = new Database(DB_PATH);

  // Applied in this order on every new connection.
  const startupPragmas = ["journal_mode = WAL", "foreign_keys = ON"];
  for (const pragma of startupPragmas) {
    connection.pragma(pragma);
  }

  return connection;
}
|
||||||
39
data-pipeline/db/init.ts
Normal file
39
data-pipeline/db/init.ts
Normal file
|
|
@ -0,0 +1,39 @@
|
||||||
|
import fs from "node:fs/promises";
|
||||||
|
import path from "node:path";
|
||||||
|
import { fileURLToPath } from "node:url";
|
||||||
|
import Database from "better-sqlite3";
|
||||||
|
|
||||||
|
// ── Paths ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
||||||
|
|
||||||
|
const PATHS = {
|
||||||
|
schema: path.join(__dirname, "schema.sql"),
|
||||||
|
db: path.join(__dirname, "pipeline.db"),
|
||||||
|
};
|
||||||
|
|
||||||
|
// ── Init ──────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Opens the database at `PATHS.db` and executes the SQL read from
 * `PATHS.schema` against it.
 *
 * The connection is closed before this function returns, including when a
 * pragma or the schema execution throws, so a failed run does not leave the
 * database handle open.
 *
 * @returns A promise that resolves once the schema has been applied and the
 *   connection has been closed.
 * @throws Rejects if the schema file cannot be read or if SQLite rejects a
 *   pragma or a schema statement.
 */
export async function initDb(): Promise<void> {
  const schema = await fs.readFile(PATHS.schema, "utf-8");
  const db = new Database(PATHS.db);

  try {
    db.pragma("journal_mode = WAL");
    db.pragma("foreign_keys = ON");
    db.exec(schema);
  } finally {
    // Always release the handle, even when the schema fails to apply.
    db.close();
  }

  console.log(` pipeline.db initialised → ${PATHS.db}`);
}
|
||||||
|
|
||||||
|
// ── Main ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Script entry point: announces the run, then initialises the database via
 * `initDb`.
 */
async function main(): Promise<void> {
  console.log("Initialising pipeline.db...");
  await initDb();
}

// Any failure is logged and turned into a non-zero exit code.
main().catch((failure) => {
  console.error(failure);
  process.exit(1);
});
|
||||||
BIN
data-pipeline/db/pipeline.db
Normal file
BIN
data-pipeline/db/pipeline.db
Normal file
Binary file not shown.
157
data-pipeline/db/schema.sql
Normal file
157
data-pipeline/db/schema.sql
Normal file
|
|
@ -0,0 +1,157 @@
|
||||||
|
-- ── Base data ─────────────────────────────────────────────────────────────────
|
||||||
|
-- Imported from stage 2 JSON on first run. Never mutated after import.
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS synsets (
|
||||||
|
source_id TEXT PRIMARY KEY,
|
||||||
|
pos TEXT NOT NULL
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS translations (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
source_id TEXT NOT NULL REFERENCES synsets(source_id),
|
||||||
|
language TEXT NOT NULL,
|
||||||
|
word TEXT NOT NULL,
|
||||||
|
UNIQUE (source_id, language, word)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS glosses (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
source_id TEXT NOT NULL REFERENCES synsets(source_id),
|
||||||
|
language TEXT NOT NULL,
|
||||||
|
text TEXT NOT NULL
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS examples (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
source_id TEXT NOT NULL REFERENCES synsets(source_id),
|
||||||
|
language TEXT NOT NULL,
|
||||||
|
text TEXT NOT NULL,
|
||||||
|
source TEXT NOT NULL
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS cefr_source_votes (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
translation_id INTEGER NOT NULL REFERENCES translations(id),
|
||||||
|
cefr_level TEXT NOT NULL,
|
||||||
|
UNIQUE (translation_id)
|
||||||
|
);
|
||||||
|
|
||||||
|
-- ── Status tracking ───────────────────────────────────────────────────────────
|
||||||
|
-- One row per synset per model per stage. Drives resumability.
|
||||||
|
-- stage: round1 | round2 | tiebreak
|
||||||
|
-- status: pending | complete | needs_review | flagged
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS run_status (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
source_id TEXT NOT NULL REFERENCES synsets(source_id),
|
||||||
|
model_name TEXT NOT NULL,
|
||||||
|
stage TEXT NOT NULL,
|
||||||
|
status TEXT NOT NULL,
|
||||||
|
created_at TEXT NOT NULL DEFAULT (datetime('now')),
|
||||||
|
updated_at TEXT NOT NULL DEFAULT (datetime('now')),
|
||||||
|
UNIQUE (source_id, model_name, stage)
|
||||||
|
);
|
||||||
|
|
||||||
|
-- ── Round 1 output ────────────────────────────────────────────────────────────
|
||||||
|
-- One row per translation/language per model. Written atomically per record.
|
||||||
|
-- Unique constraints enforce one model one vote.
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS model_cefr_votes (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
translation_id INTEGER NOT NULL REFERENCES translations(id),
|
||||||
|
model_name TEXT NOT NULL,
|
||||||
|
cefr_level TEXT NOT NULL,
|
||||||
|
UNIQUE (translation_id, model_name)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS generated_glosses (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
source_id TEXT NOT NULL REFERENCES synsets(source_id),
|
||||||
|
model_name TEXT NOT NULL,
|
||||||
|
language TEXT NOT NULL,
|
||||||
|
text TEXT NOT NULL,
|
||||||
|
UNIQUE (source_id, model_name, language)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS generated_examples (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
source_id TEXT NOT NULL REFERENCES synsets(source_id),
|
||||||
|
model_name TEXT NOT NULL,
|
||||||
|
language TEXT NOT NULL,
|
||||||
|
text TEXT NOT NULL,
|
||||||
|
UNIQUE (source_id, model_name, language)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS generated_descriptions (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
source_id TEXT NOT NULL REFERENCES synsets(source_id),
|
||||||
|
model_name TEXT NOT NULL,
|
||||||
|
language TEXT NOT NULL,
|
||||||
|
text TEXT NOT NULL,
|
||||||
|
UNIQUE (source_id, model_name, language)
|
||||||
|
);
|
||||||
|
|
||||||
|
-- ── Round 2 output ────────────────────────────────────────────────────────────
|
||||||
|
-- Each row represents one model voting for one candidate.
|
||||||
|
-- The candidate with the most votes wins in merge.
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS gloss_candidate_votes (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
gloss_id INTEGER NOT NULL REFERENCES generated_glosses(id),
|
||||||
|
model_name TEXT NOT NULL,
|
||||||
|
UNIQUE (gloss_id, model_name)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS example_candidate_votes (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
example_id INTEGER NOT NULL REFERENCES generated_examples(id),
|
||||||
|
model_name TEXT NOT NULL,
|
||||||
|
UNIQUE (example_id, model_name)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS description_candidate_votes (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
description_id INTEGER NOT NULL REFERENCES generated_descriptions(id),
|
||||||
|
model_name TEXT NOT NULL,
|
||||||
|
UNIQUE (description_id, model_name)
|
||||||
|
);
|
||||||
|
|
||||||
|
-- ── Resolved output ───────────────────────────────────────────────────────────
|
||||||
|
-- Written by merge. Never updated after writing.
|
||||||
|
-- Only fully resolved records are written here — no nulls, no flags.
|
||||||
|
-- Absence of a row means unresolved. Flagged status tracked in run_status.
|
||||||
|
-- source: omw | cefr | model_name
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS resolved_translations (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
translation_id INTEGER NOT NULL REFERENCES translations(id),
|
||||||
|
cefr_level TEXT NOT NULL,
|
||||||
|
difficulty TEXT NOT NULL,
|
||||||
|
UNIQUE (translation_id)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS resolved_glosses (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
source_id TEXT NOT NULL REFERENCES synsets(source_id),
|
||||||
|
language TEXT NOT NULL,
|
||||||
|
text TEXT NOT NULL,
|
||||||
|
source TEXT NOT NULL,
|
||||||
|
UNIQUE (source_id, language)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS resolved_examples (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
source_id TEXT NOT NULL REFERENCES synsets(source_id),
|
||||||
|
language TEXT NOT NULL,
|
||||||
|
text TEXT NOT NULL,
|
||||||
|
source TEXT NOT NULL
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS resolved_descriptions (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
source_id TEXT NOT NULL REFERENCES synsets(source_id),
|
||||||
|
language TEXT NOT NULL,
|
||||||
|
text TEXT NOT NULL,
|
||||||
|
source TEXT NOT NULL,
|
||||||
|
UNIQUE (source_id, language)
|
||||||
|
);
|
||||||
|
|
@ -3,7 +3,11 @@
|
||||||
"version": "1.0.0",
|
"version": "1.0.0",
|
||||||
"private": true,
|
"private": true,
|
||||||
"type": "module",
|
"type": "module",
|
||||||
"scripts": {},
|
"scripts": {
|
||||||
|
"db:init": "tsx db/init.ts",
|
||||||
|
"test": "vitest run",
|
||||||
|
"test:watch": "vitest"
|
||||||
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@lila/shared": "workspace:*",
|
"@lila/shared": "workspace:*",
|
||||||
"better-sqlite3": "^12.9.0"
|
"better-sqlite3": "^12.9.0"
|
||||||
|
|
@ -12,6 +16,7 @@
|
||||||
"@types/better-sqlite3": "^7.6.13",
|
"@types/better-sqlite3": "^7.6.13",
|
||||||
"@types/node": "^24.12.0",
|
"@types/node": "^24.12.0",
|
||||||
"tsx": "^4.21.0",
|
"tsx": "^4.21.0",
|
||||||
"typescript": "^5.9.3"
|
"typescript": "^5.9.3",
|
||||||
|
"vitest": "^4.1.0"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -154,7 +154,7 @@ async function loadAnnotated(): Promise<AnnotatedRecord[]> {
|
||||||
for (const [l, examples] of Object.entries(record.examples)) {
|
for (const [l, examples] of Object.entries(record.examples)) {
|
||||||
const lang = l as SupportedLanguageCode;
|
const lang = l as SupportedLanguageCode;
|
||||||
if (!base.examples[lang]) {
|
if (!base.examples[lang]) {
|
||||||
base.examples[lang] = examples as Example[];
|
base.examples[lang] = examples;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -8,5 +8,5 @@
|
||||||
"types": ["node"]
|
"types": ["node"]
|
||||||
},
|
},
|
||||||
"references": [{ "path": "../packages/shared" }],
|
"references": [{ "path": "../packages/shared" }],
|
||||||
"include": ["./**/*"]
|
"include": ["./**/*", "vitest.config.ts"]
|
||||||
}
|
}
|
||||||
|
|
|
||||||
10
data-pipeline/vitest.config.ts
Normal file
10
data-pipeline/vitest.config.ts
Normal file
|
|
@ -0,0 +1,10 @@
|
||||||
|
import { defineConfig } from "vitest/config";
|
||||||
|
|
||||||
|
export default defineConfig({
|
||||||
|
test: {
|
||||||
|
environment: "node",
|
||||||
|
globals: true,
|
||||||
|
include: ["tests/**/*.test.ts"],
|
||||||
|
exclude: ["**/dist/**", "**/node_modules/**"],
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
@ -528,6 +528,7 @@ llama.cpp is not installed.
|
||||||
|
|
||||||
**Next action:** Write the round 1 generation script.
|
**Next action:** Write the round 1 generation script.
|
||||||
|
|
||||||
|
- [ ] Write tests for stage 3
|
||||||
- [ ] Write round 1 script (generation)
|
- [ ] Write round 1 script (generation)
|
||||||
- [ ] Write compile-candidates script
|
- [ ] Write compile-candidates script
|
||||||
- [ ] Write round 2 script (voting)
|
- [ ] Write round 2 script (voting)
|
||||||
|
|
@ -542,24 +543,28 @@ llama.cpp is not installed.
|
||||||
|
|
||||||
### Stage 4 — Merge `🔲 not started`
|
### Stage 4 — Merge `🔲 not started`
|
||||||
|
|
||||||
|
- [ ] Write tests for stage 4
|
||||||
- [ ] Write merge script
|
- [ ] Write merge script
|
||||||
- [ ] Run merge → `pipeline.db`
|
- [ ] Run merge → `pipeline.db`
|
||||||
- [ ] Confirm tiebreaker resolves all flagged translations
|
- [ ] Confirm tiebreaker resolves all flagged translations
|
||||||
|
|
||||||
### Stage 4b — Tiebreak `🔲 not started`
|
### Stage 4b — Tiebreak `🔲 not started`
|
||||||
|
|
||||||
|
- [ ] Write tests for stage 4b
|
||||||
- [ ] Write tiebreak logic
|
- [ ] Write tiebreak logic
|
||||||
- [ ] Run tiebreaker for all flagged translations
|
- [ ] Run tiebreaker for all flagged translations
|
||||||
- [ ] Confirm no flagged translations remain before seeding
|
- [ ] Confirm no flagged translations remain before seeding
|
||||||
|
|
||||||
### Stage 5 — Compare / QA `🔲 not started`
|
### Stage 5 — Compare / QA `🔲 not started`
|
||||||
|
|
||||||
|
- [ ] Write tests for stage 5
|
||||||
- [ ] Write compare script
|
- [ ] Write compare script
|
||||||
- [ ] Run compare → `COVERAGE.md`
|
- [ ] Run compare → `COVERAGE.md`
|
||||||
- [ ] Review output quality before seeding
|
- [ ] Review output quality before seeding
|
||||||
|
|
||||||
### Stage 6 — Sync `🔲 not started`
|
### Stage 6 — Sync `🔲 not started`
|
||||||
|
|
||||||
|
- [ ] Write tests for stage 6
|
||||||
- [ ] Write sync script
|
- [ ] Write sync script
|
||||||
- [ ] Configure `DATABASE_URL` in `.env`
|
- [ ] Configure `DATABASE_URL` in `.env`
|
||||||
- [ ] Run sync → production PostgreSQL
|
- [ ] Run sync → production PostgreSQL
|
||||||
|
|
|
||||||
|
|
@ -12,7 +12,6 @@ export default defineConfig([
|
||||||
"node_modules/",
|
"node_modules/",
|
||||||
"routeTree.gen.ts",
|
"routeTree.gen.ts",
|
||||||
"scripts/**",
|
"scripts/**",
|
||||||
"data-pipeline/**/*",
|
|
||||||
]),
|
]),
|
||||||
|
|
||||||
eslint.configs.recommended,
|
eslint.configs.recommended,
|
||||||
|
|
|
||||||
|
|
@ -23,7 +23,7 @@
|
||||||
"prettier --write"
|
"prettier --write"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"packageManager": "pnpm@10.33.1",
|
"packageManager": "pnpm@10.33.2",
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"@eslint/js": "^10.0.1",
|
"@eslint/js": "^10.0.1",
|
||||||
"@tanstack/eslint-plugin-router": "^1.161.6",
|
"@tanstack/eslint-plugin-router": "^1.161.6",
|
||||||
|
|
|
||||||
5
pnpm-lock.yaml
generated
5
pnpm-lock.yaml
generated
|
|
@ -173,6 +173,9 @@ importers:
|
||||||
typescript:
|
typescript:
|
||||||
specifier: ^5.9.3
|
specifier: ^5.9.3
|
||||||
version: 5.9.3
|
version: 5.9.3
|
||||||
|
vitest:
|
||||||
|
specifier: ^4.1.0
|
||||||
|
version: 4.1.0(@opentelemetry/api@1.9.1)(@types/node@24.12.0)(jsdom@29.0.1(@noble/hashes@2.2.0))(vite@8.0.1(@types/node@24.12.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.3))
|
||||||
|
|
||||||
packages/db:
|
packages/db:
|
||||||
dependencies:
|
dependencies:
|
||||||
|
|
@ -4391,7 +4394,6 @@ snapshots:
|
||||||
magic-string: 0.30.21
|
magic-string: 0.30.21
|
||||||
optionalDependencies:
|
optionalDependencies:
|
||||||
vite: 8.0.1(@types/node@24.12.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.3)
|
vite: 8.0.1(@types/node@24.12.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.3)
|
||||||
optional: true
|
|
||||||
|
|
||||||
'@vitest/mocker@4.1.0(vite@8.0.1(@types/node@25.5.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.3))':
|
'@vitest/mocker@4.1.0(vite@8.0.1(@types/node@25.5.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.3))':
|
||||||
dependencies:
|
dependencies:
|
||||||
|
|
@ -6136,7 +6138,6 @@ snapshots:
|
||||||
jsdom: 29.0.1(@noble/hashes@2.2.0)
|
jsdom: 29.0.1(@noble/hashes@2.2.0)
|
||||||
transitivePeerDependencies:
|
transitivePeerDependencies:
|
||||||
- msw
|
- msw
|
||||||
optional: true
|
|
||||||
|
|
||||||
vitest@4.1.0(@opentelemetry/api@1.9.1)(@types/node@25.5.0)(jsdom@29.0.1(@noble/hashes@2.2.0))(vite@8.0.1(@types/node@25.5.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.3)):
|
vitest@4.1.0(@opentelemetry/api@1.9.1)(@types/node@25.5.0)(jsdom@29.0.1(@noble/hashes@2.2.0))(vite@8.0.1(@types/node@25.5.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.3)):
|
||||||
dependencies:
|
dependencies:
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue