Add an import-validation test suite for data-pipeline/db/pipeline.db and stop
tracking the generated database.

Review notes:
- pipeline.db-shm and pipeline.db-wal are SQLite write-ahead-log sidecar files
  created at runtime; they are ignored together with pipeline.db instead of
  being added to the repository.
- The suite now closes its database connection when it finishes.
- The "index" blob hashes below are carried over from the original patch and
  do not reflect the revised file contents.

diff --git a/.gitignore b/.gitignore
index ad49f49..893044a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,3 +15,8 @@ data-pipeline/stage-1-extract/output/
 data-pipeline/stage-2-annotate/output/
 data-pipeline/stage-3-enrich/output/
 data-pipeline/stage-4-merge/output/
+data-pipeline/db/pipeline.db
+data-pipeline/db/pipeline.db-shm
+data-pipeline/db/pipeline.db-wal
+data-pipeline/reports/
+
diff --git a/data-pipeline/db/pipeline.db b/data-pipeline/db/pipeline.db
deleted file mode 100644
index e7c3bbe..0000000
Binary files a/data-pipeline/db/pipeline.db and /dev/null differ
diff --git a/data-pipeline/tests/validation/db-import.validation.test.ts b/data-pipeline/tests/validation/db-import.validation.test.ts
new file mode 100644
index 0000000..23c56e7
--- /dev/null
+++ b/data-pipeline/tests/validation/db-import.validation.test.ts
@@ -0,0 +1,255 @@
+import fs from "node:fs/promises";
+import path from "node:path";
+import { describe, it, expect, beforeAll, afterAll } from "vitest";
+import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
+import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
+
+// ── Types ─────────────────────────────────────────────────────────────────────
+
+/** An example text tagged with its source ("omw" or "cefr"). */
+type Example = { text: string; source: "omw" | "cefr" };
+
+/**
+ * One record of a stage 2 annotated output file (`<lang>.json`).
+ *
+ * NOTE(review): the generic arguments of the four map-typed fields are
+ * reconstructed; this suite only reads `source_id` and `votes`. Confirm the
+ * value types against the stage 2 output schema.
+ */
+type AnnotatedRecord = {
+  source_id: string;
+  pos: SupportedPos;
+  translations: Partial<Record<SupportedLanguageCode, string[]>>;
+  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
+  examples: Partial<Record<SupportedLanguageCode, Example[]>>;
+  // Per language: one entry per vote; the entries are counted below.
+  votes: Partial<Record<SupportedLanguageCode, Record<string, unknown>>>;
+};
+
+// ── Paths ─────────────────────────────────────────────────────────────────────
+
+// Relative segments are resolved against the current working directory, so
+// the suite expects to be started from the data-pipeline/ directory.
+const DB_PATH = path.resolve("db/pipeline.db");
+const OMW_PATH = path.resolve("stage-1-extract/output/omw.json");
+const ANNOTATED_DIR = path.resolve("stage-2-annotate/output");
+
+// ── Helpers ───────────────────────────────────────────────────────────────────
+
+/** Resolves to true when DB_PATH is accessible, false otherwise. */
+async function dbExists(): Promise<boolean> {
+  try {
+    await fs.access(DB_PATH);
+    return true;
+  } catch {
+    return false;
+  }
+}
+
+// ── Tests ─────────────────────────────────────────────────────────────────────
+
+describe("pipeline.db — import validation", () => {
+  // Left undefined when pipeline.db is missing: every query test then returns
+  // early and only the existence test reports the failure.
+  let db: import("better-sqlite3").Database | undefined;
+  let expectedSynsetCount: number;
+  let expectedCefrVoteCount: number;
+
+  beforeAll(async () => {
+    if (!(await dbExists())) return;
+
+    const Database = (await import("better-sqlite3")).default;
+    db = new Database(DB_PATH, { readonly: true });
+    db.pragma("foreign_keys = ON");
+
+    // Count expected synsets from omw.json
+    const omwRaw = await fs.readFile(OMW_PATH, "utf-8");
+    const omwRecords = JSON.parse(omwRaw) as unknown[];
+    expectedSynsetCount = omwRecords.length;
+
+    // Count expected CEFR votes from stage 2 annotated files.
+    // Merge all language files the same way the import script does —
+    // use en.json as base and merge votes from the other language files.
+    const byId = new Map<string, AnnotatedRecord>();
+
+    const baseRaw = await fs.readFile(
+      path.join(ANNOTATED_DIR, "en.json"),
+      "utf-8",
+    );
+    const base = JSON.parse(baseRaw) as AnnotatedRecord[];
+    for (const record of base) {
+      byId.set(record.source_id, record);
+    }
+
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      if (lang === "en") continue;
+      const raw = await fs.readFile(
+        path.join(ANNOTATED_DIR, `${lang}.json`),
+        "utf-8",
+      );
+      const records = JSON.parse(raw) as AnnotatedRecord[];
+      for (const record of records) {
+        // Records whose source_id is absent from en.json are skipped.
+        const target = byId.get(record.source_id);
+        if (!target) continue;
+        for (const [l, langVotes] of Object.entries(record.votes)) {
+          const code = l as SupportedLanguageCode;
+          const merged = target.votes[code] ?? {};
+          Object.assign(merged, langVotes);
+          target.votes[code] = merged;
+        }
+      }
+    }
+
+    expectedCefrVoteCount = 0;
+    for (const record of byId.values()) {
+      for (const langVotes of Object.values(record.votes)) {
+        expectedCefrVoteCount += Object.keys(langVotes ?? {}).length;
+      }
+    }
+  }, 120_000);
+
+  // Release the SQLite handle once the suite has finished.
+  afterAll(() => {
+    db?.close();
+  });
+
+  it("pipeline.db exists — skipping all tests if not", async () => {
+    const exists = await dbExists();
+    if (!exists) {
+      console.warn(
+        "\n pipeline.db not found — run pnpm db:init and pnpm db:import first\n",
+      );
+    }
+    expect(exists).toBe(true);
+  });
+
+  it("synsets count matches omw.json", () => {
+    if (!db) return;
+    const row = db.prepare("SELECT COUNT(*) as count FROM synsets").get() as {
+      count: number;
+    };
+    expect(row.count).toBe(expectedSynsetCount);
+  });
+
+  it("every synset has at least one translation", () => {
+    if (!db) return;
+    const rows = db
+      .prepare(
+        `
+        SELECT s.source_id
+        FROM synsets s
+        LEFT JOIN translations t ON t.source_id = s.source_id
+        WHERE t.id IS NULL
+      `,
+      )
+      .all() as { source_id: string }[];
+
+    const errors = rows.map((r) => `${r.source_id}: no translations`);
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("every translation belongs to a valid synset", () => {
+    if (!db) return;
+    const rows = db
+      .prepare(
+        `
+        SELECT t.id, t.source_id
+        FROM translations t
+        LEFT JOIN synsets s ON s.source_id = t.source_id
+        WHERE s.source_id IS NULL
+      `,
+      )
+      .all() as { id: number; source_id: string }[];
+
+    const errors = rows.map(
+      (r) => `translation ${r.id}: references missing synset ${r.source_id}`,
+    );
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("every cefr_source_vote references a valid translation", () => {
+    if (!db) return;
+    const rows = db
+      .prepare(
+        `
+        SELECT v.id, v.translation_id
+        FROM cefr_source_votes v
+        LEFT JOIN translations t ON t.id = v.translation_id
+        WHERE t.id IS NULL
+      `,
+      )
+      .all() as { id: number; translation_id: number }[];
+
+    const errors = rows.map(
+      (r) =>
+        `cefr_vote ${r.id}: references missing translation ${r.translation_id}`,
+    );
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("cefr_source_votes count matches stage 2 annotated output", () => {
+    if (!db) return;
+    const row = db
+      .prepare("SELECT COUNT(*) as count FROM cefr_source_votes")
+      .get() as { count: number };
+    expect(row.count).toBe(expectedCefrVoteCount);
+  });
+
+  it("every example has a valid source", () => {
+    if (!db) return;
+    const rows = db
+      .prepare(
+        `
+        SELECT source_id, language, source
+        FROM examples
+        WHERE source NOT IN ('omw', 'cefr')
+      `,
+      )
+      .all() as { source_id: string; language: string; source: string }[];
+
+    const errors = rows.map(
+      (r) =>
+        `${r.source_id} (${r.language}): invalid example source "${r.source}"`,
+    );
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("every example belongs to a valid synset", () => {
+    if (!db) return;
+    const rows = db
+      .prepare(
+        `
+        SELECT e.id, e.source_id
+        FROM examples e
+        LEFT JOIN synsets s ON s.source_id = e.source_id
+        WHERE s.source_id IS NULL
+      `,
+      )
+      .all() as { id: number; source_id: string }[];
+
+    const errors = rows.map(
+      (r) => `example ${r.id}: references missing synset ${r.source_id}`,
+    );
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("every gloss belongs to a valid synset", () => {
+    if (!db) return;
+    const rows = db
+      .prepare(
+        `
+        SELECT g.id, g.source_id
+        FROM glosses g
+        LEFT JOIN synsets s ON s.source_id = g.source_id
+        WHERE s.source_id IS NULL
+      `,
+      )
+      .all() as { id: number; source_id: string }[];
+
+    const errors = rows.map(
+      (r) => `gloss ${r.id}: references missing synset ${r.source_id}`,
+    );
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+});