Remove the db from git tracking, add it to .gitignore, and add db import validation tests
This commit is contained in:
parent
f59399be02
commit
4d42fe4397
5 changed files with 240 additions and 0 deletions
3
.gitignore
vendored
3
.gitignore
vendored
|
|
@ -15,3 +15,6 @@ data-pipeline/stage-1-extract/output/
|
||||||
data-pipeline/stage-2-annotate/output/
|
data-pipeline/stage-2-annotate/output/
|
||||||
data-pipeline/stage-3-enrich/output/
|
data-pipeline/stage-3-enrich/output/
|
||||||
data-pipeline/stage-4-merge/output/
|
data-pipeline/stage-4-merge/output/
|
||||||
|
data-pipeline/db/pipeline.db
|
||||||
|
data-pipeline/reports/
|
||||||
|
|
||||||
|
|
|
||||||
Binary file not shown.
BIN
data-pipeline/db/pipeline.db-shm
Normal file
BIN
data-pipeline/db/pipeline.db-shm
Normal file
Binary file not shown.
0
data-pipeline/db/pipeline.db-wal
Normal file
0
data-pipeline/db/pipeline.db-wal
Normal file
237
data-pipeline/tests/validation/db-import.validation.test.ts
Normal file
237
data-pipeline/tests/validation/db-import.validation.test.ts
Normal file
|
|
@ -0,0 +1,237 @@
|
||||||
|
import fs from "node:fs/promises";
|
||||||
|
import path from "node:path";
|
||||||
|
import { describe, it, expect, beforeAll } from "vitest";
|
||||||
|
import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
|
||||||
|
import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
|
||||||
|
|
||||||
|
// ── Types ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/** One example entry: its text plus a tag naming the dataset it came from. */
type Example = {
  /** The example text itself. */
  text: string;
  /** Origin tag; only "omw" and "cefr" are permitted values. */
  source: "omw" | "cefr";
};
|
||||||
|
|
||||||
|
/** A value that may be present for any subset of the supported languages. */
type PerLanguage<T> = Partial<Record<SupportedLanguageCode, T>>;

/**
 * Shape this test assumes for one record of a stage 2 annotated JSON file.
 *
 * The files are parsed with a plain type assertion, so nothing here is
 * validated at runtime — the type only describes what the test reads.
 */
type AnnotatedRecord = {
  /** Identifier used to match the same record across the language files. */
  source_id: string;
  /** Part of speech. */
  pos: SupportedPos;
  /** Translations, keyed by language code. */
  translations: PerLanguage<string[]>;
  /** Glosses, keyed by language code. */
  glosses: PerLanguage<string[]>;
  /** Examples, keyed by language code. */
  examples: PerLanguage<Example[]>;
  /**
   * CEFR votes, keyed by language code and then by a string key.
   * NOTE(review): the inner key is presumably the translated word — confirm
   * against the stage 2 writer.
   */
  votes: PerLanguage<Record<string, { cefr_source: string }>>;
};
|
||||||
|
|
||||||
|
// ── Paths ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// All three locations are relative, so path.resolve anchors them to the
// current working directory: the suite must be started from the directory
// that contains db/, stage-1-extract/ and stage-2-annotate/.

/** SQLite database file that the tests open read-only. */
const DB_PATH = path.resolve("db/pipeline.db");

/** Stage 1 output; parsed as a JSON array whose length is the expected synset count. */
const OMW_PATH = path.resolve("stage-1-extract/output/omw.json");

/** Directory holding the stage 2 annotated files, one `<language>.json` per language code. */
const ANNOTATED_DIR = path.resolve("stage-2-annotate/output");
|
||||||
|
|
||||||
|
// ── Helpers ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Reports whether the database file at DB_PATH can be accessed.
 *
 * Every failure of `fs.access` is mapped to `false`, not only a missing file,
 * so this function never rejects.
 *
 * @returns `true` when `fs.access(DB_PATH)` succeeds, `false` otherwise.
 */
async function dbExists(): Promise<boolean> {
  try {
    await fs.access(DB_PATH);
    return true;
  } catch {
    return false;
  }
}
|
||||||
|
|
||||||
|
// ── Tests ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Validation suite for the imported SQLite database.
 *
 * Setup opens the database read-only and derives two expected totals from the
 * pipeline's JSON outputs. The tests then compare those totals with the row
 * counts and look for rows that reference a missing parent.
 *
 * When the database file is absent, setup does nothing: the first test fails
 * with a hint, and every other test returns early without asserting.
 */
describe("pipeline.db — import validation", () => {
  // Stays undefined when the database file is missing.
  let db: import("better-sqlite3").Database | undefined;
  let expectedSynsetCount: number;
  let expectedCefrVoteCount: number;

  /** Asserts the list is empty; on failure the message lists one problem per line. */
  const expectNoErrors = (errors: string[]): void => {
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  };

  beforeAll(async () => {
    if (!(await dbExists())) return undefined;

    const Database = (await import("better-sqlite3")).default;
    const handle = new Database(DB_PATH, { readonly: true });

    try {
      handle.pragma("foreign_keys = ON");

      // Count expected synsets from omw.json
      const omwRaw = await fs.readFile(OMW_PATH, "utf-8");
      const omwRecords = JSON.parse(omwRaw) as unknown[];
      expectedSynsetCount = omwRecords.length;

      // Count expected CEFR votes from stage 2 annotated files.
      // Merge all language files the same way the import script does —
      // use en.json as base and merge votes from the other language files.
      const byId = new Map<string, AnnotatedRecord>();

      const baseRaw = await fs.readFile(
        path.join(ANNOTATED_DIR, "en.json"),
        "utf-8",
      );
      const baseRecords = JSON.parse(baseRaw) as AnnotatedRecord[];
      for (const record of baseRecords) {
        byId.set(record.source_id, record);
      }

      for (const lang of SUPPORTED_LANGUAGE_CODES) {
        if (lang === "en") continue;

        const raw = await fs.readFile(
          path.join(ANNOTATED_DIR, `${lang}.json`),
          "utf-8",
        );
        const records = JSON.parse(raw) as AnnotatedRecord[];

        for (const record of records) {
          // Records that are not present in en.json are ignored.
          const merged = byId.get(record.source_id);
          if (!merged) continue;

          for (const [code, langVotes] of Object.entries(record.votes)) {
            const key = code as SupportedLanguageCode;
            let target = merged.votes[key];
            if (!target) {
              target = {};
              merged.votes[key] = target;
            }
            // Votes from later files overwrite earlier ones with the same key.
            Object.assign(target, langVotes);
          }
        }
      }

      expectedCefrVoteCount = 0;
      for (const record of byId.values()) {
        for (const langVotes of Object.values(record.votes)) {
          expectedCefrVoteCount += Object.keys(langVotes ?? {}).length;
        }
      }
    } catch (error) {
      // Setup failed after the handle was opened: release it before failing.
      handle.close();
      throw error;
    }

    db = handle;

    // A function returned from beforeAll runs as teardown once the suite ends.
    return () => {
      handle.close();
      db = undefined;
    };
  }, 120_000);

  it("pipeline.db exists — skipping all tests if not", async () => {
    const exists = await dbExists();
    if (!exists) {
      console.warn(
        "\n pipeline.db not found — run pnpm db:init and pnpm db:import first\n",
      );
    }
    expect(exists).toBe(true);
  });

  it("synsets count matches omw.json", () => {
    if (!db) return;
    const row = db.prepare("SELECT COUNT(*) as count FROM synsets").get() as {
      count: number;
    };
    expect(row.count).toBe(expectedSynsetCount);
  });

  it("every synset has at least one translation", () => {
    if (!db) return;
    const rows = db
      .prepare(
        `
        SELECT s.source_id
        FROM synsets s
        LEFT JOIN translations t ON t.source_id = s.source_id
        WHERE t.id IS NULL
        `,
      )
      .all() as { source_id: string }[];

    expectNoErrors(rows.map((r) => `${r.source_id}: no translations`));
  });

  it("every translation belongs to a valid synset", () => {
    if (!db) return;
    const rows = db
      .prepare(
        `
        SELECT t.id, t.source_id
        FROM translations t
        LEFT JOIN synsets s ON s.source_id = t.source_id
        WHERE s.source_id IS NULL
        `,
      )
      .all() as { id: number; source_id: string }[];

    expectNoErrors(
      rows.map(
        (r) => `translation ${r.id}: references missing synset ${r.source_id}`,
      ),
    );
  });

  it("every cefr_source_vote references a valid translation", () => {
    if (!db) return;
    const rows = db
      .prepare(
        `
        SELECT v.id, v.translation_id
        FROM cefr_source_votes v
        LEFT JOIN translations t ON t.id = v.translation_id
        WHERE t.id IS NULL
        `,
      )
      .all() as { id: number; translation_id: number }[];

    expectNoErrors(
      rows.map(
        (r) =>
          `cefr_vote ${r.id}: references missing translation ${r.translation_id}`,
      ),
    );
  });

  it("cefr_source_votes count matches stage 2 annotated output", () => {
    if (!db) return;
    const row = db
      .prepare("SELECT COUNT(*) as count FROM cefr_source_votes")
      .get() as { count: number };
    expect(row.count).toBe(expectedCefrVoteCount);
  });

  it("every example has a valid source", () => {
    if (!db) return;
    const rows = db
      .prepare(
        `
        SELECT source_id, language, source
        FROM examples
        WHERE source NOT IN ('omw', 'cefr')
        `,
      )
      .all() as { source_id: string; language: string; source: string }[];

    expectNoErrors(
      rows.map(
        (r) =>
          `${r.source_id} (${r.language}): invalid example source "${r.source}"`,
      ),
    );
  });

  it("every example belongs to a valid synset", () => {
    if (!db) return;
    const rows = db
      .prepare(
        `
        SELECT e.id, e.source_id
        FROM examples e
        LEFT JOIN synsets s ON s.source_id = e.source_id
        WHERE s.source_id IS NULL
        `,
      )
      .all() as { id: number; source_id: string }[];

    expectNoErrors(
      rows.map(
        (r) => `example ${r.id}: references missing synset ${r.source_id}`,
      ),
    );
  });

  it("every gloss belongs to a valid synset", () => {
    if (!db) return;
    const rows = db
      .prepare(
        `
        SELECT g.id, g.source_id
        FROM glosses g
        LEFT JOIN synsets s ON s.source_id = g.source_id
        WHERE s.source_id IS NULL
        `,
      )
      .all() as { id: number; source_id: string }[];

    expectNoErrors(
      rows.map((r) => `gloss ${r.id}: references missing synset ${r.source_id}`),
    );
  });
});
|
||||||
Loading…
Add table
Add a link
Reference in a new issue