feat: add Kaikki extraction and import scripts for stage 1

- Add stage-1-extract/scripts/extract.ts — streams Kaikki JSONL, filters to supported POS and languages, skips abbreviations and senses with no translations in supported languages
- Rewrite db/import.ts for Kaikki flat model — tracks sense_index offsets per headword+pos to handle duplicate JSONL entries
- Rewrite db/schema.sql for Kaikki model — entries, translations, LLM vote tables, resolved tables
- Add extract and db:import scripts to package.json
- Sample mode hardcoded to 500 entries for development
2026-05-05 18:11:53 +02:00 · 2026-05-05 18:11:53 +02:00 · 209d52f54b
commit 209d52f54b
parent 963bff4eb8
17 changed files with 346 additions and 1055737 deletions
--- a/data-pipeline/tests/validation/db-import.validation.test.ts
+++ b/data-pipeline/tests/validation/db-import.validation.test.ts
@ -1,237 +0,0 @@
-import fs from "node:fs/promises";
-import path from "node:path";
-import { describe, it, expect, beforeAll } from "vitest";
-import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
-import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
-
-// ── Types ─────────────────────────────────────────────────────────────────────
-
-type Example = { text: string; source: "omw" | "cefr" };
-
-type AnnotatedRecord = {
-  source_id: string;
-  pos: SupportedPos;
-  translations: Partial<Record<SupportedLanguageCode, string[]>>;
-  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
-  examples: Partial<Record<SupportedLanguageCode, Example[]>>;
-  votes: Partial<
-    Record<SupportedLanguageCode, Record<string, { cefr_source: string }>>
-  >;
-};
-
-// ── Paths ─────────────────────────────────────────────────────────────────────
-
-const DB_PATH = path.resolve("db/pipeline.db");
-const OMW_PATH = path.resolve("stage-1-extract/output/omw.json");
-const ANNOTATED_DIR = path.resolve("stage-2-annotate/output");
-
-// ── Helpers ───────────────────────────────────────────────────────────────────
-
-async function dbExists(): Promise<boolean> {
-  try {
-    await fs.access(DB_PATH);
-    return true;
-  } catch {
-    return false;
-  }
-}
-
-// ── Tests ─────────────────────────────────────────────────────────────────────
-
-describe("pipeline.db — import validation", () => {
-  let db: import("better-sqlite3").Database;
-  let expectedSynsetCount: number;
-  let expectedCefrVoteCount: number;
-
-  beforeAll(async () => {
-    if (!(await dbExists())) return;
-
-    const Database = (await import("better-sqlite3")).default;
-    db = new Database(DB_PATH, { readonly: true });
-    db.pragma("foreign_keys = ON");
-
-    // Count expected synsets from omw.json
-    const omwRaw = await fs.readFile(OMW_PATH, "utf-8");
-    const omwRecords = JSON.parse(omwRaw) as unknown[];
-    expectedSynsetCount = omwRecords.length;
-
-    // Count expected CEFR votes from stage 2 annotated files.
-    // Merge all language files the same way the import script does —
-    // use en.json as base and merge votes from the other language files.
-    const byId = new Map<string, AnnotatedRecord>();
-
-    const baseRaw = await fs.readFile(
-      path.join(ANNOTATED_DIR, "en.json"),
-      "utf-8",
-    );
-    const base = JSON.parse(baseRaw) as AnnotatedRecord[];
-    for (const record of base) {
-      byId.set(record.source_id, record);
-    }
-
-    for (const lang of SUPPORTED_LANGUAGE_CODES) {
-      if (lang === "en") continue;
-      const raw = await fs.readFile(
-        path.join(ANNOTATED_DIR, `${lang}.json`),
-        "utf-8",
-      );
-      const records = JSON.parse(raw) as AnnotatedRecord[];
-      for (const record of records) {
-        const base = byId.get(record.source_id);
-        if (!base) continue;
-        for (const [l, langVotes] of Object.entries(record.votes)) {
-          if (!base.votes[l as SupportedLanguageCode]) {
-            base.votes[l as SupportedLanguageCode] = {};
-          }
-          Object.assign(base.votes[l as SupportedLanguageCode]!, langVotes);
-        }
-      }
-    }
-
-    expectedCefrVoteCount = 0;
-    for (const record of byId.values()) {
-      for (const langVotes of Object.values(record.votes)) {
-        expectedCefrVoteCount += Object.keys(langVotes ?? {}).length;
-      }
-    }
-  }, 120_000);
-
-  it("pipeline.db exists — skipping all tests if not", async () => {
-    const exists = await dbExists();
-    if (!exists) {
-      console.warn(
-        "\n  pipeline.db not found — run pnpm db:init and pnpm db:import first\n",
-      );
-    }
-    expect(exists).toBe(true);
-  });
-
-  it("synsets count matches omw.json", () => {
-    if (!db) return;
-    const row = db.prepare("SELECT COUNT(*) as count FROM synsets").get() as {
-      count: number;
-    };
-    expect(row.count).toBe(expectedSynsetCount);
-  });
-
-  it("every synset has at least one translation", () => {
-    if (!db) return;
-    const rows = db
-      .prepare(
-        `
-        SELECT s.source_id
-        FROM synsets s
-        LEFT JOIN translations t ON t.source_id = s.source_id
-        WHERE t.id IS NULL
-      `,
-      )
-      .all() as { source_id: string }[];
-
-    const errors = rows.map((r) => `${r.source_id}: no translations`);
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("every translation belongs to a valid synset", () => {
-    if (!db) return;
-    const rows = db
-      .prepare(
-        `
-        SELECT t.id, t.source_id
-        FROM translations t
-        LEFT JOIN synsets s ON s.source_id = t.source_id
-        WHERE s.source_id IS NULL
-      `,
-      )
-      .all() as { id: number; source_id: string }[];
-
-    const errors = rows.map(
-      (r) => `translation ${r.id}: references missing synset ${r.source_id}`,
-    );
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("every cefr_source_vote references a valid translation", () => {
-    if (!db) return;
-    const rows = db
-      .prepare(
-        `
-        SELECT v.id, v.translation_id
-        FROM cefr_source_votes v
-        LEFT JOIN translations t ON t.id = v.translation_id
-        WHERE t.id IS NULL
-      `,
-      )
-      .all() as { id: number; translation_id: number }[];
-
-    const errors = rows.map(
-      (r) =>
-        `cefr_vote ${r.id}: references missing translation ${r.translation_id}`,
-    );
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("cefr_source_votes count matches stage 2 annotated output", () => {
-    if (!db) return;
-    const row = db
-      .prepare("SELECT COUNT(*) as count FROM cefr_source_votes")
-      .get() as { count: number };
-    expect(row.count).toBe(expectedCefrVoteCount);
-  });
-
-  it("every example has a valid source", () => {
-    if (!db) return;
-    const rows = db
-      .prepare(
-        `
-        SELECT source_id, language, source
-        FROM examples
-        WHERE source NOT IN ('omw', 'cefr')
-      `,
-      )
-      .all() as { source_id: string; language: string; source: string }[];
-
-    const errors = rows.map(
-      (r) =>
-        `${r.source_id} (${r.language}): invalid example source "${r.source}"`,
-    );
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("every example belongs to a valid synset", () => {
-    if (!db) return;
-    const rows = db
-      .prepare(
-        `
-        SELECT e.id, e.source_id
-        FROM examples e
-        LEFT JOIN synsets s ON s.source_id = e.source_id
-        WHERE s.source_id IS NULL
-      `,
-      )
-      .all() as { id: number; source_id: string }[];
-
-    const errors = rows.map(
-      (r) => `example ${r.id}: references missing synset ${r.source_id}`,
-    );
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("every gloss belongs to a valid synset", () => {
-    if (!db) return;
-    const rows = db
-      .prepare(
-        `
-        SELECT g.id, g.source_id
-        FROM glosses g
-        LEFT JOIN synsets s ON s.source_id = g.source_id
-        WHERE s.source_id IS NULL
-      `,
-      )
-      .all() as { id: number; source_id: string }[];
-
-    const errors = rows.map(
-      (r) => `gloss ${r.id}: references missing synset ${r.source_id}`,
-    );
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-});
--- a/data-pipeline/tests/validation/stage-1.validation.test.ts
+++ b/data-pipeline/tests/validation/stage-1.validation.test.ts
@ -1,166 +0,0 @@
-import fs from "node:fs/promises";
-import path from "node:path";
-import { describe, it, expect } from "vitest";
-import { SUPPORTED_POS, SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
-import type { SupportedPos, SupportedLanguageCode } from "@lila/shared";
-
-// ── Types ─────────────────────────────────────────────────────────────────────
-
-type OmwRecord = {
-  source_id: string;
-  pos: SupportedPos;
-  translations: Partial<Record<SupportedLanguageCode, string[]>>;
-  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
-  examples: Partial<Record<SupportedLanguageCode, string[]>>;
-};
-
-// ── Paths ─────────────────────────────────────────────────────────────────────
-
-const OMW_PATH = path.resolve("stage-1-extract/output/omw.json");
-
-// ── Helpers ───────────────────────────────────────────────────────────────────
-
-function isValidSourceId(id: string): boolean {
-  return /^ili:i\d+$/.test(id);
-}
-
-// ── Tests ─────────────────────────────────────────────────────────────────────
-
-describe("stage 1 — omw.json validation", () => {
-  let records: OmwRecord[];
-
-  it("file exists and is valid JSON", async () => {
-    const raw = await fs.readFile(OMW_PATH, "utf-8");
-    records = JSON.parse(raw) as OmwRecord[];
-    expect(records).toBeDefined();
-  });
-
-  it("is a non-empty array", async () => {
-    const raw = await fs.readFile(OMW_PATH, "utf-8");
-    records = JSON.parse(raw) as OmwRecord[];
-    expect(Array.isArray(records)).toBe(true);
-    expect(records.length).toBeGreaterThan(0);
-  });
-
-  it("every record has required fields", async () => {
-    const raw = await fs.readFile(OMW_PATH, "utf-8");
-    records = JSON.parse(raw) as OmwRecord[];
-
-    const errors: string[] = [];
-
-    for (const record of records) {
-      if (!record.source_id) {
-        errors.push(`missing source_id`);
-        continue;
-      }
-      if (!record.pos) errors.push(`${record.source_id}: missing pos`);
-      if (!record.translations)
-        errors.push(`${record.source_id}: missing translations`);
-      if (!record.glosses) errors.push(`${record.source_id}: missing glosses`);
-      if (!record.examples)
-        errors.push(`${record.source_id}: missing examples`);
-    }
-
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("every source_id matches ili:i{number} pattern", async () => {
-    const raw = await fs.readFile(OMW_PATH, "utf-8");
-    records = JSON.parse(raw) as OmwRecord[];
-
-    const errors: string[] = [];
-
-    for (const record of records) {
-      if (!isValidSourceId(record.source_id)) {
-        errors.push(`invalid source_id: ${record.source_id}`);
-      }
-    }
-
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("every source_id is unique", async () => {
-    const raw = await fs.readFile(OMW_PATH, "utf-8");
-    records = JSON.parse(raw) as OmwRecord[];
-
-    const seen = new Set<string>();
-    const errors: string[] = [];
-
-    for (const record of records) {
-      if (seen.has(record.source_id)) {
-        errors.push(`duplicate source_id: ${record.source_id}`);
-      }
-      seen.add(record.source_id);
-    }
-
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("every pos is a valid supported value", async () => {
-    const raw = await fs.readFile(OMW_PATH, "utf-8");
-    records = JSON.parse(raw) as OmwRecord[];
-
-    const errors: string[] = [];
-    const validPos = new Set(SUPPORTED_POS);
-
-    for (const record of records) {
-      if (!validPos.has(record.pos)) {
-        errors.push(`${record.source_id}: invalid pos "${record.pos}"`);
-      }
-    }
-
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("every record has at least one translation in at least one language", async () => {
-    const raw = await fs.readFile(OMW_PATH, "utf-8");
-    records = JSON.parse(raw) as OmwRecord[];
-
-    const errors: string[] = [];
-    const validLangs = new Set(SUPPORTED_LANGUAGE_CODES);
-
-    for (const record of records) {
-      const langs = Object.keys(record.translations) as SupportedLanguageCode[];
-
-      if (langs.length === 0) {
-        errors.push(`${record.source_id}: no translations`);
-        continue;
-      }
-
-      for (const lang of langs) {
-        if (!validLangs.has(lang)) {
-          errors.push(`${record.source_id}: unsupported language "${lang}"`);
-        }
-        const words = record.translations[lang] ?? [];
-        if (words.length === 0) {
-          errors.push(`${record.source_id}: empty translations for "${lang}"`);
-        }
-      }
-    }
-
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("no duplicate translations within a single synset and language", async () => {
-    const raw = await fs.readFile(OMW_PATH, "utf-8");
-    const records = JSON.parse(raw) as OmwRecord[];
-
-    const errors: string[] = [];
-
-    for (const record of records) {
-      for (const [lang, words] of Object.entries(record.translations)) {
-        const seen = new Set<string>();
-        for (const word of words) {
-          if (seen.has(word)) {
-            errors.push(
-              `${record.source_id} (${lang}): duplicate translation "${word}"`,
-            );
-          }
-          seen.add(word);
-        }
-      }
-    }
-
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-});
--- a/data-pipeline/tests/validation/stage-2.validation.test.ts
+++ b/data-pipeline/tests/validation/stage-2.validation.test.ts
@ -1,218 +0,0 @@
-import fs from "node:fs/promises";
-import path from "node:path";
-import { describe, it, expect, beforeAll } from "vitest";
-import {
-  SUPPORTED_POS,
-  SUPPORTED_LANGUAGE_CODES,
-  CEFR_LEVELS,
-} from "@lila/shared";
-import type { SupportedPos, SupportedLanguageCode } from "@lila/shared";
-
-// ── Types ─────────────────────────────────────────────────────────────────────
-
-type Example = { text: string; source: "omw" | "cefr" };
-
-type AnnotatedRecord = {
-  source_id: string;
-  pos: SupportedPos;
-  translations: Partial<Record<SupportedLanguageCode, string[]>>;
-  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
-  examples: Partial<Record<SupportedLanguageCode, Example[]>>;
-  votes: Partial<
-    Record<SupportedLanguageCode, Record<string, { cefr_source: string }>>
-  >;
-};
-
-type ConflictEntry = {
-  word: string;
-  pos: string;
-  language: SupportedLanguageCode;
-  levels: string[];
-};
-
-// ── Paths ─────────────────────────────────────────────────────────────────────
-
-const OUTPUT_DIR = path.resolve("stage-2-annotate/output");
-
-// ── Tests ─────────────────────────────────────────────────────────────────────
-
-describe("stage 2 — annotated output validation", () => {
-  const recordsByLang = new Map<SupportedLanguageCode, AnnotatedRecord[]>();
-  let conflicts: ConflictEntry[] = [];
-
-  beforeAll(async () => {
-    for (const lang of SUPPORTED_LANGUAGE_CODES) {
-      const raw = await fs.readFile(
-        path.join(OUTPUT_DIR, `${lang}.json`),
-        "utf-8",
-      );
-      recordsByLang.set(lang, JSON.parse(raw) as AnnotatedRecord[]);
-    }
-    const raw = await fs.readFile(
-      path.join(OUTPUT_DIR, "conflicts.json"),
-      "utf-8",
-    );
-    conflicts = JSON.parse(raw) as ConflictEntry[];
-  }, 60_000);
-
-  it("all five language files exist", async () => {
-    const errors: string[] = [];
-
-    for (const lang of SUPPORTED_LANGUAGE_CODES) {
-      const filePath = path.join(OUTPUT_DIR, `${lang}.json`);
-      try {
-        await fs.access(filePath);
-      } catch {
-        errors.push(`missing file: ${lang}.json`);
-      }
-    }
-
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("conflicts.json exists", async () => {
-    const filePath = path.join(OUTPUT_DIR, "conflicts.json");
-    await expect(fs.access(filePath)).resolves.toBeUndefined();
-  });
-
-  it("every language file is a non-empty array", () => {
-    const errors: string[] = [];
-
-    for (const lang of SUPPORTED_LANGUAGE_CODES) {
-      const records = recordsByLang.get(lang)!;
-      if (!Array.isArray(records)) {
-        errors.push(`${lang}.json: not an array`);
-      } else if (records.length === 0) {
-        errors.push(`${lang}.json: empty array`);
-      }
-    }
-
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("every record has required fields", () => {
-    const errors: string[] = [];
-
-    for (const lang of SUPPORTED_LANGUAGE_CODES) {
-      const records = recordsByLang.get(lang)!;
-
-      for (const record of records) {
-        if (!record.source_id) {
-          errors.push(`${lang}: record missing source_id`);
-          continue;
-        }
-        if (!record.pos)
-          errors.push(`${lang} ${record.source_id}: missing pos`);
-        if (!record.translations)
-          errors.push(`${lang} ${record.source_id}: missing translations`);
-        if (!record.glosses)
-          errors.push(`${lang} ${record.source_id}: missing glosses`);
-        if (record.examples === undefined)
-          errors.push(`${lang} ${record.source_id}: missing examples`);
-        if (record.votes === undefined)
-          errors.push(`${lang} ${record.source_id}: missing votes`);
-      }
-    }
-
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("every pos is a valid supported value", () => {
-    const errors: string[] = [];
-    const validPos = new Set(SUPPORTED_POS);
-
-    for (const lang of SUPPORTED_LANGUAGE_CODES) {
-      const records = recordsByLang.get(lang)!;
-
-      for (const record of records) {
-        if (!validPos.has(record.pos)) {
-          errors.push(
-            `${lang} ${record.source_id}: invalid pos "${record.pos}"`,
-          );
-        }
-      }
-    }
-
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("every example has text and a valid source", () => {
-    const errors: string[] = [];
-    const validSources = new Set(["omw", "cefr"]);
-
-    for (const lang of SUPPORTED_LANGUAGE_CODES) {
-      const records = recordsByLang.get(lang)!;
-
-      for (const record of records) {
-        for (const [l, examples] of Object.entries(record.examples)) {
-          for (const example of examples) {
-            if (!example.text) {
-              errors.push(
-                `${lang} ${record.source_id} (${l}): example missing text`,
-              );
-            }
-            if (!validSources.has(example.source)) {
-              errors.push(
-                `${lang} ${record.source_id} (${l}): invalid example source "${example.source}"`,
-              );
-            }
-          }
-        }
-      }
-    }
-
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("every cefr_source vote is a valid CEFR level", () => {
-    const errors: string[] = [];
-    const validLevels = new Set(CEFR_LEVELS);
-
-    for (const lang of SUPPORTED_LANGUAGE_CODES) {
-      const records = recordsByLang.get(lang)!;
-
-      for (const record of records) {
-        for (const [l, langVotes] of Object.entries(record.votes)) {
-          for (const [word, vote] of Object.entries(langVotes ?? {})) {
-            if (
-              !validLevels.has(vote.cefr_source as (typeof CEFR_LEVELS)[number])
-            ) {
-              errors.push(
-                `${lang} ${record.source_id} (${l} — "${word}"): invalid cefr_source "${vote.cefr_source}"`,
-              );
-            }
-          }
-        }
-      }
-    }
-
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("conflicts.json entries have required fields and valid CEFR levels", () => {
-    const errors: string[] = [];
-    const validLevels = new Set(CEFR_LEVELS);
-    const validLangs = new Set(SUPPORTED_LANGUAGE_CODES);
-
-    for (const entry of conflicts) {
-      if (!entry.word) errors.push(`conflict missing word`);
-      if (!entry.pos) errors.push(`conflict missing pos`);
-      if (!entry.language) {
-        errors.push(`conflict missing language`);
-      } else if (!validLangs.has(entry.language)) {
-        errors.push(`conflict invalid language "${entry.language}"`);
-      }
-      if (!Array.isArray(entry.levels) || entry.levels.length < 2) {
-        errors.push(`${entry.word}: levels must have at least 2 entries`);
-      } else {
-        for (const level of entry.levels) {
-          if (!validLevels.has(level as (typeof CEFR_LEVELS)[number])) {
-            errors.push(`${entry.word}: invalid level "${level}"`);
-          }
-        }
-      }
-    }
-
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-});