feat: update extractor for all 5 languages, update import for multi-language

- extract.ts now processes all 5 language files, filters non-English entries by lang_code, and skips translation extraction for non-English languages (their source files contain no translations)
- import.ts now imports all 5 language output files and uses the language field from ExtractedSense instead of hardcoding "en"
- Sample limit is hardcoded to 500 entries per language for development
2026-05-05 18:46:32 +02:00 · 2026-05-05 18:46:32 +02:00 · 0cc643e308
commit 0cc643e308
parent 209d52f54b
3 changed files with 173 additions and 107 deletions
--- a/data-pipeline/db/import.ts
+++ b/data-pipeline/db/import.ts
@ -1,6 +1,7 @@
 import fs from "node:fs/promises";
 import path from "node:path";
 import { fileURLToPath } from "node:url";
+import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
 import { openDb } from "./index.js";
 import type { ExtractedSense } from "../stage-1-extract/scripts/extract.js";

@ -8,18 +9,11 @@ import type { ExtractedSense } from "../stage-1-extract/scripts/extract.js";

 const __dirname = path.dirname(fileURLToPath(import.meta.url));

-const PATHS = {
-  extracted: path.resolve(__dirname, "../stage-1-extract/output/en.json"),
-};
+const OUTPUT_DIR = path.resolve(__dirname, "../stage-1-extract/output");

 // ── Import ────────────────────────────────────────────────────────────────────

 export async function importKaikki(): Promise<void> {
-  console.log("Loading extracted Kaikki data...");
-  const raw = await fs.readFile(PATHS.extracted, "utf-8");
-  const senses = JSON.parse(raw) as ExtractedSense[];
-  console.log(`  Loaded ${senses.length.toLocaleString()} senses`);
-
  const db = openDb();

  const insertEntry = db.prepare(`
@ -38,64 +32,86 @@ export async function importKaikki(): Promise<void> {
    ON CONFLICT (entry_id, target_lang, word) DO NOTHING
  `);

-  // Track next available sense_index per (headword, pos) to handle
-  // the same word appearing in multiple JSONL entries with the same POS.
-  const senseIndexMap = new Map<string, number>();
+  let totalEntries = 0;
+  let totalTranslations = 0;
+  let totalSkipped = 0;

-  console.log("\nImporting into pipeline.db...");
+  for (const lang of SUPPORTED_LANGUAGE_CODES) {
+    const filePath = path.join(OUTPUT_DIR, `${lang}.json`);

-  const importAll = db.transaction(() => {
-    let entries = 0;
-    let translations = 0;
-    let skipped = 0;
-
-    for (const sense of senses) {
-      const key = `${sense.headword}|${sense.pos}`;
-      const nextIndex = senseIndexMap.get(key) ?? 0;
-
-      // Use the offset sense_index to avoid collisions when the same word
-      // appears in multiple JSONL entries with the same POS.
-      const senseIndex = nextIndex;
-      senseIndexMap.set(key, nextIndex + 1);
-
-      const row = insertEntry.get(
-        sense.headword,
-        "en",
-        sense.pos,
-        senseIndex,
-        sense.gloss ?? null,
-        JSON.stringify(sense.examples),
-      ) as { id: number } | undefined;
-
-      if (!row) {
-        skipped++;
-        continue;
-      }
-
-      entries++;
-
-      for (const t of sense.translations) {
-        insertTranslation.run(
-          row.id,
-          t.target_lang,
-          t.word,
-          t.sense_hint ?? null,
-        );
-        translations++;
-      }
+    let senses: ExtractedSense[];
+    try {
+      const raw = await fs.readFile(filePath, "utf-8");
+      senses = JSON.parse(raw) as ExtractedSense[];
+    } catch {
+      console.warn(`  Warning: no output file found for ${lang}, skipping`);
+      continue;
    }

-    return { entries, translations, skipped };
-  });
+    console.log(
+      `  Importing ${lang}: ${senses.length.toLocaleString()} senses...`,
+    );

-  const counts = importAll();
+    // Track next available sense_index per (headword, pos) to handle
+    // the same word appearing in multiple JSONL entries with the same POS.
+    const senseIndexMap = new Map<string, number>();

-  console.log(`  entries:      ${counts.entries.toLocaleString()}`);
-  console.log(`  translations: ${counts.translations.toLocaleString()}`);
-  console.log(`  skipped:      ${counts.skipped.toLocaleString()}`);
+    const importLang = db.transaction(() => {
+      let entries = 0;
+      let translations = 0;
+      let skipped = 0;
+
+      for (const sense of senses) {
+        const key = `${sense.headword}|${sense.pos}`;
+        const nextIndex = senseIndexMap.get(key) ?? 0;
+        senseIndexMap.set(key, nextIndex + 1);
+
+        const row = insertEntry.get(
+          sense.headword,
+          sense.language,
+          sense.pos,
+          nextIndex,
+          sense.gloss ?? null,
+          JSON.stringify(sense.examples),
+        ) as { id: number } | undefined;
+
+        if (!row) {
+          skipped++;
+          continue;
+        }
+
+        entries++;
+
+        for (const t of sense.translations) {
+          insertTranslation.run(
+            row.id,
+            t.target_lang,
+            t.word,
+            t.sense_hint ?? null,
+          );
+          translations++;
+        }
+      }
+
+      return { entries, translations, skipped };
+    });
+
+    const counts = importLang();
+    totalEntries += counts.entries;
+    totalTranslations += counts.translations;
+    totalSkipped += counts.skipped;
+
+    console.log(
+      `    entries: ${counts.entries.toLocaleString()}, translations: ${counts.translations.toLocaleString()}, skipped: ${counts.skipped.toLocaleString()}`,
+    );
+  }

  db.close();
-  console.log("\nImport complete.");
+
+  console.log(`\nImport complete:`);
+  console.log(`  Total entries:      ${totalEntries.toLocaleString()}`);
+  console.log(`  Total translations: ${totalTranslations.toLocaleString()}`);
+  console.log(`  Total skipped:      ${totalSkipped.toLocaleString()}`);
 }

 // ── Check if already imported ─────────────────────────────────────────────────
@ -126,6 +142,7 @@ async function main(): Promise<void> {
    process.exit(0);
  }

+  console.log("Importing Kaikki data into pipeline.db...");
  await importKaikki();
 }