import fs from "node:fs/promises";
import path from "node:path";
import { fileURLToPath } from "node:url";

import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";

import { openDb } from "./index.js";
import type { ExtractedSense } from "../stage-1-extract/scripts/extract.js";

// ── Paths ─────────────────────────────────────────────────────────────────────

const __dirname = path.dirname(fileURLToPath(import.meta.url));
const OUTPUT_DIR = path.resolve(__dirname, "../stage-1-extract/output");

// ── Helpers ───────────────────────────────────────────────────────────────────

/** Renders an unknown thrown value as a short message for log output. */
function describeError(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/**
 * Loads the extracted senses for one language from `<OUTPUT_DIR>/<lang>.json`.
 *
 * The import is best-effort per language: any failure is logged as a warning
 * and reported to the caller as `undefined` so the language can be skipped.
 * A missing file, an unreadable file, and an invalid payload each get their
 * own message so a corrupt file is not mistaken for an absent one.
 *
 * @param lang Language code used to build the file name.
 * @returns The parsed senses, or `undefined` when the language must be skipped.
 */
async function readSenses(lang: string): Promise<ExtractedSense[] | undefined> {
  const filePath = path.join(OUTPUT_DIR, `${lang}.json`);

  let raw: string;
  try {
    raw = await fs.readFile(filePath, "utf-8");
  } catch (err: unknown) {
    const code = (err as { code?: unknown } | null | undefined)?.code;
    if (code === "ENOENT") {
      console.warn(` Warning: no output file found for ${lang}, skipping`);
    } else {
      console.warn(
        ` Warning: could not read ${filePath} (${describeError(err)}), skipping`,
      );
    }
    return undefined;
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch (err: unknown) {
    console.warn(
      ` Warning: ${filePath} is not valid JSON (${describeError(err)}), skipping`,
    );
    return undefined;
  }

  if (!Array.isArray(parsed)) {
    console.warn(
      ` Warning: ${filePath} does not contain a JSON array, skipping`,
    );
    return undefined;
  }

  // Element shape is trusted to match the stage-1 extractor's output;
  // only the top-level array-ness is verified here.
  return parsed as ExtractedSense[];
}

/**
 * Returns the number of rows currently in the `entries` table.
 *
 * Opens its own connection via `openDb()` and always closes it, even if the
 * query throws.
 */
function countEntries(): number {
  const db = openDb();
  try {
    const row = db.prepare("SELECT COUNT(*) as count FROM entries").get() as {
      count: number;
    };
    return row.count;
  } finally {
    db.close();
  }
}

// ── Import ────────────────────────────────────────────────────────────────────

/**
 * Imports the stage-1 extraction output into the database returned by
 * `openDb()`.
 *
 * For every code in `SUPPORTED_LANGUAGE_CODES` the matching JSON file is read
 * and each sense is upserted into `entries` (on a key conflict the gloss and
 * examples are overwritten), followed by its translations (duplicates are
 * ignored). Each language is written through its own `db.transaction(...)`
 * call. Languages whose file cannot be loaded are skipped with a warning.
 *
 * Per-language and overall counts are printed to stdout. The connection is
 * closed on both success and failure.
 */
export async function importKaikki(): Promise<void> {
  const db = openDb();

  let totalEntries = 0;
  let totalTranslations = 0;
  let totalSkipped = 0;

  try {
    const insertEntry = db.prepare(`
      INSERT INTO entries (headword, language, pos, sense_index, gloss, examples)
      VALUES (?, ?, ?, ?, ?, ?)
      ON CONFLICT (headword, language, pos, sense_index) DO UPDATE SET
        gloss = excluded.gloss,
        examples = excluded.examples
      RETURNING id
    `);

    const insertTranslation = db.prepare(`
      INSERT INTO translations (entry_id, target_lang, word, sense_hint)
      VALUES (?, ?, ?, ?)
      ON CONFLICT (entry_id, target_lang, word) DO NOTHING
    `);

    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      const senses = await readSenses(lang);
      if (senses === undefined) {
        continue;
      }

      console.log(
        ` Importing ${lang}: ${senses.length.toLocaleString()} senses...`,
      );

      // Track next available sense_index per (headword, pos) to handle
      // the same word appearing in multiple JSONL entries with the same POS.
      const senseIndexMap = new Map<string, number>();

      const importLang = db.transaction(() => {
        let entries = 0;
        let translations = 0;
        let skipped = 0;

        for (const sense of senses) {
          const key = `${sense.headword}|${sense.pos}`;
          const nextIndex = senseIndexMap.get(key) ?? 0;
          senseIndexMap.set(key, nextIndex + 1);

          const row = insertEntry.get(
            sense.headword,
            sense.language,
            sense.pos,
            nextIndex,
            sense.gloss ?? null,
            JSON.stringify(sense.examples),
          ) as { id: number } | undefined;

          // No row came back from RETURNING: count the sense as skipped and
          // do not attempt to attach translations to a missing entry id.
          if (!row) {
            skipped++;
            continue;
          }
          entries++;

          for (const t of sense.translations) {
            insertTranslation.run(
              row.id,
              t.target_lang,
              t.word,
              t.sense_hint ?? null,
            );
            // Counts attempted inserts; rows ignored by DO NOTHING are included.
            translations++;
          }
        }

        return { entries, translations, skipped };
      });

      const counts = importLang();
      totalEntries += counts.entries;
      totalTranslations += counts.translations;
      totalSkipped += counts.skipped;

      console.log(
        ` entries: ${counts.entries.toLocaleString()}, translations: ${counts.translations.toLocaleString()}, skipped: ${counts.skipped.toLocaleString()}`,
      );
    }
  } finally {
    // Release the connection even when a prepare or transaction throws.
    db.close();
  }

  console.log(`\nImport complete:`);
  console.log(` Total entries: ${totalEntries.toLocaleString()}`);
  console.log(` Total translations: ${totalTranslations.toLocaleString()}`);
  console.log(` Total skipped: ${totalSkipped.toLocaleString()}`);
}

// ── Check if already imported ─────────────────────────────────────────────────

/**
 * Reports whether the `entries` table already holds at least one row.
 *
 * @returns `true` when the table is non-empty.
 */
export function isImported(): boolean {
  return countEntries() > 0;
}

// ── Main ─────────────────────────────────────────────────────────────────────

/**
 * CLI entry point: runs the import unless the `entries` table is already
 * populated, in which case it prints a hint and exits with status 0.
 */
async function main(): Promise<void> {
  const existing = countEntries();

  if (existing > 0) {
    console.log(
      `pipeline.db already contains ${existing.toLocaleString()} entries — skipping import.`,
    );
    console.log("Delete pipeline.db and re-run db:init to start fresh.");
    process.exit(0);
  }

  console.log("Importing Kaikki data into pipeline.db...");
  await importKaikki();
}

/**
 * True when this module is the script Node was launched with.
 *
 * Compares filesystem paths rather than concatenating `file://` with
 * `process.argv[1]`, which never matches on Windows or when the path contains
 * characters that are percent-encoded in `import.meta.url` (e.g. spaces).
 */
function isMainModule(): boolean {
  const invokedScript = process.argv[1];
  return (
    invokedScript !== undefined &&
    fileURLToPath(import.meta.url) === path.resolve(invokedScript)
  );
}

if (isMainModule()) {
  main().catch((err: unknown) => {
    console.error(err);
    process.exit(1);
  });
}