import fs from "node:fs/promises";
import path from "node:path";
import { fileURLToPath } from "node:url";
import { openDb } from "./index.js";
import type { ExtractedSense } from "../stage-1-extract/scripts/extract.js";

// ── Paths ─────────────────────────────────────────────────────────────────────

const __dirname = path.dirname(fileURLToPath(import.meta.url));

const PATHS = {
  extracted: path.resolve(__dirname, "../stage-1-extract/output/en.json"),
};

// ── Import ────────────────────────────────────────────────────────────────────

/**
 * Loads the extracted senses from `PATHS.extracted` and writes them into the
 * `entries` and `translations` tables inside a single transaction.
 *
 * Entries are upserted on (headword, language, pos, sense_index); translations
 * that already exist for an entry are left untouched.
 *
 * @throws If the file cannot be read, is not valid JSON, does not contain a
 *   top-level array, or if any database statement fails. The database handle
 *   is closed in every case.
 */
export async function importKaikki(): Promise<void> {
  console.log("Loading extracted Kaikki data...");
  const raw = await fs.readFile(PATHS.extracted, "utf-8");
  const parsed: unknown = JSON.parse(raw);
  if (!Array.isArray(parsed)) {
    throw new Error(`Expected a JSON array of senses in ${PATHS.extracted}`);
  }
  // Only the top-level shape is checked; individual elements are trusted to
  // match ExtractedSense as produced by the extract stage.
  const senses = parsed as ExtractedSense[];
  console.log(` Loaded ${senses.length.toLocaleString()} senses`);

  const db = openDb();
  try {
    const insertEntry = db.prepare(`
      INSERT INTO entries (headword, language, pos, sense_index, gloss, examples)
      VALUES (?, ?, ?, ?, ?, ?)
      ON CONFLICT (headword, language, pos, sense_index) DO UPDATE SET
        gloss = excluded.gloss,
        examples = excluded.examples
      RETURNING id
    `);

    const insertTranslation = db.prepare(`
      INSERT INTO translations (entry_id, target_lang, word, sense_hint)
      VALUES (?, ?, ?, ?)
      ON CONFLICT (entry_id, target_lang, word) DO NOTHING
    `);

    // Next free sense_index per (headword, pos). The index is a running
    // counter assigned here in input order, so the same word appearing
    // several times with the same POS never collides on the unique key.
    const senseIndexMap = new Map<string, number>();

    console.log("\nImporting into pipeline.db...");

    const importAll = db.transaction(() => {
      let entries = 0;
      let translations = 0;
      let skipped = 0;

      for (const sense of senses) {
        const key = `${sense.headword}|${sense.pos}`;
        const senseIndex = senseIndexMap.get(key) ?? 0;
        senseIndexMap.set(key, senseIndex + 1);

        const row = insertEntry.get(
          sense.headword,
          "en",
          sense.pos,
          senseIndex,
          sense.gloss ?? null,
          JSON.stringify(sense.examples),
        ) as { id: number } | undefined;

        // No row back means there is no entry id to attach translations to.
        if (!row) {
          skipped++;
          continue;
        }
        entries++;

        for (const t of sense.translations) {
          insertTranslation.run(
            row.id,
            t.target_lang,
            t.word,
            t.sense_hint ?? null,
          );
          // Counts attempted inserts, including ones ignored by DO NOTHING.
          translations++;
        }
      }

      return { entries, translations, skipped };
    });

    const counts = importAll();
    console.log(` entries: ${counts.entries.toLocaleString()}`);
    console.log(` translations: ${counts.translations.toLocaleString()}`);
    console.log(` skipped: ${counts.skipped.toLocaleString()}`);
  } finally {
    // Release the handle even when a statement or the transaction throws.
    db.close();
  }

  console.log("\nImport complete.");
}

// ── Check if already imported ─────────────────────────────────────────────────

/**
 * Counts the rows in the `entries` table, opening a database handle for the
 * query and closing it again whether or not the query succeeds.
 */
function countEntries(): number {
  const db = openDb();
  try {
    const row = db.prepare("SELECT COUNT(*) as count FROM entries").get() as {
      count: number;
    };
    return row.count;
  } finally {
    db.close();
  }
}

/**
 * Reports whether the `entries` table already holds at least one row.
 *
 * @returns `true` when the table is non-empty.
 */
export function isImported(): boolean {
  return countEntries() > 0;
}

// ── Main ─────────────────────────────────────────────────────────────────────

/**
 * CLI entry point: runs the import unless the `entries` table is already
 * populated, in which case it prints a hint and exits with status 0.
 */
async function main(): Promise<void> {
  const existing = countEntries();

  if (existing > 0) {
    console.log(
      `pipeline.db already contains ${existing.toLocaleString()} entries — skipping import.`,
    );
    console.log("Delete pipeline.db and re-run db:init to start fresh.");
    process.exit(0);
  }

  await importKaikki();
}

// Run only when this file is the script being executed. Filesystem paths are
// compared rather than URL strings, so paths that need percent-encoding
// (spaces, non-ASCII) and Windows drive paths match correctly.
const invokedPath = process.argv[1];
if (
  invokedPath !== undefined &&
  fileURLToPath(import.meta.url) === path.resolve(invokedPath)
) {
  main().catch((err) => {
    console.error(err);
    process.exit(1);
  });
}