import fs from "node:fs/promises";
import path from "node:path";
import { fileURLToPath } from "node:url";

import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";

import { openDb } from "./index.js";

// ── Types ─────────────────────────────────────────────────────────────────────

/** A usage example together with the source it came from. */
type Example = { text: string; source: "omw" | "cefr" };

/**
 * A single CEFR vote attached to a word.
 * NOTE(review): only `cefr_source` is read by this file; the field type is
 * assumed to be a string — confirm against the stage 2 output schema.
 */
type CefrVote = { cefr_source: string };

/**
 * One synset as written by the stage 2 annotation step.
 * NOTE(review): the generic arguments below were reconstructed from how each
 * field is used in this file — confirm against the stage 2 output schema.
 */
type AnnotatedRecord = {
  source_id: string;
  pos: SupportedPos;
  translations: Partial<Record<SupportedLanguageCode, string[]>>;
  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
  examples: Partial<Record<SupportedLanguageCode, Example[]>>;
  votes: Partial<Record<SupportedLanguageCode, Record<string, CefrVote>>>;
};

// ── Paths ─────────────────────────────────────────────────────────────────────

const __dirname = path.dirname(fileURLToPath(import.meta.url));

const PATHS = {
  annotatedDir: path.resolve(__dirname, "../stage-2-annotate/output"),
};

// ── Loading ───────────────────────────────────────────────────────────────────

/**
 * Reads and parses `<lang>.json` from the stage 2 output directory.
 *
 * @param lang Language code used as the file's base name.
 * @returns The parsed records. Individual records are not validated.
 * @throws If the file cannot be read, is not valid JSON, or is not a JSON array.
 */
async function readAnnotatedFile(lang: string): Promise<AnnotatedRecord[]> {
  const file = path.join(PATHS.annotatedDir, `${lang}.json`);
  const parsed: unknown = JSON.parse(await fs.readFile(file, "utf-8"));
  if (!Array.isArray(parsed)) {
    throw new Error(`Expected a JSON array of annotated records in ${file}`);
  }
  return parsed as AnnotatedRecord[];
}

/**
 * Copies every per-language vote map of `incoming` into `target`.
 * A word that already has a vote in `target` is overwritten by the incoming one.
 */
function mergeVotes(target: AnnotatedRecord, incoming: AnnotatedRecord): void {
  for (const [key, langVotes] of Object.entries(incoming.votes)) {
    const lang = key as SupportedLanguageCode;
    target.votes[lang] = Object.assign(target.votes[lang] ?? {}, langVotes);
  }
}

/**
 * Appends the CEFR-sourced examples of `incoming` to `target`, skipping any
 * CEFR example whose text `target` already holds for the same language.
 * Examples from other sources are never copied.
 */
function mergeCefrExamples(
  target: AnnotatedRecord,
  incoming: AnnotatedRecord,
): void {
  for (const [key, examples] of Object.entries(incoming.examples)) {
    const lang = key as SupportedLanguageCode;
    const existing = target.examples[lang] ?? [];

    // Only exact duplicates (same text, both CEFR-sourced) are dropped.
    const seenCefrTexts = new Set(
      existing.filter((e) => e.source === "cefr").map((e) => e.text),
    );

    const fresh: Example[] = [];
    for (const example of examples) {
      if (example.source !== "cefr" || seenCefrTexts.has(example.text)) continue;
      seenCefrTexts.add(example.text);
      fresh.push(example);
    }
    if (fresh.length === 0) continue;

    existing.push(...fresh);
    target.examples[lang] = existing;
  }
}

/**
 * Builds the merged record list for import.
 *
 * Uses en.json as the base — it has the most complete glosses and examples —
 * then merges votes and CEFR examples from the other language files. Records
 * in other files whose `source_id` is not present in en.json are ignored.
 *
 * @returns One record per `source_id` found in en.json.
 */
async function loadAnnotated(): Promise<AnnotatedRecord[]> {
  const byId = new Map<string, AnnotatedRecord>();
  for (const record of await readAnnotatedFile("en")) {
    byId.set(record.source_id, record);
  }

  for (const fileLang of SUPPORTED_LANGUAGE_CODES) {
    if (fileLang === "en") continue;

    for (const record of await readAnnotatedFile(fileLang)) {
      const target = byId.get(record.source_id);
      if (!target) continue;

      mergeVotes(target, record);
      mergeCefrExamples(target, record);
    }
  }

  return [...byId.values()];
}

// ── Import ────────────────────────────────────────────────────────────────────

/**
 * Loads the stage 2 annotated files and inserts synsets, translations,
 * glosses, examples and CEFR source votes inside a single transaction,
 * then logs the number of rows written per table.
 *
 * The database handle is closed even when loading succeeded but an insert
 * fails.
 */
export async function importStage2(): Promise<void> {
  console.log("Loading stage 2 annotated files...");
  const records = await loadAnnotated();
  console.log(` Loaded ${records.length.toLocaleString()} synsets`);

  const db = openDb();
  try {
    const insertSynset = db.prepare(
      `INSERT INTO synsets (source_id, pos) VALUES (?, ?)`,
    );
    const insertTranslation = db.prepare(
      `INSERT INTO translations (source_id, language, word) VALUES (?, ?, ?)`,
    );
    const insertGloss = db.prepare(
      `INSERT INTO glosses (source_id, language, text) VALUES (?, ?, ?)`,
    );
    const insertExample = db.prepare(
      `INSERT INTO examples (source_id, language, text, source) VALUES (?, ?, ?, ?)`,
    );
    // NOTE(review): the subselect yields NULL when a voted word has no matching
    // translation row; whether that is rejected depends on the table schema,
    // which is not visible here — confirm.
    const insertCefrVote = db.prepare(`
      INSERT INTO cefr_source_votes (translation_id, cefr_level)
      VALUES (
        (SELECT id FROM translations WHERE source_id = ? AND language = ? AND word = ?),
        ?
      )
    `);

    console.log("\nImporting into pipeline.db...");

    const importAll = db.transaction(() => {
      let synsets = 0;
      let translations = 0;
      let glosses = 0;
      let examples = 0;
      let cefrVotes = 0;

      for (const record of records) {
        insertSynset.run(record.source_id, record.pos);
        synsets++;

        // Translations — each distinct word is inserted once per language.
        for (const [lang, words] of Object.entries(record.translations)) {
          for (const word of new Set(words)) {
            insertTranslation.run(record.source_id, lang, word);
            translations++;
          }
        }

        // Glosses
        for (const [lang, glossList] of Object.entries(record.glosses)) {
          for (const text of glossList) {
            insertGloss.run(record.source_id, lang, text);
            glosses++;
          }
        }

        // Examples
        for (const [lang, exList] of Object.entries(record.examples)) {
          for (const example of exList) {
            insertExample.run(
              record.source_id,
              lang,
              example.text,
              example.source,
            );
            examples++;
          }
        }

        // CEFR source votes
        for (const [lang, langVotes] of Object.entries(record.votes)) {
          for (const [word, vote] of Object.entries(langVotes)) {
            insertCefrVote.run(record.source_id, lang, word, vote.cefr_source);
            cefrVotes++;
          }
        }
      }

      return { synsets, translations, glosses, examples, cefrVotes };
    });

    const counts = importAll();

    console.log(` synsets: ${counts.synsets.toLocaleString()}`);
    console.log(` translations: ${counts.translations.toLocaleString()}`);
    console.log(` glosses: ${counts.glosses.toLocaleString()}`);
    console.log(` examples: ${counts.examples.toLocaleString()}`);
    console.log(` cefr votes: ${counts.cefrVotes.toLocaleString()}`);
  } finally {
    db.close();
  }

  console.log("\nImport complete.");
}

// ── Check if already imported ─────────────────────────────────────────────────

/**
 * Opens the database, counts the rows in `synsets`, and closes the handle
 * again — also when the query throws.
 */
function countSynsets(): number {
  const db = openDb();
  try {
    const row = db.prepare(`SELECT COUNT(*) as count FROM synsets`).get() as {
      count: number;
    };
    return row.count;
  } finally {
    db.close();
  }
}

/** Returns `true` when the `synsets` table already contains at least one row. */
export function isImported(): boolean {
  return countSynsets() > 0;
}

// ── Main ──────────────────────────────────────────────────────────────────────

/**
 * Script entry point: skips the import (exit code 0) when synsets are already
 * present, otherwise runs `importStage2`.
 */
async function main(): Promise<void> {
  const existing = countSynsets();

  if (existing > 0) {
    console.log(
      `pipeline.db already contains ${existing.toLocaleString()} synsets — skipping import.`,
    );
    console.log("Delete pipeline.db and re-run db:init to start fresh.");
    process.exit(0);
  }

  await importStage2();
}

// NOTE(review): this runs whenever the module is loaded, including when it is
// imported only for `importStage2` / `isImported` — confirm that is intended.
main().catch((err) => {
  console.error(err);
  process.exit(1);
});