lila/data-pipeline/db/import.ts
lila 0cc643e308 feat: update extractor for all 5 languages, update import for multi-language
- Extract.ts now processes all 5 language files, filters non-English
  entries by lang_code, skips translation extraction for non-English
  (no translations in source files)
- Import.ts now imports all 5 language output files, uses language
  field from ExtractedSense instead of hardcoding en
- Sample limit hardcoded to 500 entries per language for development
2026-05-05 18:46:32 +02:00

154 lines
4.9 KiB
TypeScript

import fs from "node:fs/promises";
import path from "node:path";
import { fileURLToPath } from "node:url";
import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
import { openDb } from "./index.js";
import type { ExtractedSense } from "../stage-1-extract/scripts/extract.js";
// ── Paths ─────────────────────────────────────────────────────────────────────
// ES modules have no built-in __dirname; derive it from this module's URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// Directory the stage-1 extractor writes to; importKaikki reads `<lang>.json`
// files from here. Located relative to this module's own directory.
const OUTPUT_DIR = path.join(__dirname, "..", "stage-1-extract", "output");
// ── Import ────────────────────────────────────────────────────────────────────
/**
 * Imports the stage-1 extraction output of every supported language into the
 * pipeline database.
 *
 * For each code in SUPPORTED_LANGUAGE_CODES the file `<OUTPUT_DIR>/<code>.json`
 * is read and its senses are upserted into `entries`; each sense's
 * translations are inserted into `translations`. One transaction is used per
 * language file. Per-language and overall counts are logged.
 *
 * A language whose output file does not exist is skipped with a warning. Any
 * other failure (unreadable file, invalid JSON, non-array content, database
 * error) is propagated so that a broken input is not mistaken for a missing
 * one.
 *
 * @throws If an output file exists but cannot be read or parsed, if its
 *   content is not a JSON array, or if a database statement fails.
 */
export async function importKaikki(): Promise<void> {
  const db = openDb();
  try {
    const insertEntry = db.prepare(`
      INSERT INTO entries (headword, language, pos, sense_index, gloss, examples)
      VALUES (?, ?, ?, ?, ?, ?)
      ON CONFLICT (headword, language, pos, sense_index)
      DO UPDATE SET
        gloss = excluded.gloss,
        examples = excluded.examples
      RETURNING id
    `);
    const insertTranslation = db.prepare(`
      INSERT INTO translations (entry_id, target_lang, word, sense_hint)
      VALUES (?, ?, ?, ?)
      ON CONFLICT (entry_id, target_lang, word) DO NOTHING
    `);

    let totalEntries = 0;
    let totalTranslations = 0;
    let totalSkipped = 0;

    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      const filePath = path.join(OUTPUT_DIR, `${lang}.json`);

      let raw: string;
      try {
        raw = await fs.readFile(filePath, "utf-8");
      } catch (err: unknown) {
        // Only a genuinely absent file is an expected, skippable condition.
        const code = (err as { code?: unknown } | null | undefined)?.code;
        if (code === "ENOENT") {
          console.warn(` Warning: no output file found for ${lang}, skipping`);
          continue;
        }
        throw err;
      }

      let parsed: unknown;
      try {
        parsed = JSON.parse(raw);
      } catch (err: unknown) {
        const reason = err instanceof Error ? err.message : String(err);
        throw new Error(`Invalid JSON in ${filePath}: ${reason}`);
      }
      if (!Array.isArray(parsed)) {
        throw new Error(`Expected a JSON array of senses in ${filePath}`);
      }
      // Element shape is trusted to match what the extractor emits.
      const senses = parsed as ExtractedSense[];

      console.log(
        ` Importing ${lang}: ${senses.length.toLocaleString()} senses...`,
      );

      // Track next available sense_index per (headword, pos) to handle
      // the same word appearing in multiple JSONL entries with the same POS.
      const senseIndexMap = new Map<string, number>();

      const importLang = db.transaction(() => {
        let entries = 0;
        let translations = 0;
        let skipped = 0;

        for (const sense of senses) {
          const key = `${sense.headword}|${sense.pos}`;
          const nextIndex = senseIndexMap.get(key) ?? 0;
          senseIndexMap.set(key, nextIndex + 1);

          const row = insertEntry.get(
            sense.headword,
            sense.language,
            sense.pos,
            nextIndex,
            sense.gloss ?? null,
            JSON.stringify(sense.examples),
          ) as { id: number } | undefined;

          // No id came back from the upsert: nothing to attach translations to.
          if (!row) {
            skipped++;
            continue;
          }
          entries++;

          for (const t of sense.translations) {
            insertTranslation.run(
              row.id,
              t.target_lang,
              t.word,
              t.sense_hint ?? null,
            );
            // Counts attempted inserts; conflicting rows are ignored by the
            // statement's DO NOTHING clause but are still counted here.
            translations++;
          }
        }

        return { entries, translations, skipped };
      });

      const counts = importLang();
      totalEntries += counts.entries;
      totalTranslations += counts.translations;
      totalSkipped += counts.skipped;

      console.log(
        ` entries: ${counts.entries.toLocaleString()}, translations: ${counts.translations.toLocaleString()}, skipped: ${counts.skipped.toLocaleString()}`,
      );
    }

    console.log(`\nImport complete:`);
    console.log(` Total entries: ${totalEntries.toLocaleString()}`);
    console.log(` Total translations: ${totalTranslations.toLocaleString()}`);
    console.log(` Total skipped: ${totalSkipped.toLocaleString()}`);
  } finally {
    // Release the database handle on both success and failure.
    db.close();
  }
}
// ── Check if already imported ─────────────────────────────────────────────────
/**
 * Reports whether the `entries` table already holds at least one row.
 *
 * Opens its own database handle and always closes it, including when the
 * count query throws.
 *
 * @returns `true` if `entries` contains one or more rows, otherwise `false`.
 */
export function isImported(): boolean {
  const db = openDb();
  try {
    const row = db.prepare("SELECT COUNT(*) as count FROM entries").get() as {
      count: number;
    };
    return row.count > 0;
  } finally {
    db.close();
  }
}
// ── Main ─────────────────────────────────────────────────────────────────────
/**
 * Script entry point: runs the import unless the `entries` table already
 * contains rows, in which case it reports the existing row count and returns
 * without importing.
 */
async function main(): Promise<void> {
  const db = openDb();
  let existing: number;
  try {
    const row = db.prepare("SELECT COUNT(*) as count FROM entries").get() as {
      count: number;
    };
    existing = row.count;
  } finally {
    // Close the probe handle even if the count query throws.
    db.close();
  }

  if (existing > 0) {
    console.log(
      `pipeline.db already contains ${existing.toLocaleString()} entries — skipping import.`,
    );
    console.log("Delete pipeline.db and re-run db:init to start fresh.");
    // Return rather than process.exit(0): exiting immediately can drop
    // output that is still buffered when stdout is a pipe.
    return;
  }

  console.log("Importing Kaikki data into pipeline.db...");
  await importKaikki();
}
// Run main() only when this module is the script Node was launched with.
// Filesystem paths are compared instead of a hand-built `file://` URL, which
// would not match on Windows or when the path contains characters that are
// percent-encoded in URLs (for example spaces).
const invokedScript = process.argv[1];
if (
  invokedScript !== undefined &&
  path.resolve(invokedScript) === fileURLToPath(import.meta.url)
) {
  main().catch((err: unknown) => {
    console.error(err);
    process.exit(1);
  });
}