feat: update extractor for all 5 languages, update import for multi-language support

- Extract.ts now processes all 5 language files, filters non-English
  entries by lang_code, and skips translation extraction for non-English
  languages (their source files contain no translations)
- Import.ts now imports all 5 language output files and uses the
  `language` field from ExtractedSense instead of hardcoding "en"
- Sample limit hardcoded to 500 entries per language for development
This commit is contained in:
lila 2026-05-05 18:46:32 +02:00
parent 209d52f54b
commit 0cc643e308
3 changed files with 173 additions and 107 deletions

View file

@ -1,6 +1,7 @@
import fs from "node:fs/promises";
import path from "node:path";
import { fileURLToPath } from "node:url";
import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
import { openDb } from "./index.js";
import type { ExtractedSense } from "../stage-1-extract/scripts/extract.js";
@ -8,18 +9,11 @@ import type { ExtractedSense } from "../stage-1-extract/scripts/extract.js";
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const PATHS = {
extracted: path.resolve(__dirname, "../stage-1-extract/output/en.json"),
};
const OUTPUT_DIR = path.resolve(__dirname, "../stage-1-extract/output");
// ── Import ────────────────────────────────────────────────────────────────────
export async function importKaikki(): Promise<void> {
console.log("Loading extracted Kaikki data...");
const raw = await fs.readFile(PATHS.extracted, "utf-8");
const senses = JSON.parse(raw) as ExtractedSense[];
console.log(` Loaded ${senses.length.toLocaleString()} senses`);
const db = openDb();
const insertEntry = db.prepare(`
@ -38,64 +32,86 @@ export async function importKaikki(): Promise<void> {
ON CONFLICT (entry_id, target_lang, word) DO NOTHING
`);
// Track next available sense_index per (headword, pos) to handle
// the same word appearing in multiple JSONL entries with the same POS.
const senseIndexMap = new Map<string, number>();
let totalEntries = 0;
let totalTranslations = 0;
let totalSkipped = 0;
console.log("\nImporting into pipeline.db...");
for (const lang of SUPPORTED_LANGUAGE_CODES) {
const filePath = path.join(OUTPUT_DIR, `${lang}.json`);
const importAll = db.transaction(() => {
let entries = 0;
let translations = 0;
let skipped = 0;
for (const sense of senses) {
const key = `${sense.headword}|${sense.pos}`;
const nextIndex = senseIndexMap.get(key) ?? 0;
// Use the offset sense_index to avoid collisions when the same word
// appears in multiple JSONL entries with the same POS.
const senseIndex = nextIndex;
senseIndexMap.set(key, nextIndex + 1);
const row = insertEntry.get(
sense.headword,
"en",
sense.pos,
senseIndex,
sense.gloss ?? null,
JSON.stringify(sense.examples),
) as { id: number } | undefined;
if (!row) {
skipped++;
continue;
}
entries++;
for (const t of sense.translations) {
insertTranslation.run(
row.id,
t.target_lang,
t.word,
t.sense_hint ?? null,
);
translations++;
}
let senses: ExtractedSense[];
try {
const raw = await fs.readFile(filePath, "utf-8");
senses = JSON.parse(raw) as ExtractedSense[];
} catch {
console.warn(` Warning: no output file found for ${lang}, skipping`);
continue;
}
return { entries, translations, skipped };
});
console.log(
` Importing ${lang}: ${senses.length.toLocaleString()} senses...`,
);
const counts = importAll();
// Track next available sense_index per (headword, pos) to handle
// the same word appearing in multiple JSONL entries with the same POS.
const senseIndexMap = new Map<string, number>();
console.log(` entries: ${counts.entries.toLocaleString()}`);
console.log(` translations: ${counts.translations.toLocaleString()}`);
console.log(` skipped: ${counts.skipped.toLocaleString()}`);
const importLang = db.transaction(() => {
let entries = 0;
let translations = 0;
let skipped = 0;
for (const sense of senses) {
const key = `${sense.headword}|${sense.pos}`;
const nextIndex = senseIndexMap.get(key) ?? 0;
senseIndexMap.set(key, nextIndex + 1);
const row = insertEntry.get(
sense.headword,
sense.language,
sense.pos,
nextIndex,
sense.gloss ?? null,
JSON.stringify(sense.examples),
) as { id: number } | undefined;
if (!row) {
skipped++;
continue;
}
entries++;
for (const t of sense.translations) {
insertTranslation.run(
row.id,
t.target_lang,
t.word,
t.sense_hint ?? null,
);
translations++;
}
}
return { entries, translations, skipped };
});
const counts = importLang();
totalEntries += counts.entries;
totalTranslations += counts.translations;
totalSkipped += counts.skipped;
console.log(
` entries: ${counts.entries.toLocaleString()}, translations: ${counts.translations.toLocaleString()}, skipped: ${counts.skipped.toLocaleString()}`,
);
}
db.close();
console.log("\nImport complete.");
console.log(`\nImport complete:`);
console.log(` Total entries: ${totalEntries.toLocaleString()}`);
console.log(` Total translations: ${totalTranslations.toLocaleString()}`);
console.log(` Total skipped: ${totalSkipped.toLocaleString()}`);
}
// ── Check if already imported ─────────────────────────────────────────────────
@ -126,6 +142,7 @@ async function main(): Promise<void> {
process.exit(0);
}
console.log("Importing Kaikki data into pipeline.db...");
await importKaikki();
}