feat: update extractor for all 5 languages, update import for multi-language support

- extract.ts now processes all 5 language files; for non-English files it
  keeps only entries whose lang_code matches the file's language and skips
  translation extraction (those source files contain no translations)
- import.ts now imports all 5 language output files and uses the language
  field from ExtractedSense instead of hardcoding "en"
- Sample limit hardcoded to 500 entries per language for development
This commit is contained in:
lila 2026-05-05 18:46:32 +02:00
parent 209d52f54b
commit 0cc643e308
3 changed files with 173 additions and 107 deletions

View file

@ -1,6 +1,7 @@
import fs from "node:fs/promises"; import fs from "node:fs/promises";
import path from "node:path"; import path from "node:path";
import { fileURLToPath } from "node:url"; import { fileURLToPath } from "node:url";
import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
import { openDb } from "./index.js"; import { openDb } from "./index.js";
import type { ExtractedSense } from "../stage-1-extract/scripts/extract.js"; import type { ExtractedSense } from "../stage-1-extract/scripts/extract.js";
@ -8,18 +9,11 @@ import type { ExtractedSense } from "../stage-1-extract/scripts/extract.js";
const __dirname = path.dirname(fileURLToPath(import.meta.url)); const __dirname = path.dirname(fileURLToPath(import.meta.url));
const PATHS = { const OUTPUT_DIR = path.resolve(__dirname, "../stage-1-extract/output");
extracted: path.resolve(__dirname, "../stage-1-extract/output/en.json"),
};
// ── Import ──────────────────────────────────────────────────────────────────── // ── Import ────────────────────────────────────────────────────────────────────
export async function importKaikki(): Promise<void> { export async function importKaikki(): Promise<void> {
console.log("Loading extracted Kaikki data...");
const raw = await fs.readFile(PATHS.extracted, "utf-8");
const senses = JSON.parse(raw) as ExtractedSense[];
console.log(` Loaded ${senses.length.toLocaleString()} senses`);
const db = openDb(); const db = openDb();
const insertEntry = db.prepare(` const insertEntry = db.prepare(`
@ -38,64 +32,86 @@ export async function importKaikki(): Promise<void> {
ON CONFLICT (entry_id, target_lang, word) DO NOTHING ON CONFLICT (entry_id, target_lang, word) DO NOTHING
`); `);
// Track next available sense_index per (headword, pos) to handle let totalEntries = 0;
// the same word appearing in multiple JSONL entries with the same POS. let totalTranslations = 0;
const senseIndexMap = new Map<string, number>(); let totalSkipped = 0;
console.log("\nImporting into pipeline.db..."); for (const lang of SUPPORTED_LANGUAGE_CODES) {
const filePath = path.join(OUTPUT_DIR, `${lang}.json`);
const importAll = db.transaction(() => { let senses: ExtractedSense[];
let entries = 0; try {
let translations = 0; const raw = await fs.readFile(filePath, "utf-8");
let skipped = 0; senses = JSON.parse(raw) as ExtractedSense[];
} catch {
for (const sense of senses) { console.warn(` Warning: no output file found for ${lang}, skipping`);
const key = `${sense.headword}|${sense.pos}`; continue;
const nextIndex = senseIndexMap.get(key) ?? 0;
// Use the offset sense_index to avoid collisions when the same word
// appears in multiple JSONL entries with the same POS.
const senseIndex = nextIndex;
senseIndexMap.set(key, nextIndex + 1);
const row = insertEntry.get(
sense.headword,
"en",
sense.pos,
senseIndex,
sense.gloss ?? null,
JSON.stringify(sense.examples),
) as { id: number } | undefined;
if (!row) {
skipped++;
continue;
}
entries++;
for (const t of sense.translations) {
insertTranslation.run(
row.id,
t.target_lang,
t.word,
t.sense_hint ?? null,
);
translations++;
}
} }
return { entries, translations, skipped }; console.log(
}); ` Importing ${lang}: ${senses.length.toLocaleString()} senses...`,
);
const counts = importAll(); // Track next available sense_index per (headword, pos) to handle
// the same word appearing in multiple JSONL entries with the same POS.
const senseIndexMap = new Map<string, number>();
console.log(` entries: ${counts.entries.toLocaleString()}`); const importLang = db.transaction(() => {
console.log(` translations: ${counts.translations.toLocaleString()}`); let entries = 0;
console.log(` skipped: ${counts.skipped.toLocaleString()}`); let translations = 0;
let skipped = 0;
for (const sense of senses) {
const key = `${sense.headword}|${sense.pos}`;
const nextIndex = senseIndexMap.get(key) ?? 0;
senseIndexMap.set(key, nextIndex + 1);
const row = insertEntry.get(
sense.headword,
sense.language,
sense.pos,
nextIndex,
sense.gloss ?? null,
JSON.stringify(sense.examples),
) as { id: number } | undefined;
if (!row) {
skipped++;
continue;
}
entries++;
for (const t of sense.translations) {
insertTranslation.run(
row.id,
t.target_lang,
t.word,
t.sense_hint ?? null,
);
translations++;
}
}
return { entries, translations, skipped };
});
const counts = importLang();
totalEntries += counts.entries;
totalTranslations += counts.translations;
totalSkipped += counts.skipped;
console.log(
` entries: ${counts.entries.toLocaleString()}, translations: ${counts.translations.toLocaleString()}, skipped: ${counts.skipped.toLocaleString()}`,
);
}
db.close(); db.close();
console.log("\nImport complete.");
console.log(`\nImport complete:`);
console.log(` Total entries: ${totalEntries.toLocaleString()}`);
console.log(` Total translations: ${totalTranslations.toLocaleString()}`);
console.log(` Total skipped: ${totalSkipped.toLocaleString()}`);
} }
// ── Check if already imported ───────────────────────────────────────────────── // ── Check if already imported ─────────────────────────────────────────────────
@ -126,6 +142,7 @@ async function main(): Promise<void> {
process.exit(0); process.exit(0);
} }
console.log("Importing Kaikki data into pipeline.db...");
await importKaikki(); await importKaikki();
} }

View file

@ -4,6 +4,7 @@
"private": true, "private": true,
"type": "module", "type": "module",
"scripts": { "scripts": {
"extract": "tsx stage-1-extract/scripts/extract.ts",
"db:import": "tsx db/import.ts", "db:import": "tsx db/import.ts",
"db:init": "tsx db/init.ts", "db:init": "tsx db/init.ts",
"annotate": "tsx stage-2-annotate/scripts/annotate.ts", "annotate": "tsx stage-2-annotate/scripts/annotate.ts",

View file

@ -20,10 +20,16 @@ type KaikkiSense = {
translations?: KaikkiTranslation[]; translations?: KaikkiTranslation[];
}; };
type KaikkiEntry = { word?: string; pos?: string; senses?: KaikkiSense[] }; type KaikkiEntry = {
word?: string;
pos?: string;
lang_code?: string;
senses?: KaikkiSense[];
};
export type ExtractedSense = { export type ExtractedSense = {
headword: string; headword: string;
language: SupportedLanguageCode;
pos: SupportedPos; pos: SupportedPos;
sense_index: number; sense_index: number;
gloss: string | null; gloss: string | null;
@ -39,12 +45,15 @@ export type ExtractedSense = {
const __dirname = path.dirname(fileURLToPath(import.meta.url)); const __dirname = path.dirname(fileURLToPath(import.meta.url));
const PATHS = { const SOURCES_DIR = path.resolve(__dirname, "../sources");
source: path.resolve( const OUTPUT_DIR = path.resolve(__dirname, "../output");
__dirname,
"../sources/kaikki.org-dictionary-English.jsonl", const LANG_TO_FILE: Record<SupportedLanguageCode, string> = {
), en: "kaikki.org-dictionary-English.jsonl",
output: path.resolve(__dirname, "../output/en.json"), de: "kaikki.org-dictionary-German.jsonl",
it: "kaikki.org-dictionary-Italian.jsonl",
fr: "kaikki.org-dictionary-French.jsonl",
es: "kaikki.org-dictionary-Spanish.jsonl",
}; };
const POS_MAP: Record<string, SupportedPos> = { const POS_MAP: Record<string, SupportedPos> = {
@ -68,13 +77,15 @@ function isAbbreviation(gloss: string): boolean {
function extractTranslations( function extractTranslations(
sense: KaikkiSense, sense: KaikkiSense,
sourceLang: SupportedLanguageCode,
): ExtractedSense["translations"] { ): ExtractedSense["translations"] {
const seen = new Set<string>(); const seen = new Set<string>();
const result: ExtractedSense["translations"] = []; const result: ExtractedSense["translations"] = [];
for (const t of sense.translations ?? []) { for (const t of sense.translations ?? []) {
const code = t.code ?? t.lang_code; const code = t.code ?? t.lang_code;
if (!code || !SUPPORTED_LANG_SET.has(code) || code === "en") continue; if (!code || !SUPPORTED_LANG_SET.has(code)) continue;
if (code === sourceLang) continue; // skip same-language translations
if (!t.word?.trim()) continue; if (!t.word?.trim()) continue;
const key = `${code}:${t.word.trim()}`; const key = `${code}:${t.word.trim()}`;
@ -97,58 +108,80 @@ function extractExamples(sense: KaikkiSense): string[] {
.filter((t): t is string => !!t); .filter((t): t is string => !!t);
} }
function processEntry(entry: KaikkiEntry): ExtractedSense[] { function processEntry(
entry: KaikkiEntry,
sourceLang: SupportedLanguageCode,
): Omit<ExtractedSense, "sense_index">[] {
const pos = mapPos(entry.pos ?? ""); const pos = mapPos(entry.pos ?? "");
if (!pos) return []; if (!pos) return [];
if (!entry.word?.trim()) return []; if (!entry.word?.trim()) return [];
// For non-English files, only process entries in the target language
const entryLang = (entry as Record<string, unknown>)["lang_code"] as
| string
| undefined;
if (sourceLang !== "en" && entryLang !== sourceLang) return [];
const headword = entry.word.trim(); const headword = entry.word.trim();
const results: ExtractedSense[] = []; const results: Omit<ExtractedSense, "sense_index">[] = [];
let senseIndex = 0;
for (const sense of entry.senses ?? []) { for (const sense of entry.senses ?? []) {
const gloss = sense.glosses?.[0]?.trim() ?? null; const gloss = sense.glosses?.[0]?.trim() ?? null;
// Skip abbreviation senses
if (gloss && isAbbreviation(gloss)) continue; if (gloss && isAbbreviation(gloss)) continue;
const translations = extractTranslations(sense); if (sourceLang === "en") {
// English: require translations in supported languages
// Skip senses with no translations in our supported languages const translations = extractTranslations(sense, sourceLang);
if (translations.length === 0) continue; if (translations.length === 0) continue;
results.push({
results.push({ headword,
headword, language: sourceLang,
pos, pos,
sense_index: senseIndex++, gloss,
gloss, examples: extractExamples(sense),
examples: extractExamples(sense), translations,
translations, });
}); } else {
// Non-English: just extract the entry, no translations needed
results.push({
headword,
language: sourceLang,
pos,
gloss,
examples: extractExamples(sense),
translations: [],
});
}
} }
return results; return results;
} }
// ── Main ────────────────────────────────────────────────────────────────────── // ── Extract ───────────────────────────────────────────────────────────────────
async function extract(sampleLimit?: number): Promise<void> { export async function extract(
console.log("Extracting Kaikki English data..."); lang: SupportedLanguageCode,
console.log(` Source: ${PATHS.source}`); sampleLimit?: number,
): Promise<void> {
const filename = LANG_TO_FILE[lang];
const sourcePath = path.join(SOURCES_DIR, filename);
const outputPath = path.join(OUTPUT_DIR, `${lang}.json`);
if (sampleLimit) { console.log(`\nExtracting ${lang}...`);
console.log(` Sample mode: ${sampleLimit} entries`); console.log(` Source: ${sourcePath}`);
} if (sampleLimit) console.log(` Sample mode: ${sampleLimit} entries`);
await fs.promises.mkdir(path.dirname(PATHS.output), { recursive: true }); await fs.promises.mkdir(OUTPUT_DIR, { recursive: true });
const fileStream = fs.createReadStream(PATHS.source); const fileStream = fs.createReadStream(sourcePath);
const rl = readline.createInterface({ const rl = readline.createInterface({
input: fileStream, input: fileStream,
crlfDelay: Infinity, crlfDelay: Infinity,
}); });
const senses: ExtractedSense[] = []; const senses: ExtractedSense[] = [];
const senseIndexMap = new Map<string, number>();
let linesRead = 0; let linesRead = 0;
let entriesProcessed = 0; let entriesProcessed = 0;
let entriesSkipped = 0; let entriesSkipped = 0;
@ -167,14 +200,20 @@ async function extract(sampleLimit?: number): Promise<void> {
continue; continue;
} }
const extracted = processEntry(entry); const extracted = processEntry(entry, lang);
if (extracted.length === 0) { if (extracted.length === 0) {
entriesSkipped++; entriesSkipped++;
continue; continue;
} }
senses.push(...extracted); for (const sense of extracted) {
const key = `${sense.headword}|${sense.pos}`;
const senseIndex = senseIndexMap.get(key) ?? 0;
senseIndexMap.set(key, senseIndex + 1);
senses.push({ ...sense, sense_index: senseIndex });
}
entriesProcessed++; entriesProcessed++;
if (entriesProcessed % 10_000 === 0) { if (entriesProcessed % 10_000 === 0) {
@ -185,25 +224,34 @@ async function extract(sampleLimit?: number): Promise<void> {
} }
await fs.promises.writeFile( await fs.promises.writeFile(
PATHS.output, outputPath,
JSON.stringify(senses, null, 2), JSON.stringify(senses, null, 2),
"utf-8", "utf-8",
); );
console.log(`\nExtraction complete:`); console.log(` Lines read: ${linesRead.toLocaleString()}`);
console.log(` Lines read: ${linesRead.toLocaleString()}`); console.log(` Entries processed: ${entriesProcessed.toLocaleString()}`);
console.log(` Entries processed: ${entriesProcessed.toLocaleString()}`); console.log(` Entries skipped: ${entriesSkipped.toLocaleString()}`);
console.log(` Entries skipped: ${entriesSkipped.toLocaleString()}`); console.log(` Senses extracted: ${senses.length.toLocaleString()}`);
console.log(` Senses extracted: ${senses.length.toLocaleString()}`); console.log(` Output: ${outputPath}`);
console.log(` Output: ${PATHS.output}`);
} }
main().catch((err) => { // ── Main ─────────────────────────────────────────────────────────────────────
console.error(err);
process.exit(1);
});
async function main(): Promise<void> { async function main(): Promise<void> {
// Hardcoded sample limit for initial testing — remove for full extraction // Hardcoded sample limit for development — remove for full extraction
await extract(500); const SAMPLE = 500;
for (const lang of SUPPORTED_LANGUAGE_CODES) {
await extract(lang, SAMPLE);
}
console.log("\nExtraction complete.");
}
if (import.meta.url === `file://${process.argv[1]}`) {
main().catch((err) => {
console.error(err);
process.exit(1);
});
} }