feat: update extractor for all 5 languages, update import for multi-language

- Extract.ts now processes all 5 language files; for non-English files it
  keeps only entries whose lang_code matches the file's language and skips
  translation extraction (no translations in the non-English source files)
- Import.ts now imports all 5 language output files, uses language
  field from ExtractedSense instead of hardcoding en
- Sample limit hardcoded to 500 entries per language for development
This commit is contained in:
lila 2026-05-05 18:46:32 +02:00
parent 209d52f54b
commit 0cc643e308
3 changed files with 173 additions and 107 deletions

View file

@ -1,6 +1,7 @@
import fs from "node:fs/promises";
import path from "node:path";
import { fileURLToPath } from "node:url";
import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
import { openDb } from "./index.js";
import type { ExtractedSense } from "../stage-1-extract/scripts/extract.js";
@ -8,18 +9,11 @@ import type { ExtractedSense } from "../stage-1-extract/scripts/extract.js";
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const PATHS = {
extracted: path.resolve(__dirname, "../stage-1-extract/output/en.json"),
};
const OUTPUT_DIR = path.resolve(__dirname, "../stage-1-extract/output");
// ── Import ────────────────────────────────────────────────────────────────────
export async function importKaikki(): Promise<void> {
console.log("Loading extracted Kaikki data...");
const raw = await fs.readFile(PATHS.extracted, "utf-8");
const senses = JSON.parse(raw) as ExtractedSense[];
console.log(` Loaded ${senses.length.toLocaleString()} senses`);
const db = openDb();
const insertEntry = db.prepare(`
@ -38,13 +32,31 @@ export async function importKaikki(): Promise<void> {
ON CONFLICT (entry_id, target_lang, word) DO NOTHING
`);
let totalEntries = 0;
let totalTranslations = 0;
let totalSkipped = 0;
for (const lang of SUPPORTED_LANGUAGE_CODES) {
const filePath = path.join(OUTPUT_DIR, `${lang}.json`);
let senses: ExtractedSense[];
try {
const raw = await fs.readFile(filePath, "utf-8");
senses = JSON.parse(raw) as ExtractedSense[];
} catch {
console.warn(` Warning: no output file found for ${lang}, skipping`);
continue;
}
console.log(
` Importing ${lang}: ${senses.length.toLocaleString()} senses...`,
);
// Track next available sense_index per (headword, pos) to handle
// the same word appearing in multiple JSONL entries with the same POS.
const senseIndexMap = new Map<string, number>();
console.log("\nImporting into pipeline.db...");
const importAll = db.transaction(() => {
const importLang = db.transaction(() => {
let entries = 0;
let translations = 0;
let skipped = 0;
@ -52,17 +64,13 @@ export async function importKaikki(): Promise<void> {
for (const sense of senses) {
const key = `${sense.headword}|${sense.pos}`;
const nextIndex = senseIndexMap.get(key) ?? 0;
// Use the offset sense_index to avoid collisions when the same word
// appears in multiple JSONL entries with the same POS.
const senseIndex = nextIndex;
senseIndexMap.set(key, nextIndex + 1);
const row = insertEntry.get(
sense.headword,
"en",
sense.language,
sense.pos,
senseIndex,
nextIndex,
sense.gloss ?? null,
JSON.stringify(sense.examples),
) as { id: number } | undefined;
@ -88,14 +96,22 @@ export async function importKaikki(): Promise<void> {
return { entries, translations, skipped };
});
const counts = importAll();
const counts = importLang();
totalEntries += counts.entries;
totalTranslations += counts.translations;
totalSkipped += counts.skipped;
console.log(` entries: ${counts.entries.toLocaleString()}`);
console.log(` translations: ${counts.translations.toLocaleString()}`);
console.log(` skipped: ${counts.skipped.toLocaleString()}`);
console.log(
` entries: ${counts.entries.toLocaleString()}, translations: ${counts.translations.toLocaleString()}, skipped: ${counts.skipped.toLocaleString()}`,
);
}
db.close();
console.log("\nImport complete.");
console.log(`\nImport complete:`);
console.log(` Total entries: ${totalEntries.toLocaleString()}`);
console.log(` Total translations: ${totalTranslations.toLocaleString()}`);
console.log(` Total skipped: ${totalSkipped.toLocaleString()}`);
}
// ── Check if already imported ─────────────────────────────────────────────────
@ -126,6 +142,7 @@ async function main(): Promise<void> {
process.exit(0);
}
console.log("Importing Kaikki data into pipeline.db...");
await importKaikki();
}

View file

@ -4,6 +4,7 @@
"private": true,
"type": "module",
"scripts": {
"extract": "tsx stage-1-extract/scripts/extract.ts",
"db:import": "tsx db/import.ts",
"db:init": "tsx db/init.ts",
"annotate": "tsx stage-2-annotate/scripts/annotate.ts",

View file

@ -20,10 +20,16 @@ type KaikkiSense = {
translations?: KaikkiTranslation[];
};
type KaikkiEntry = { word?: string; pos?: string; senses?: KaikkiSense[] };
/**
 * Shape of one parsed Kaikki JSONL entry, limited to the fields declared
 * here. Every field is optional, so readers must handle missing values.
 */
type KaikkiEntry = {
// Headword text.
word?: string;
// Raw part-of-speech string from the source entry.
pos?: string;
// Language code of the entry; compared against the file's language during extraction.
lang_code?: string;
// Senses of the entry; absent is treated like an empty list by the visible reader code.
senses?: KaikkiSense[];
};
export type ExtractedSense = {
headword: string;
language: SupportedLanguageCode;
pos: SupportedPos;
sense_index: number;
gloss: string | null;
@ -39,12 +45,15 @@ export type ExtractedSense = {
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const PATHS = {
source: path.resolve(
__dirname,
"../sources/kaikki.org-dictionary-English.jsonl",
),
output: path.resolve(__dirname, "../output/en.json"),
// Directory holding the Kaikki source files, resolved relative to this script.
const SOURCES_DIR = path.resolve(__dirname, "../sources");
// Directory the per-language output files are written to, resolved relative to this script.
const OUTPUT_DIR = path.resolve(__dirname, "../output");
// Source file name for each supported language code. The Record type requires
// an entry for every member of SupportedLanguageCode.
const LANG_TO_FILE: Record<SupportedLanguageCode, string> = {
en: "kaikki.org-dictionary-English.jsonl",
de: "kaikki.org-dictionary-German.jsonl",
it: "kaikki.org-dictionary-Italian.jsonl",
fr: "kaikki.org-dictionary-French.jsonl",
es: "kaikki.org-dictionary-Spanish.jsonl",
};
const POS_MAP: Record<string, SupportedPos> = {
@ -68,13 +77,15 @@ function isAbbreviation(gloss: string): boolean {
function extractTranslations(
sense: KaikkiSense,
sourceLang: SupportedLanguageCode,
): ExtractedSense["translations"] {
const seen = new Set<string>();
const result: ExtractedSense["translations"] = [];
for (const t of sense.translations ?? []) {
const code = t.code ?? t.lang_code;
if (!code || !SUPPORTED_LANG_SET.has(code) || code === "en") continue;
if (!code || !SUPPORTED_LANG_SET.has(code)) continue;
if (code === sourceLang) continue; // skip same-language translations
if (!t.word?.trim()) continue;
const key = `${code}:${t.word.trim()}`;
@ -97,58 +108,80 @@ function extractExamples(sense: KaikkiSense): string[] {
.filter((t): t is string => !!t);
}
/**
 * Converts one Kaikki entry into zero or more extracted senses.
 *
 * NOTE(review): this span is a rendered diff hunk, so removed and added lines
 * appear together without +/- markers (two signatures, two `results`
 * declarations, two `extractTranslations` calls). The description below is
 * for the variant that takes `sourceLang`.
 *
 * Returns an empty array when `mapPos` yields no part of speech, when the
 * headword is missing or blank, or when `sourceLang` is not "en" and the
 * entry's `lang_code` differs from `sourceLang`.
 *
 * @param entry - Parsed Kaikki entry.
 * @param sourceLang - Language code of the file being processed; written to
 *   each result's `language` field.
 * @returns Senses without `sense_index` (the return type omits that field).
 */
function processEntry(entry: KaikkiEntry): ExtractedSense[] {
function processEntry(
entry: KaikkiEntry,
sourceLang: SupportedLanguageCode,
): Omit<ExtractedSense, "sense_index">[] {
// Entries whose part of speech does not map are dropped.
const pos = mapPos(entry.pos ?? "");
if (!pos) return [];
// Entries with a missing or whitespace-only headword are dropped.
if (!entry.word?.trim()) return [];
// For non-English files, only process entries in the target language
// NOTE(review): KaikkiEntry declares `lang_code?: string` in this same diff,
// so `entry.lang_code` would presumably work without the cast — confirm.
const entryLang = (entry as Record<string, unknown>)["lang_code"] as
| string
| undefined;
if (sourceLang !== "en" && entryLang !== sourceLang) return [];
const headword = entry.word.trim();
// NOTE(review): the next two lines look like the pre-change version
// (typed with sense_index, local counter) — confirm against the diff.
const results: ExtractedSense[] = [];
let senseIndex = 0;
const results: Omit<ExtractedSense, "sense_index">[] = [];
for (const sense of entry.senses ?? []) {
// Only the first gloss is used; null when the sense has none.
const gloss = sense.glosses?.[0]?.trim() ?? null;
// Skip abbreviation senses
if (gloss && isAbbreviation(gloss)) continue;
// NOTE(review): the one-argument call below looks like the pre-change line.
const translations = extractTranslations(sense);
// Skip senses with no translations in our supported languages
if (sourceLang === "en") {
// English: require translations in supported languages
const translations = extractTranslations(sense, sourceLang);
if (translations.length === 0) continue;
results.push({
headword,
language: sourceLang,
pos,
// NOTE(review): presumably pre-change residue — the new return type omits sense_index.
sense_index: senseIndex++,
gloss,
examples: extractExamples(sense),
translations,
});
} else {
// Non-English: just extract the entry, no translations needed
results.push({
headword,
language: sourceLang,
pos,
gloss,
examples: extractExamples(sense),
translations: [],
});
}
}
return results;
}
// ── Main ──────────────────────────────────────────────────────────────────────
// ── Extract ───────────────────────────────────────────────────────────────────
async function extract(sampleLimit?: number): Promise<void> {
console.log("Extracting Kaikki English data...");
console.log(` Source: ${PATHS.source}`);
export async function extract(
lang: SupportedLanguageCode,
sampleLimit?: number,
): Promise<void> {
const filename = LANG_TO_FILE[lang];
const sourcePath = path.join(SOURCES_DIR, filename);
const outputPath = path.join(OUTPUT_DIR, `${lang}.json`);
if (sampleLimit) {
console.log(` Sample mode: ${sampleLimit} entries`);
}
console.log(`\nExtracting ${lang}...`);
console.log(` Source: ${sourcePath}`);
if (sampleLimit) console.log(` Sample mode: ${sampleLimit} entries`);
await fs.promises.mkdir(path.dirname(PATHS.output), { recursive: true });
await fs.promises.mkdir(OUTPUT_DIR, { recursive: true });
const fileStream = fs.createReadStream(PATHS.source);
const fileStream = fs.createReadStream(sourcePath);
const rl = readline.createInterface({
input: fileStream,
crlfDelay: Infinity,
});
const senses: ExtractedSense[] = [];
const senseIndexMap = new Map<string, number>();
let linesRead = 0;
let entriesProcessed = 0;
let entriesSkipped = 0;
@ -167,14 +200,20 @@ async function extract(sampleLimit?: number): Promise<void> {
continue;
}
const extracted = processEntry(entry);
const extracted = processEntry(entry, lang);
if (extracted.length === 0) {
entriesSkipped++;
continue;
}
senses.push(...extracted);
for (const sense of extracted) {
const key = `${sense.headword}|${sense.pos}`;
const senseIndex = senseIndexMap.get(key) ?? 0;
senseIndexMap.set(key, senseIndex + 1);
senses.push({ ...sense, sense_index: senseIndex });
}
entriesProcessed++;
if (entriesProcessed % 10_000 === 0) {
@ -185,25 +224,34 @@ async function extract(sampleLimit?: number): Promise<void> {
}
await fs.promises.writeFile(
PATHS.output,
outputPath,
JSON.stringify(senses, null, 2),
"utf-8",
);
console.log(`\nExtraction complete:`);
console.log(` Lines read: ${linesRead.toLocaleString()}`);
console.log(` Entries processed: ${entriesProcessed.toLocaleString()}`);
console.log(` Entries skipped: ${entriesSkipped.toLocaleString()}`);
console.log(` Senses extracted: ${senses.length.toLocaleString()}`);
console.log(` Output: ${PATHS.output}`);
console.log(` Output: ${outputPath}`);
}
main().catch((err) => {
console.error(err);
process.exit(1);
});
// ── Main ─────────────────────────────────────────────────────────────────────
/**
 * Script entry point: runs `extract` once for every code in
 * SUPPORTED_LANGUAGE_CODES with a fixed sample limit, then logs completion.
 * Each call is awaited inside the loop, so languages are processed one after
 * another rather than concurrently.
 *
 * NOTE(review): this span is a rendered diff hunk; `await extract(500);` and
 * the comment above it appear to be the pre-change lines — confirm.
 */
async function main(): Promise<void> {
// Hardcoded sample limit for initial testing — remove for full extraction
await extract(500);
// Hardcoded sample limit for development — remove for full extraction
const SAMPLE = 500;
for (const lang of SUPPORTED_LANGUAGE_CODES) {
await extract(lang, SAMPLE);
}
console.log("\nExtraction complete.");
}
// Run main() only when this module's URL equals the file:// URL built from
// process.argv[1]; importing the module (e.g. for `extract`) skips this branch.
// NOTE(review): the hand-built `file://` string presumably will not match for
// paths that need URL encoding or for Windows paths — consider comparing
// against pathToFileURL(process.argv[1]).href from node:url; confirm.
if (import.meta.url === `file://${process.argv[1]}`) {
// On rejection, print the error and exit with status 1.
main().catch((err) => {
console.error(err);
process.exit(1);
});
}