diff --git a/data-pipeline/db/import.ts b/data-pipeline/db/import.ts index 548c4da..3733e81 100644 --- a/data-pipeline/db/import.ts +++ b/data-pipeline/db/import.ts @@ -1,6 +1,7 @@ import fs from "node:fs/promises"; import path from "node:path"; import { fileURLToPath } from "node:url"; +import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared"; import { openDb } from "./index.js"; import type { ExtractedSense } from "../stage-1-extract/scripts/extract.js"; @@ -8,18 +9,11 @@ import type { ExtractedSense } from "../stage-1-extract/scripts/extract.js"; const __dirname = path.dirname(fileURLToPath(import.meta.url)); -const PATHS = { - extracted: path.resolve(__dirname, "../stage-1-extract/output/en.json"), -}; +const OUTPUT_DIR = path.resolve(__dirname, "../stage-1-extract/output"); // ── Import ──────────────────────────────────────────────────────────────────── export async function importKaikki(): Promise { - console.log("Loading extracted Kaikki data..."); - const raw = await fs.readFile(PATHS.extracted, "utf-8"); - const senses = JSON.parse(raw) as ExtractedSense[]; - console.log(` Loaded ${senses.length.toLocaleString()} senses`); - const db = openDb(); const insertEntry = db.prepare(` @@ -38,64 +32,86 @@ export async function importKaikki(): Promise { ON CONFLICT (entry_id, target_lang, word) DO NOTHING `); - // Track next available sense_index per (headword, pos) to handle - // the same word appearing in multiple JSONL entries with the same POS. - const senseIndexMap = new Map(); + let totalEntries = 0; + let totalTranslations = 0; + let totalSkipped = 0; - console.log("\nImporting into pipeline.db..."); + for (const lang of SUPPORTED_LANGUAGE_CODES) { + const filePath = path.join(OUTPUT_DIR, `${lang}.json`); - const importAll = db.transaction(() => { - let entries = 0; - let translations = 0; - let skipped = 0; - - for (const sense of senses) { - const key = `${sense.headword}|${sense.pos}`; - const nextIndex = senseIndexMap.get(key) ?? 0; - - // Use the offset sense_index to avoid collisions when the same word - // appears in multiple JSONL entries with the same POS. - const senseIndex = nextIndex; - senseIndexMap.set(key, nextIndex + 1); - - const row = insertEntry.get( - sense.headword, - "en", - sense.pos, - senseIndex, - sense.gloss ?? null, - JSON.stringify(sense.examples), - ) as { id: number } | undefined; - - if (!row) { - skipped++; - continue; - } - - entries++; - - for (const t of sense.translations) { - insertTranslation.run( - row.id, - t.target_lang, - t.word, - t.sense_hint ?? null, - ); - translations++; - } + let senses: ExtractedSense[]; + try { + const raw = await fs.readFile(filePath, "utf-8"); + senses = JSON.parse(raw) as ExtractedSense[]; + } catch { + console.warn(` Warning: no output file found for ${lang}, skipping`); + continue; } - return { entries, translations, skipped }; - }); + console.log( + ` Importing ${lang}: ${senses.length.toLocaleString()} senses...`, + ); - const counts = importAll(); + // Track next available sense_index per (headword, pos) to handle + // the same word appearing in multiple JSONL entries with the same POS. + const senseIndexMap = new Map(); - console.log(` entries: ${counts.entries.toLocaleString()}`); - console.log(` translations: ${counts.translations.toLocaleString()}`); - console.log(` skipped: ${counts.skipped.toLocaleString()}`); + const importLang = db.transaction(() => { + let entries = 0; + let translations = 0; + let skipped = 0; + + for (const sense of senses) { + const key = `${sense.headword}|${sense.pos}`; + const nextIndex = senseIndexMap.get(key) ?? 0; + senseIndexMap.set(key, nextIndex + 1); + + const row = insertEntry.get( + sense.headword, + sense.language, + sense.pos, + nextIndex, + sense.gloss ?? null, + JSON.stringify(sense.examples), + ) as { id: number } | undefined; + + if (!row) { + skipped++; + continue; + } + + entries++; + + for (const t of sense.translations) { + insertTranslation.run( + row.id, + t.target_lang, + t.word, + t.sense_hint ?? null, + ); + translations++; + } + } + + return { entries, translations, skipped }; + }); + + const counts = importLang(); + totalEntries += counts.entries; + totalTranslations += counts.translations; + totalSkipped += counts.skipped; + + console.log( + ` entries: ${counts.entries.toLocaleString()}, translations: ${counts.translations.toLocaleString()}, skipped: ${counts.skipped.toLocaleString()}`, + ); + } db.close(); - console.log("\nImport complete."); + + console.log(`\nImport complete:`); + console.log(` Total entries: ${totalEntries.toLocaleString()}`); + console.log(` Total translations: ${totalTranslations.toLocaleString()}`); + console.log(` Total skipped: ${totalSkipped.toLocaleString()}`); } // ── Check if already imported ───────────────────────────────────────────────── @@ -126,6 +142,7 @@ async function main(): Promise { process.exit(0); } + console.log("Importing Kaikki data into pipeline.db..."); await importKaikki(); } diff --git a/data-pipeline/package.json b/data-pipeline/package.json index 1510876..b25f26d 100644 --- a/data-pipeline/package.json +++ b/data-pipeline/package.json @@ -4,6 +4,7 @@ "private": true, "type": "module", "scripts": { + "extract": "tsx stage-1-extract/scripts/extract.ts", "db:import": "tsx db/import.ts", "db:init": "tsx db/init.ts", "annotate": "tsx stage-2-annotate/scripts/annotate.ts", diff --git a/data-pipeline/stage-1-extract/scripts/extract.ts b/data-pipeline/stage-1-extract/scripts/extract.ts index da82b71..22defc2 100644 --- a/data-pipeline/stage-1-extract/scripts/extract.ts +++ b/data-pipeline/stage-1-extract/scripts/extract.ts @@ -20,10 +20,16 @@ type KaikkiSense = { translations?: KaikkiTranslation[]; }; -type KaikkiEntry = { word?: string; pos?: string; senses?: KaikkiSense[] }; +type KaikkiEntry = { + word?: string; + pos?: string; + lang_code?: string; + senses?: KaikkiSense[]; +}; export type ExtractedSense = { headword: string; + language: SupportedLanguageCode; pos: SupportedPos; sense_index: number; gloss: string | null; @@ -39,12 +45,15 @@ export type ExtractedSense = { const __dirname = path.dirname(fileURLToPath(import.meta.url)); -const PATHS = { - source: path.resolve( - __dirname, - "../sources/kaikki.org-dictionary-English.jsonl", - ), - output: path.resolve(__dirname, "../output/en.json"), +const SOURCES_DIR = path.resolve(__dirname, "../sources"); +const OUTPUT_DIR = path.resolve(__dirname, "../output"); + +const LANG_TO_FILE: Record = { + en: "kaikki.org-dictionary-English.jsonl", + de: "kaikki.org-dictionary-German.jsonl", + it: "kaikki.org-dictionary-Italian.jsonl", + fr: "kaikki.org-dictionary-French.jsonl", + es: "kaikki.org-dictionary-Spanish.jsonl", }; const POS_MAP: Record = { @@ -68,13 +77,15 @@ function isAbbreviation(gloss: string): boolean { function extractTranslations( sense: KaikkiSense, + sourceLang: SupportedLanguageCode, ): ExtractedSense["translations"] { const seen = new Set(); const result: ExtractedSense["translations"] = []; for (const t of sense.translations ?? []) { const code = t.code ?? t.lang_code; - if (!code || !SUPPORTED_LANG_SET.has(code) || code === "en") continue; + if (!code || !SUPPORTED_LANG_SET.has(code)) continue; + if (code === sourceLang) continue; // skip same-language translations if (!t.word?.trim()) continue; const key = `${code}:${t.word.trim()}`; @@ -97,58 +108,80 @@ function extractExamples(sense: KaikkiSense): string[] { .filter((t): t is string => !!t); } -function processEntry(entry: KaikkiEntry): ExtractedSense[] { +function processEntry( + entry: KaikkiEntry, + sourceLang: SupportedLanguageCode, +): Omit[] { const pos = mapPos(entry.pos ?? ""); if (!pos) return []; if (!entry.word?.trim()) return []; + // For non-English files, only process entries in the target language + const entryLang = (entry as Record)["lang_code"] as + | string + | undefined; + if (sourceLang !== "en" && entryLang !== sourceLang) return []; + const headword = entry.word.trim(); - const results: ExtractedSense[] = []; - let senseIndex = 0; + const results: Omit[] = []; for (const sense of entry.senses ?? []) { const gloss = sense.glosses?.[0]?.trim() ?? null; - // Skip abbreviation senses if (gloss && isAbbreviation(gloss)) continue; - const translations = extractTranslations(sense); - - // Skip senses with no translations in our supported languages - if (translations.length === 0) continue; - - results.push({ - headword, - pos, - sense_index: senseIndex++, - gloss, - examples: extractExamples(sense), - translations, - }); + if (sourceLang === "en") { + // English: require translations in supported languages + const translations = extractTranslations(sense, sourceLang); + if (translations.length === 0) continue; + results.push({ + headword, + language: sourceLang, + pos, + gloss, + examples: extractExamples(sense), + translations, + }); + } else { + // Non-English: just extract the entry, no translations needed + results.push({ + headword, + language: sourceLang, + pos, + gloss, + examples: extractExamples(sense), + translations: [], + }); + } } return results; } -// ── Main ────────────────────────────────────────────────────────────────────── +// ── Extract ─────────────────────────────────────────────────────────────────── -async function extract(sampleLimit?: number): Promise { - console.log("Extracting Kaikki English data..."); - console.log(` Source: ${PATHS.source}`); +export async function extract( + lang: SupportedLanguageCode, + sampleLimit?: number, +): Promise { + const filename = LANG_TO_FILE[lang]; + const sourcePath = path.join(SOURCES_DIR, filename); + const outputPath = path.join(OUTPUT_DIR, `${lang}.json`); - if (sampleLimit) { - console.log(` Sample mode: ${sampleLimit} entries`); - } + console.log(`\nExtracting ${lang}...`); + console.log(` Source: ${sourcePath}`); + if (sampleLimit) console.log(` Sample mode: ${sampleLimit} entries`); - await fs.promises.mkdir(path.dirname(PATHS.output), { recursive: true }); + await fs.promises.mkdir(OUTPUT_DIR, { recursive: true }); - const fileStream = fs.createReadStream(PATHS.source); + const fileStream = fs.createReadStream(sourcePath); const rl = readline.createInterface({ input: fileStream, crlfDelay: Infinity, }); const senses: ExtractedSense[] = []; + const senseIndexMap = new Map(); let linesRead = 0; let entriesProcessed = 0; let entriesSkipped = 0; @@ -167,14 +200,20 @@ async function extract(sampleLimit?: number): Promise { continue; } - const extracted = processEntry(entry); + const extracted = processEntry(entry, lang); if (extracted.length === 0) { entriesSkipped++; continue; } - senses.push(...extracted); + for (const sense of extracted) { + const key = `${sense.headword}|${sense.pos}`; + const senseIndex = senseIndexMap.get(key) ?? 0; + senseIndexMap.set(key, senseIndex + 1); + senses.push({ ...sense, sense_index: senseIndex }); + } + entriesProcessed++; if (entriesProcessed % 10_000 === 0) { @@ -185,25 +224,34 @@ async function extract(sampleLimit?: number): Promise { } await fs.promises.writeFile( - PATHS.output, + outputPath, JSON.stringify(senses, null, 2), "utf-8", ); - console.log(`\nExtraction complete:`); - console.log(` Lines read: ${linesRead.toLocaleString()}`); - console.log(` Entries processed: ${entriesProcessed.toLocaleString()}`); - console.log(` Entries skipped: ${entriesSkipped.toLocaleString()}`); - console.log(` Senses extracted: ${senses.length.toLocaleString()}`); - console.log(` Output: ${PATHS.output}`); + console.log(` Lines read: ${linesRead.toLocaleString()}`); + console.log(` Entries processed: ${entriesProcessed.toLocaleString()}`); + console.log(` Entries skipped: ${entriesSkipped.toLocaleString()}`); + console.log(` Senses extracted: ${senses.length.toLocaleString()}`); + console.log(` Output: ${outputPath}`); } -main().catch((err) => { - console.error(err); - process.exit(1); -}); +// ── Main ───────────────────────────────────────────────────────────────────── async function main(): Promise { - // Hardcoded sample limit for initial testing — remove for full extraction - await extract(500); + // Hardcoded sample limit for development — remove for full extraction + const SAMPLE = 500; + + for (const lang of SUPPORTED_LANGUAGE_CODES) { + await extract(lang, SAMPLE); + } + + console.log("\nExtraction complete."); +} + +if (import.meta.url === `file://${process.argv[1]}`) { + main().catch((err) => { + console.error(err); + process.exit(1); + }); }