feat: update extractor for all 5 languages, update import for multi-language
- extract.ts now processes all 5 language source files. For non-English files it keeps only entries whose lang_code matches the file's language, and it skips translation extraction (the non-English source files contain no translations).
- import.ts now imports all 5 language output files and uses the language field from ExtractedSense instead of hardcoding "en".
- The sample limit is hardcoded to 500 entries per language for development.
This commit is contained in:
parent
209d52f54b
commit
0cc643e308
3 changed files with 173 additions and 107 deletions
|
|
@ -1,6 +1,7 @@
|
|||
import fs from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import { fileURLToPath } from "node:url";
|
||||
import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
|
||||
import { openDb } from "./index.js";
|
||||
import type { ExtractedSense } from "../stage-1-extract/scripts/extract.js";
|
||||
|
||||
|
|
@ -8,18 +9,11 @@ import type { ExtractedSense } from "../stage-1-extract/scripts/extract.js";
|
|||
|
||||
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
||||
|
||||
const PATHS = {
|
||||
extracted: path.resolve(__dirname, "../stage-1-extract/output/en.json"),
|
||||
};
|
||||
const OUTPUT_DIR = path.resolve(__dirname, "../stage-1-extract/output");
|
||||
|
||||
// ── Import ────────────────────────────────────────────────────────────────────
|
||||
|
||||
export async function importKaikki(): Promise<void> {
|
||||
console.log("Loading extracted Kaikki data...");
|
||||
const raw = await fs.readFile(PATHS.extracted, "utf-8");
|
||||
const senses = JSON.parse(raw) as ExtractedSense[];
|
||||
console.log(` Loaded ${senses.length.toLocaleString()} senses`);
|
||||
|
||||
const db = openDb();
|
||||
|
||||
const insertEntry = db.prepare(`
|
||||
|
|
@ -38,64 +32,86 @@ export async function importKaikki(): Promise<void> {
|
|||
ON CONFLICT (entry_id, target_lang, word) DO NOTHING
|
||||
`);
|
||||
|
||||
// Track next available sense_index per (headword, pos) to handle
|
||||
// the same word appearing in multiple JSONL entries with the same POS.
|
||||
const senseIndexMap = new Map<string, number>();
|
||||
let totalEntries = 0;
|
||||
let totalTranslations = 0;
|
||||
let totalSkipped = 0;
|
||||
|
||||
console.log("\nImporting into pipeline.db...");
|
||||
for (const lang of SUPPORTED_LANGUAGE_CODES) {
|
||||
const filePath = path.join(OUTPUT_DIR, `${lang}.json`);
|
||||
|
||||
const importAll = db.transaction(() => {
|
||||
let entries = 0;
|
||||
let translations = 0;
|
||||
let skipped = 0;
|
||||
|
||||
for (const sense of senses) {
|
||||
const key = `${sense.headword}|${sense.pos}`;
|
||||
const nextIndex = senseIndexMap.get(key) ?? 0;
|
||||
|
||||
// Use the offset sense_index to avoid collisions when the same word
|
||||
// appears in multiple JSONL entries with the same POS.
|
||||
const senseIndex = nextIndex;
|
||||
senseIndexMap.set(key, nextIndex + 1);
|
||||
|
||||
const row = insertEntry.get(
|
||||
sense.headword,
|
||||
"en",
|
||||
sense.pos,
|
||||
senseIndex,
|
||||
sense.gloss ?? null,
|
||||
JSON.stringify(sense.examples),
|
||||
) as { id: number } | undefined;
|
||||
|
||||
if (!row) {
|
||||
skipped++;
|
||||
continue;
|
||||
}
|
||||
|
||||
entries++;
|
||||
|
||||
for (const t of sense.translations) {
|
||||
insertTranslation.run(
|
||||
row.id,
|
||||
t.target_lang,
|
||||
t.word,
|
||||
t.sense_hint ?? null,
|
||||
);
|
||||
translations++;
|
||||
}
|
||||
let senses: ExtractedSense[];
|
||||
try {
|
||||
const raw = await fs.readFile(filePath, "utf-8");
|
||||
senses = JSON.parse(raw) as ExtractedSense[];
|
||||
} catch {
|
||||
console.warn(` Warning: no output file found for ${lang}, skipping`);
|
||||
continue;
|
||||
}
|
||||
|
||||
return { entries, translations, skipped };
|
||||
});
|
||||
console.log(
|
||||
` Importing ${lang}: ${senses.length.toLocaleString()} senses...`,
|
||||
);
|
||||
|
||||
const counts = importAll();
|
||||
// Track next available sense_index per (headword, pos) to handle
|
||||
// the same word appearing in multiple JSONL entries with the same POS.
|
||||
const senseIndexMap = new Map<string, number>();
|
||||
|
||||
console.log(` entries: ${counts.entries.toLocaleString()}`);
|
||||
console.log(` translations: ${counts.translations.toLocaleString()}`);
|
||||
console.log(` skipped: ${counts.skipped.toLocaleString()}`);
|
||||
const importLang = db.transaction(() => {
|
||||
let entries = 0;
|
||||
let translations = 0;
|
||||
let skipped = 0;
|
||||
|
||||
for (const sense of senses) {
|
||||
const key = `${sense.headword}|${sense.pos}`;
|
||||
const nextIndex = senseIndexMap.get(key) ?? 0;
|
||||
senseIndexMap.set(key, nextIndex + 1);
|
||||
|
||||
const row = insertEntry.get(
|
||||
sense.headword,
|
||||
sense.language,
|
||||
sense.pos,
|
||||
nextIndex,
|
||||
sense.gloss ?? null,
|
||||
JSON.stringify(sense.examples),
|
||||
) as { id: number } | undefined;
|
||||
|
||||
if (!row) {
|
||||
skipped++;
|
||||
continue;
|
||||
}
|
||||
|
||||
entries++;
|
||||
|
||||
for (const t of sense.translations) {
|
||||
insertTranslation.run(
|
||||
row.id,
|
||||
t.target_lang,
|
||||
t.word,
|
||||
t.sense_hint ?? null,
|
||||
);
|
||||
translations++;
|
||||
}
|
||||
}
|
||||
|
||||
return { entries, translations, skipped };
|
||||
});
|
||||
|
||||
const counts = importLang();
|
||||
totalEntries += counts.entries;
|
||||
totalTranslations += counts.translations;
|
||||
totalSkipped += counts.skipped;
|
||||
|
||||
console.log(
|
||||
` entries: ${counts.entries.toLocaleString()}, translations: ${counts.translations.toLocaleString()}, skipped: ${counts.skipped.toLocaleString()}`,
|
||||
);
|
||||
}
|
||||
|
||||
db.close();
|
||||
console.log("\nImport complete.");
|
||||
|
||||
console.log(`\nImport complete:`);
|
||||
console.log(` Total entries: ${totalEntries.toLocaleString()}`);
|
||||
console.log(` Total translations: ${totalTranslations.toLocaleString()}`);
|
||||
console.log(` Total skipped: ${totalSkipped.toLocaleString()}`);
|
||||
}
|
||||
|
||||
// ── Check if already imported ─────────────────────────────────────────────────
|
||||
|
|
@ -126,6 +142,7 @@ async function main(): Promise<void> {
|
|||
process.exit(0);
|
||||
}
|
||||
|
||||
console.log("Importing Kaikki data into pipeline.db...");
|
||||
await importKaikki();
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@
|
|||
"private": true,
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
"extract": "tsx stage-1-extract/scripts/extract.ts",
|
||||
"db:import": "tsx db/import.ts",
|
||||
"db:init": "tsx db/init.ts",
|
||||
"annotate": "tsx stage-2-annotate/scripts/annotate.ts",
|
||||
|
|
|
|||
|
|
@ -20,10 +20,16 @@ type KaikkiSense = {
|
|||
translations?: KaikkiTranslation[];
|
||||
};
|
||||
|
||||
type KaikkiEntry = { word?: string; pos?: string; senses?: KaikkiSense[] };
|
||||
type KaikkiEntry = {
|
||||
word?: string;
|
||||
pos?: string;
|
||||
lang_code?: string;
|
||||
senses?: KaikkiSense[];
|
||||
};
|
||||
|
||||
export type ExtractedSense = {
|
||||
headword: string;
|
||||
language: SupportedLanguageCode;
|
||||
pos: SupportedPos;
|
||||
sense_index: number;
|
||||
gloss: string | null;
|
||||
|
|
@ -39,12 +45,15 @@ export type ExtractedSense = {
|
|||
|
||||
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
||||
|
||||
const PATHS = {
|
||||
source: path.resolve(
|
||||
__dirname,
|
||||
"../sources/kaikki.org-dictionary-English.jsonl",
|
||||
),
|
||||
output: path.resolve(__dirname, "../output/en.json"),
|
||||
const SOURCES_DIR = path.resolve(__dirname, "../sources");
|
||||
const OUTPUT_DIR = path.resolve(__dirname, "../output");
|
||||
|
||||
const LANG_TO_FILE: Record<SupportedLanguageCode, string> = {
|
||||
en: "kaikki.org-dictionary-English.jsonl",
|
||||
de: "kaikki.org-dictionary-German.jsonl",
|
||||
it: "kaikki.org-dictionary-Italian.jsonl",
|
||||
fr: "kaikki.org-dictionary-French.jsonl",
|
||||
es: "kaikki.org-dictionary-Spanish.jsonl",
|
||||
};
|
||||
|
||||
const POS_MAP: Record<string, SupportedPos> = {
|
||||
|
|
@ -68,13 +77,15 @@ function isAbbreviation(gloss: string): boolean {
|
|||
|
||||
function extractTranslations(
|
||||
sense: KaikkiSense,
|
||||
sourceLang: SupportedLanguageCode,
|
||||
): ExtractedSense["translations"] {
|
||||
const seen = new Set<string>();
|
||||
const result: ExtractedSense["translations"] = [];
|
||||
|
||||
for (const t of sense.translations ?? []) {
|
||||
const code = t.code ?? t.lang_code;
|
||||
if (!code || !SUPPORTED_LANG_SET.has(code) || code === "en") continue;
|
||||
if (!code || !SUPPORTED_LANG_SET.has(code)) continue;
|
||||
if (code === sourceLang) continue; // skip same-language translations
|
||||
if (!t.word?.trim()) continue;
|
||||
|
||||
const key = `${code}:${t.word.trim()}`;
|
||||
|
|
@ -97,58 +108,80 @@ function extractExamples(sense: KaikkiSense): string[] {
|
|||
.filter((t): t is string => !!t);
|
||||
}
|
||||
|
||||
function processEntry(entry: KaikkiEntry): ExtractedSense[] {
|
||||
function processEntry(
|
||||
entry: KaikkiEntry,
|
||||
sourceLang: SupportedLanguageCode,
|
||||
): Omit<ExtractedSense, "sense_index">[] {
|
||||
const pos = mapPos(entry.pos ?? "");
|
||||
if (!pos) return [];
|
||||
if (!entry.word?.trim()) return [];
|
||||
|
||||
// For non-English files, only process entries in the target language
|
||||
const entryLang = (entry as Record<string, unknown>)["lang_code"] as
|
||||
| string
|
||||
| undefined;
|
||||
if (sourceLang !== "en" && entryLang !== sourceLang) return [];
|
||||
|
||||
const headword = entry.word.trim();
|
||||
const results: ExtractedSense[] = [];
|
||||
let senseIndex = 0;
|
||||
const results: Omit<ExtractedSense, "sense_index">[] = [];
|
||||
|
||||
for (const sense of entry.senses ?? []) {
|
||||
const gloss = sense.glosses?.[0]?.trim() ?? null;
|
||||
|
||||
// Skip abbreviation senses
|
||||
if (gloss && isAbbreviation(gloss)) continue;
|
||||
|
||||
const translations = extractTranslations(sense);
|
||||
|
||||
// Skip senses with no translations in our supported languages
|
||||
if (translations.length === 0) continue;
|
||||
|
||||
results.push({
|
||||
headword,
|
||||
pos,
|
||||
sense_index: senseIndex++,
|
||||
gloss,
|
||||
examples: extractExamples(sense),
|
||||
translations,
|
||||
});
|
||||
if (sourceLang === "en") {
|
||||
// English: require translations in supported languages
|
||||
const translations = extractTranslations(sense, sourceLang);
|
||||
if (translations.length === 0) continue;
|
||||
results.push({
|
||||
headword,
|
||||
language: sourceLang,
|
||||
pos,
|
||||
gloss,
|
||||
examples: extractExamples(sense),
|
||||
translations,
|
||||
});
|
||||
} else {
|
||||
// Non-English: just extract the entry, no translations needed
|
||||
results.push({
|
||||
headword,
|
||||
language: sourceLang,
|
||||
pos,
|
||||
gloss,
|
||||
examples: extractExamples(sense),
|
||||
translations: [],
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
// ── Main ──────────────────────────────────────────────────────────────────────
|
||||
// ── Extract ───────────────────────────────────────────────────────────────────
|
||||
|
||||
async function extract(sampleLimit?: number): Promise<void> {
|
||||
console.log("Extracting Kaikki English data...");
|
||||
console.log(` Source: ${PATHS.source}`);
|
||||
export async function extract(
|
||||
lang: SupportedLanguageCode,
|
||||
sampleLimit?: number,
|
||||
): Promise<void> {
|
||||
const filename = LANG_TO_FILE[lang];
|
||||
const sourcePath = path.join(SOURCES_DIR, filename);
|
||||
const outputPath = path.join(OUTPUT_DIR, `${lang}.json`);
|
||||
|
||||
if (sampleLimit) {
|
||||
console.log(` Sample mode: ${sampleLimit} entries`);
|
||||
}
|
||||
console.log(`\nExtracting ${lang}...`);
|
||||
console.log(` Source: ${sourcePath}`);
|
||||
if (sampleLimit) console.log(` Sample mode: ${sampleLimit} entries`);
|
||||
|
||||
await fs.promises.mkdir(path.dirname(PATHS.output), { recursive: true });
|
||||
await fs.promises.mkdir(OUTPUT_DIR, { recursive: true });
|
||||
|
||||
const fileStream = fs.createReadStream(PATHS.source);
|
||||
const fileStream = fs.createReadStream(sourcePath);
|
||||
const rl = readline.createInterface({
|
||||
input: fileStream,
|
||||
crlfDelay: Infinity,
|
||||
});
|
||||
|
||||
const senses: ExtractedSense[] = [];
|
||||
const senseIndexMap = new Map<string, number>();
|
||||
let linesRead = 0;
|
||||
let entriesProcessed = 0;
|
||||
let entriesSkipped = 0;
|
||||
|
|
@ -167,14 +200,20 @@ async function extract(sampleLimit?: number): Promise<void> {
|
|||
continue;
|
||||
}
|
||||
|
||||
const extracted = processEntry(entry);
|
||||
const extracted = processEntry(entry, lang);
|
||||
|
||||
if (extracted.length === 0) {
|
||||
entriesSkipped++;
|
||||
continue;
|
||||
}
|
||||
|
||||
senses.push(...extracted);
|
||||
for (const sense of extracted) {
|
||||
const key = `${sense.headword}|${sense.pos}`;
|
||||
const senseIndex = senseIndexMap.get(key) ?? 0;
|
||||
senseIndexMap.set(key, senseIndex + 1);
|
||||
senses.push({ ...sense, sense_index: senseIndex });
|
||||
}
|
||||
|
||||
entriesProcessed++;
|
||||
|
||||
if (entriesProcessed % 10_000 === 0) {
|
||||
|
|
@ -185,25 +224,34 @@ async function extract(sampleLimit?: number): Promise<void> {
|
|||
}
|
||||
|
||||
await fs.promises.writeFile(
|
||||
PATHS.output,
|
||||
outputPath,
|
||||
JSON.stringify(senses, null, 2),
|
||||
"utf-8",
|
||||
);
|
||||
|
||||
console.log(`\nExtraction complete:`);
|
||||
console.log(` Lines read: ${linesRead.toLocaleString()}`);
|
||||
console.log(` Entries processed: ${entriesProcessed.toLocaleString()}`);
|
||||
console.log(` Entries skipped: ${entriesSkipped.toLocaleString()}`);
|
||||
console.log(` Senses extracted: ${senses.length.toLocaleString()}`);
|
||||
console.log(` Output: ${PATHS.output}`);
|
||||
console.log(` Lines read: ${linesRead.toLocaleString()}`);
|
||||
console.log(` Entries processed: ${entriesProcessed.toLocaleString()}`);
|
||||
console.log(` Entries skipped: ${entriesSkipped.toLocaleString()}`);
|
||||
console.log(` Senses extracted: ${senses.length.toLocaleString()}`);
|
||||
console.log(` Output: ${outputPath}`);
|
||||
}
|
||||
|
||||
main().catch((err) => {
|
||||
console.error(err);
|
||||
process.exit(1);
|
||||
});
|
||||
// ── Main ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
async function main(): Promise<void> {
|
||||
// Hardcoded sample limit for initial testing — remove for full extraction
|
||||
await extract(500);
|
||||
// Hardcoded sample limit for development — remove for full extraction
|
||||
const SAMPLE = 500;
|
||||
|
||||
for (const lang of SUPPORTED_LANGUAGE_CODES) {
|
||||
await extract(lang, SAMPLE);
|
||||
}
|
||||
|
||||
console.log("\nExtraction complete.");
|
||||
}
|
||||
|
||||
if (import.meta.url === `file://${process.argv[1]}`) {
|
||||
main().catch((err) => {
|
||||
console.error(err);
|
||||
process.exit(1);
|
||||
});
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue