feat: update extractor for all 5 languages, update import for multi-language
- extract.ts now processes all 5 language files, filters entries in non-English files by lang_code, and skips translation extraction for non-English languages (their source files contain no translations)
- import.ts now imports all 5 language output files and uses the `language` field from ExtractedSense instead of hardcoding "en"
- Sample limit is hardcoded to 500 entries per language for development
This commit is contained in:
parent
209d52f54b
commit
0cc643e308
3 changed files with 173 additions and 107 deletions
|
|
@ -1,6 +1,7 @@
|
||||||
import fs from "node:fs/promises";
|
import fs from "node:fs/promises";
|
||||||
import path from "node:path";
|
import path from "node:path";
|
||||||
import { fileURLToPath } from "node:url";
|
import { fileURLToPath } from "node:url";
|
||||||
|
import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
|
||||||
import { openDb } from "./index.js";
|
import { openDb } from "./index.js";
|
||||||
import type { ExtractedSense } from "../stage-1-extract/scripts/extract.js";
|
import type { ExtractedSense } from "../stage-1-extract/scripts/extract.js";
|
||||||
|
|
||||||
|
|
@ -8,18 +9,11 @@ import type { ExtractedSense } from "../stage-1-extract/scripts/extract.js";
|
||||||
|
|
||||||
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
||||||
|
|
||||||
const PATHS = {
|
const OUTPUT_DIR = path.resolve(__dirname, "../stage-1-extract/output");
|
||||||
extracted: path.resolve(__dirname, "../stage-1-extract/output/en.json"),
|
|
||||||
};
|
|
||||||
|
|
||||||
// ── Import ────────────────────────────────────────────────────────────────────
|
// ── Import ────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
export async function importKaikki(): Promise<void> {
|
export async function importKaikki(): Promise<void> {
|
||||||
console.log("Loading extracted Kaikki data...");
|
|
||||||
const raw = await fs.readFile(PATHS.extracted, "utf-8");
|
|
||||||
const senses = JSON.parse(raw) as ExtractedSense[];
|
|
||||||
console.log(` Loaded ${senses.length.toLocaleString()} senses`);
|
|
||||||
|
|
||||||
const db = openDb();
|
const db = openDb();
|
||||||
|
|
||||||
const insertEntry = db.prepare(`
|
const insertEntry = db.prepare(`
|
||||||
|
|
@ -38,64 +32,86 @@ export async function importKaikki(): Promise<void> {
|
||||||
ON CONFLICT (entry_id, target_lang, word) DO NOTHING
|
ON CONFLICT (entry_id, target_lang, word) DO NOTHING
|
||||||
`);
|
`);
|
||||||
|
|
||||||
// Track next available sense_index per (headword, pos) to handle
|
let totalEntries = 0;
|
||||||
// the same word appearing in multiple JSONL entries with the same POS.
|
let totalTranslations = 0;
|
||||||
const senseIndexMap = new Map<string, number>();
|
let totalSkipped = 0;
|
||||||
|
|
||||||
console.log("\nImporting into pipeline.db...");
|
for (const lang of SUPPORTED_LANGUAGE_CODES) {
|
||||||
|
const filePath = path.join(OUTPUT_DIR, `${lang}.json`);
|
||||||
|
|
||||||
const importAll = db.transaction(() => {
|
let senses: ExtractedSense[];
|
||||||
let entries = 0;
|
try {
|
||||||
let translations = 0;
|
const raw = await fs.readFile(filePath, "utf-8");
|
||||||
let skipped = 0;
|
senses = JSON.parse(raw) as ExtractedSense[];
|
||||||
|
} catch {
|
||||||
for (const sense of senses) {
|
console.warn(` Warning: no output file found for ${lang}, skipping`);
|
||||||
const key = `${sense.headword}|${sense.pos}`;
|
continue;
|
||||||
const nextIndex = senseIndexMap.get(key) ?? 0;
|
|
||||||
|
|
||||||
// Use the offset sense_index to avoid collisions when the same word
|
|
||||||
// appears in multiple JSONL entries with the same POS.
|
|
||||||
const senseIndex = nextIndex;
|
|
||||||
senseIndexMap.set(key, nextIndex + 1);
|
|
||||||
|
|
||||||
const row = insertEntry.get(
|
|
||||||
sense.headword,
|
|
||||||
"en",
|
|
||||||
sense.pos,
|
|
||||||
senseIndex,
|
|
||||||
sense.gloss ?? null,
|
|
||||||
JSON.stringify(sense.examples),
|
|
||||||
) as { id: number } | undefined;
|
|
||||||
|
|
||||||
if (!row) {
|
|
||||||
skipped++;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
entries++;
|
|
||||||
|
|
||||||
for (const t of sense.translations) {
|
|
||||||
insertTranslation.run(
|
|
||||||
row.id,
|
|
||||||
t.target_lang,
|
|
||||||
t.word,
|
|
||||||
t.sense_hint ?? null,
|
|
||||||
);
|
|
||||||
translations++;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return { entries, translations, skipped };
|
console.log(
|
||||||
});
|
` Importing ${lang}: ${senses.length.toLocaleString()} senses...`,
|
||||||
|
);
|
||||||
|
|
||||||
const counts = importAll();
|
// Track next available sense_index per (headword, pos) to handle
|
||||||
|
// the same word appearing in multiple JSONL entries with the same POS.
|
||||||
|
const senseIndexMap = new Map<string, number>();
|
||||||
|
|
||||||
console.log(` entries: ${counts.entries.toLocaleString()}`);
|
const importLang = db.transaction(() => {
|
||||||
console.log(` translations: ${counts.translations.toLocaleString()}`);
|
let entries = 0;
|
||||||
console.log(` skipped: ${counts.skipped.toLocaleString()}`);
|
let translations = 0;
|
||||||
|
let skipped = 0;
|
||||||
|
|
||||||
|
for (const sense of senses) {
|
||||||
|
const key = `${sense.headword}|${sense.pos}`;
|
||||||
|
const nextIndex = senseIndexMap.get(key) ?? 0;
|
||||||
|
senseIndexMap.set(key, nextIndex + 1);
|
||||||
|
|
||||||
|
const row = insertEntry.get(
|
||||||
|
sense.headword,
|
||||||
|
sense.language,
|
||||||
|
sense.pos,
|
||||||
|
nextIndex,
|
||||||
|
sense.gloss ?? null,
|
||||||
|
JSON.stringify(sense.examples),
|
||||||
|
) as { id: number } | undefined;
|
||||||
|
|
||||||
|
if (!row) {
|
||||||
|
skipped++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
entries++;
|
||||||
|
|
||||||
|
for (const t of sense.translations) {
|
||||||
|
insertTranslation.run(
|
||||||
|
row.id,
|
||||||
|
t.target_lang,
|
||||||
|
t.word,
|
||||||
|
t.sense_hint ?? null,
|
||||||
|
);
|
||||||
|
translations++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return { entries, translations, skipped };
|
||||||
|
});
|
||||||
|
|
||||||
|
const counts = importLang();
|
||||||
|
totalEntries += counts.entries;
|
||||||
|
totalTranslations += counts.translations;
|
||||||
|
totalSkipped += counts.skipped;
|
||||||
|
|
||||||
|
console.log(
|
||||||
|
` entries: ${counts.entries.toLocaleString()}, translations: ${counts.translations.toLocaleString()}, skipped: ${counts.skipped.toLocaleString()}`,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
db.close();
|
db.close();
|
||||||
console.log("\nImport complete.");
|
|
||||||
|
console.log(`\nImport complete:`);
|
||||||
|
console.log(` Total entries: ${totalEntries.toLocaleString()}`);
|
||||||
|
console.log(` Total translations: ${totalTranslations.toLocaleString()}`);
|
||||||
|
console.log(` Total skipped: ${totalSkipped.toLocaleString()}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Check if already imported ─────────────────────────────────────────────────
|
// ── Check if already imported ─────────────────────────────────────────────────
|
||||||
|
|
@ -126,6 +142,7 @@ async function main(): Promise<void> {
|
||||||
process.exit(0);
|
process.exit(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
console.log("Importing Kaikki data into pipeline.db...");
|
||||||
await importKaikki();
|
await importKaikki();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,7 @@
|
||||||
"private": true,
|
"private": true,
|
||||||
"type": "module",
|
"type": "module",
|
||||||
"scripts": {
|
"scripts": {
|
||||||
|
"extract": "tsx stage-1-extract/scripts/extract.ts",
|
||||||
"db:import": "tsx db/import.ts",
|
"db:import": "tsx db/import.ts",
|
||||||
"db:init": "tsx db/init.ts",
|
"db:init": "tsx db/init.ts",
|
||||||
"annotate": "tsx stage-2-annotate/scripts/annotate.ts",
|
"annotate": "tsx stage-2-annotate/scripts/annotate.ts",
|
||||||
|
|
|
||||||
|
|
@ -20,10 +20,16 @@ type KaikkiSense = {
|
||||||
translations?: KaikkiTranslation[];
|
translations?: KaikkiTranslation[];
|
||||||
};
|
};
|
||||||
|
|
||||||
type KaikkiEntry = { word?: string; pos?: string; senses?: KaikkiSense[] };
|
type KaikkiEntry = {
|
||||||
|
word?: string;
|
||||||
|
pos?: string;
|
||||||
|
lang_code?: string;
|
||||||
|
senses?: KaikkiSense[];
|
||||||
|
};
|
||||||
|
|
||||||
export type ExtractedSense = {
|
export type ExtractedSense = {
|
||||||
headword: string;
|
headword: string;
|
||||||
|
language: SupportedLanguageCode;
|
||||||
pos: SupportedPos;
|
pos: SupportedPos;
|
||||||
sense_index: number;
|
sense_index: number;
|
||||||
gloss: string | null;
|
gloss: string | null;
|
||||||
|
|
@ -39,12 +45,15 @@ export type ExtractedSense = {
|
||||||
|
|
||||||
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
||||||
|
|
||||||
const PATHS = {
|
const SOURCES_DIR = path.resolve(__dirname, "../sources");
|
||||||
source: path.resolve(
|
const OUTPUT_DIR = path.resolve(__dirname, "../output");
|
||||||
__dirname,
|
|
||||||
"../sources/kaikki.org-dictionary-English.jsonl",
|
const LANG_TO_FILE: Record<SupportedLanguageCode, string> = {
|
||||||
),
|
en: "kaikki.org-dictionary-English.jsonl",
|
||||||
output: path.resolve(__dirname, "../output/en.json"),
|
de: "kaikki.org-dictionary-German.jsonl",
|
||||||
|
it: "kaikki.org-dictionary-Italian.jsonl",
|
||||||
|
fr: "kaikki.org-dictionary-French.jsonl",
|
||||||
|
es: "kaikki.org-dictionary-Spanish.jsonl",
|
||||||
};
|
};
|
||||||
|
|
||||||
const POS_MAP: Record<string, SupportedPos> = {
|
const POS_MAP: Record<string, SupportedPos> = {
|
||||||
|
|
@ -68,13 +77,15 @@ function isAbbreviation(gloss: string): boolean {
|
||||||
|
|
||||||
function extractTranslations(
|
function extractTranslations(
|
||||||
sense: KaikkiSense,
|
sense: KaikkiSense,
|
||||||
|
sourceLang: SupportedLanguageCode,
|
||||||
): ExtractedSense["translations"] {
|
): ExtractedSense["translations"] {
|
||||||
const seen = new Set<string>();
|
const seen = new Set<string>();
|
||||||
const result: ExtractedSense["translations"] = [];
|
const result: ExtractedSense["translations"] = [];
|
||||||
|
|
||||||
for (const t of sense.translations ?? []) {
|
for (const t of sense.translations ?? []) {
|
||||||
const code = t.code ?? t.lang_code;
|
const code = t.code ?? t.lang_code;
|
||||||
if (!code || !SUPPORTED_LANG_SET.has(code) || code === "en") continue;
|
if (!code || !SUPPORTED_LANG_SET.has(code)) continue;
|
||||||
|
if (code === sourceLang) continue; // skip same-language translations
|
||||||
if (!t.word?.trim()) continue;
|
if (!t.word?.trim()) continue;
|
||||||
|
|
||||||
const key = `${code}:${t.word.trim()}`;
|
const key = `${code}:${t.word.trim()}`;
|
||||||
|
|
@ -97,58 +108,80 @@ function extractExamples(sense: KaikkiSense): string[] {
|
||||||
.filter((t): t is string => !!t);
|
.filter((t): t is string => !!t);
|
||||||
}
|
}
|
||||||
|
|
||||||
function processEntry(entry: KaikkiEntry): ExtractedSense[] {
|
function processEntry(
|
||||||
|
entry: KaikkiEntry,
|
||||||
|
sourceLang: SupportedLanguageCode,
|
||||||
|
): Omit<ExtractedSense, "sense_index">[] {
|
||||||
const pos = mapPos(entry.pos ?? "");
|
const pos = mapPos(entry.pos ?? "");
|
||||||
if (!pos) return [];
|
if (!pos) return [];
|
||||||
if (!entry.word?.trim()) return [];
|
if (!entry.word?.trim()) return [];
|
||||||
|
|
||||||
|
// For non-English files, only process entries in the target language
|
||||||
|
const entryLang = (entry as Record<string, unknown>)["lang_code"] as
|
||||||
|
| string
|
||||||
|
| undefined;
|
||||||
|
if (sourceLang !== "en" && entryLang !== sourceLang) return [];
|
||||||
|
|
||||||
const headword = entry.word.trim();
|
const headword = entry.word.trim();
|
||||||
const results: ExtractedSense[] = [];
|
const results: Omit<ExtractedSense, "sense_index">[] = [];
|
||||||
let senseIndex = 0;
|
|
||||||
|
|
||||||
for (const sense of entry.senses ?? []) {
|
for (const sense of entry.senses ?? []) {
|
||||||
const gloss = sense.glosses?.[0]?.trim() ?? null;
|
const gloss = sense.glosses?.[0]?.trim() ?? null;
|
||||||
|
|
||||||
// Skip abbreviation senses
|
|
||||||
if (gloss && isAbbreviation(gloss)) continue;
|
if (gloss && isAbbreviation(gloss)) continue;
|
||||||
|
|
||||||
const translations = extractTranslations(sense);
|
if (sourceLang === "en") {
|
||||||
|
// English: require translations in supported languages
|
||||||
// Skip senses with no translations in our supported languages
|
const translations = extractTranslations(sense, sourceLang);
|
||||||
if (translations.length === 0) continue;
|
if (translations.length === 0) continue;
|
||||||
|
results.push({
|
||||||
results.push({
|
headword,
|
||||||
headword,
|
language: sourceLang,
|
||||||
pos,
|
pos,
|
||||||
sense_index: senseIndex++,
|
gloss,
|
||||||
gloss,
|
examples: extractExamples(sense),
|
||||||
examples: extractExamples(sense),
|
translations,
|
||||||
translations,
|
});
|
||||||
});
|
} else {
|
||||||
|
// Non-English: just extract the entry, no translations needed
|
||||||
|
results.push({
|
||||||
|
headword,
|
||||||
|
language: sourceLang,
|
||||||
|
pos,
|
||||||
|
gloss,
|
||||||
|
examples: extractExamples(sense),
|
||||||
|
translations: [],
|
||||||
|
});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return results;
|
return results;
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Main ──────────────────────────────────────────────────────────────────────
|
// ── Extract ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
async function extract(sampleLimit?: number): Promise<void> {
|
export async function extract(
|
||||||
console.log("Extracting Kaikki English data...");
|
lang: SupportedLanguageCode,
|
||||||
console.log(` Source: ${PATHS.source}`);
|
sampleLimit?: number,
|
||||||
|
): Promise<void> {
|
||||||
|
const filename = LANG_TO_FILE[lang];
|
||||||
|
const sourcePath = path.join(SOURCES_DIR, filename);
|
||||||
|
const outputPath = path.join(OUTPUT_DIR, `${lang}.json`);
|
||||||
|
|
||||||
if (sampleLimit) {
|
console.log(`\nExtracting ${lang}...`);
|
||||||
console.log(` Sample mode: ${sampleLimit} entries`);
|
console.log(` Source: ${sourcePath}`);
|
||||||
}
|
if (sampleLimit) console.log(` Sample mode: ${sampleLimit} entries`);
|
||||||
|
|
||||||
await fs.promises.mkdir(path.dirname(PATHS.output), { recursive: true });
|
await fs.promises.mkdir(OUTPUT_DIR, { recursive: true });
|
||||||
|
|
||||||
const fileStream = fs.createReadStream(PATHS.source);
|
const fileStream = fs.createReadStream(sourcePath);
|
||||||
const rl = readline.createInterface({
|
const rl = readline.createInterface({
|
||||||
input: fileStream,
|
input: fileStream,
|
||||||
crlfDelay: Infinity,
|
crlfDelay: Infinity,
|
||||||
});
|
});
|
||||||
|
|
||||||
const senses: ExtractedSense[] = [];
|
const senses: ExtractedSense[] = [];
|
||||||
|
const senseIndexMap = new Map<string, number>();
|
||||||
let linesRead = 0;
|
let linesRead = 0;
|
||||||
let entriesProcessed = 0;
|
let entriesProcessed = 0;
|
||||||
let entriesSkipped = 0;
|
let entriesSkipped = 0;
|
||||||
|
|
@ -167,14 +200,20 @@ async function extract(sampleLimit?: number): Promise<void> {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
const extracted = processEntry(entry);
|
const extracted = processEntry(entry, lang);
|
||||||
|
|
||||||
if (extracted.length === 0) {
|
if (extracted.length === 0) {
|
||||||
entriesSkipped++;
|
entriesSkipped++;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
senses.push(...extracted);
|
for (const sense of extracted) {
|
||||||
|
const key = `${sense.headword}|${sense.pos}`;
|
||||||
|
const senseIndex = senseIndexMap.get(key) ?? 0;
|
||||||
|
senseIndexMap.set(key, senseIndex + 1);
|
||||||
|
senses.push({ ...sense, sense_index: senseIndex });
|
||||||
|
}
|
||||||
|
|
||||||
entriesProcessed++;
|
entriesProcessed++;
|
||||||
|
|
||||||
if (entriesProcessed % 10_000 === 0) {
|
if (entriesProcessed % 10_000 === 0) {
|
||||||
|
|
@ -185,25 +224,34 @@ async function extract(sampleLimit?: number): Promise<void> {
|
||||||
}
|
}
|
||||||
|
|
||||||
await fs.promises.writeFile(
|
await fs.promises.writeFile(
|
||||||
PATHS.output,
|
outputPath,
|
||||||
JSON.stringify(senses, null, 2),
|
JSON.stringify(senses, null, 2),
|
||||||
"utf-8",
|
"utf-8",
|
||||||
);
|
);
|
||||||
|
|
||||||
console.log(`\nExtraction complete:`);
|
console.log(` Lines read: ${linesRead.toLocaleString()}`);
|
||||||
console.log(` Lines read: ${linesRead.toLocaleString()}`);
|
console.log(` Entries processed: ${entriesProcessed.toLocaleString()}`);
|
||||||
console.log(` Entries processed: ${entriesProcessed.toLocaleString()}`);
|
console.log(` Entries skipped: ${entriesSkipped.toLocaleString()}`);
|
||||||
console.log(` Entries skipped: ${entriesSkipped.toLocaleString()}`);
|
console.log(` Senses extracted: ${senses.length.toLocaleString()}`);
|
||||||
console.log(` Senses extracted: ${senses.length.toLocaleString()}`);
|
console.log(` Output: ${outputPath}`);
|
||||||
console.log(` Output: ${PATHS.output}`);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
main().catch((err) => {
|
// ── Main ─────────────────────────────────────────────────────────────────────
|
||||||
console.error(err);
|
|
||||||
process.exit(1);
|
|
||||||
});
|
|
||||||
|
|
||||||
async function main(): Promise<void> {
|
async function main(): Promise<void> {
|
||||||
// Hardcoded sample limit for initial testing — remove for full extraction
|
// Hardcoded sample limit for development — remove for full extraction
|
||||||
await extract(500);
|
const SAMPLE = 500;
|
||||||
|
|
||||||
|
for (const lang of SUPPORTED_LANGUAGE_CODES) {
|
||||||
|
await extract(lang, SAMPLE);
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log("\nExtraction complete.");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (import.meta.url === `file://${process.argv[1]}`) {
|
||||||
|
main().catch((err) => {
|
||||||
|
console.error(err);
|
||||||
|
process.exit(1);
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue