feat: update extractor for all 5 languages, update import for multi-language
- Extract.ts now processes all 5 language files, filters non-English entries by lang_code, and skips translation extraction for non-English entries (no translations in source files)
- Import.ts now imports all 5 language output files and uses the language field from ExtractedSense instead of hardcoding "en"
- Sample limit is hardcoded to 500 entries per language for development
This commit is contained in:
parent
209d52f54b
commit
0cc643e308
3 changed files with 173 additions and 107 deletions
|
|
@ -1,6 +1,7 @@
|
|||
import fs from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import { fileURLToPath } from "node:url";
|
||||
import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
|
||||
import { openDb } from "./index.js";
|
||||
import type { ExtractedSense } from "../stage-1-extract/scripts/extract.js";
|
||||
|
||||
|
|
@ -8,18 +9,11 @@ import type { ExtractedSense } from "../stage-1-extract/scripts/extract.js";
|
|||
|
||||
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
||||
|
||||
const PATHS = {
|
||||
extracted: path.resolve(__dirname, "../stage-1-extract/output/en.json"),
|
||||
};
|
||||
const OUTPUT_DIR = path.resolve(__dirname, "../stage-1-extract/output");
|
||||
|
||||
// ── Import ────────────────────────────────────────────────────────────────────
|
||||
|
||||
export async function importKaikki(): Promise<void> {
|
||||
console.log("Loading extracted Kaikki data...");
|
||||
const raw = await fs.readFile(PATHS.extracted, "utf-8");
|
||||
const senses = JSON.parse(raw) as ExtractedSense[];
|
||||
console.log(` Loaded ${senses.length.toLocaleString()} senses`);
|
||||
|
||||
const db = openDb();
|
||||
|
||||
const insertEntry = db.prepare(`
|
||||
|
|
@ -38,64 +32,86 @@ export async function importKaikki(): Promise<void> {
|
|||
ON CONFLICT (entry_id, target_lang, word) DO NOTHING
|
||||
`);
|
||||
|
||||
// Track next available sense_index per (headword, pos) to handle
|
||||
// the same word appearing in multiple JSONL entries with the same POS.
|
||||
const senseIndexMap = new Map<string, number>();
|
||||
let totalEntries = 0;
|
||||
let totalTranslations = 0;
|
||||
let totalSkipped = 0;
|
||||
|
||||
console.log("\nImporting into pipeline.db...");
|
||||
for (const lang of SUPPORTED_LANGUAGE_CODES) {
|
||||
const filePath = path.join(OUTPUT_DIR, `${lang}.json`);
|
||||
|
||||
const importAll = db.transaction(() => {
|
||||
let entries = 0;
|
||||
let translations = 0;
|
||||
let skipped = 0;
|
||||
|
||||
for (const sense of senses) {
|
||||
const key = `${sense.headword}|${sense.pos}`;
|
||||
const nextIndex = senseIndexMap.get(key) ?? 0;
|
||||
|
||||
// Use the offset sense_index to avoid collisions when the same word
|
||||
// appears in multiple JSONL entries with the same POS.
|
||||
const senseIndex = nextIndex;
|
||||
senseIndexMap.set(key, nextIndex + 1);
|
||||
|
||||
const row = insertEntry.get(
|
||||
sense.headword,
|
||||
"en",
|
||||
sense.pos,
|
||||
senseIndex,
|
||||
sense.gloss ?? null,
|
||||
JSON.stringify(sense.examples),
|
||||
) as { id: number } | undefined;
|
||||
|
||||
if (!row) {
|
||||
skipped++;
|
||||
continue;
|
||||
}
|
||||
|
||||
entries++;
|
||||
|
||||
for (const t of sense.translations) {
|
||||
insertTranslation.run(
|
||||
row.id,
|
||||
t.target_lang,
|
||||
t.word,
|
||||
t.sense_hint ?? null,
|
||||
);
|
||||
translations++;
|
||||
}
|
||||
let senses: ExtractedSense[];
|
||||
try {
|
||||
const raw = await fs.readFile(filePath, "utf-8");
|
||||
senses = JSON.parse(raw) as ExtractedSense[];
|
||||
} catch {
|
||||
console.warn(` Warning: no output file found for ${lang}, skipping`);
|
||||
continue;
|
||||
}
|
||||
|
||||
return { entries, translations, skipped };
|
||||
});
|
||||
console.log(
|
||||
` Importing ${lang}: ${senses.length.toLocaleString()} senses...`,
|
||||
);
|
||||
|
||||
const counts = importAll();
|
||||
// Track next available sense_index per (headword, pos) to handle
|
||||
// the same word appearing in multiple JSONL entries with the same POS.
|
||||
const senseIndexMap = new Map<string, number>();
|
||||
|
||||
console.log(` entries: ${counts.entries.toLocaleString()}`);
|
||||
console.log(` translations: ${counts.translations.toLocaleString()}`);
|
||||
console.log(` skipped: ${counts.skipped.toLocaleString()}`);
|
||||
const importLang = db.transaction(() => {
|
||||
let entries = 0;
|
||||
let translations = 0;
|
||||
let skipped = 0;
|
||||
|
||||
for (const sense of senses) {
|
||||
const key = `${sense.headword}|${sense.pos}`;
|
||||
const nextIndex = senseIndexMap.get(key) ?? 0;
|
||||
senseIndexMap.set(key, nextIndex + 1);
|
||||
|
||||
const row = insertEntry.get(
|
||||
sense.headword,
|
||||
sense.language,
|
||||
sense.pos,
|
||||
nextIndex,
|
||||
sense.gloss ?? null,
|
||||
JSON.stringify(sense.examples),
|
||||
) as { id: number } | undefined;
|
||||
|
||||
if (!row) {
|
||||
skipped++;
|
||||
continue;
|
||||
}
|
||||
|
||||
entries++;
|
||||
|
||||
for (const t of sense.translations) {
|
||||
insertTranslation.run(
|
||||
row.id,
|
||||
t.target_lang,
|
||||
t.word,
|
||||
t.sense_hint ?? null,
|
||||
);
|
||||
translations++;
|
||||
}
|
||||
}
|
||||
|
||||
return { entries, translations, skipped };
|
||||
});
|
||||
|
||||
const counts = importLang();
|
||||
totalEntries += counts.entries;
|
||||
totalTranslations += counts.translations;
|
||||
totalSkipped += counts.skipped;
|
||||
|
||||
console.log(
|
||||
` entries: ${counts.entries.toLocaleString()}, translations: ${counts.translations.toLocaleString()}, skipped: ${counts.skipped.toLocaleString()}`,
|
||||
);
|
||||
}
|
||||
|
||||
db.close();
|
||||
console.log("\nImport complete.");
|
||||
|
||||
console.log(`\nImport complete:`);
|
||||
console.log(` Total entries: ${totalEntries.toLocaleString()}`);
|
||||
console.log(` Total translations: ${totalTranslations.toLocaleString()}`);
|
||||
console.log(` Total skipped: ${totalSkipped.toLocaleString()}`);
|
||||
}
|
||||
|
||||
// ── Check if already imported ─────────────────────────────────────────────────
|
||||
|
|
@ -126,6 +142,7 @@ async function main(): Promise<void> {
|
|||
process.exit(0);
|
||||
}
|
||||
|
||||
console.log("Importing Kaikki data into pipeline.db...");
|
||||
await importKaikki();
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue