- Extract.ts now processes all 5 language files, filters non-English entries by lang_code, and skips translation extraction for non-English entries (their source files contain no translations).
- Import.ts now imports all 5 language output files and uses the language field from ExtractedSense instead of hardcoding "en".
- The sample limit is hardcoded to 500 entries per language for development.
154 lines
4.9 KiB
TypeScript
154 lines
4.9 KiB
TypeScript
import fs from "node:fs/promises";
|
|
import path from "node:path";
|
|
import { fileURLToPath } from "node:url";
|
|
import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
|
|
import { openDb } from "./index.js";
|
|
import type { ExtractedSense } from "../stage-1-extract/scripts/extract.js";
|
|
|
|
// ── Paths ─────────────────────────────────────────────────────────────────────
|
|
|
|
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
|
|
const OUTPUT_DIR = path.resolve(__dirname, "../stage-1-extract/output");
|
|
|
|
// ── Import ────────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Imports the stage-1 extraction output into the pipeline database.
 *
 * For every code in `SUPPORTED_LANGUAGE_CODES`, reads `<code>.json` from
 * `OUTPUT_DIR`, expects a JSON array of `ExtractedSense`, and upserts each
 * sense into `entries` followed by its translations into `translations`.
 * The writes for one language file run inside one `db.transaction` wrapper.
 *
 * A language whose file is missing, unreadable, or not a JSON array is
 * skipped with a warning that states the actual reason; the remaining
 * languages are still processed.
 *
 * The database handle is closed on every exit path, including when a
 * statement or transaction throws. Per-language and overall counts are
 * logged to the console.
 */
export async function importKaikki(): Promise<void> {
  const db = openDb();

  let totalEntries = 0;
  let totalTranslations = 0;
  let totalSkipped = 0;

  try {
    const insertEntry = db.prepare(`
      INSERT INTO entries (headword, language, pos, sense_index, gloss, examples)
      VALUES (?, ?, ?, ?, ?, ?)
      ON CONFLICT (headword, language, pos, sense_index)
      DO UPDATE SET
        gloss = excluded.gloss,
        examples = excluded.examples
      RETURNING id
    `);

    const insertTranslation = db.prepare(`
      INSERT INTO translations (entry_id, target_lang, word, sense_hint)
      VALUES (?, ?, ?, ?)
      ON CONFLICT (entry_id, target_lang, word) DO NOTHING
    `);

    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      const filePath = path.join(OUTPUT_DIR, `${lang}.json`);

      let senses: ExtractedSense[];
      try {
        const raw = await fs.readFile(filePath, "utf-8");
        const parsed: unknown = JSON.parse(raw);
        // Reject anything that cannot be iterated as a list of senses before
        // it reaches the import loop.
        if (!Array.isArray(parsed)) {
          throw new TypeError("expected a JSON array of senses");
        }
        senses = parsed as ExtractedSense[];
      } catch (err: unknown) {
        // Only a missing file means "nothing extracted for this language";
        // any other failure (permissions, malformed JSON) is reported as such
        // so it is not mistaken for an absent file.
        const isMissingFile =
          err instanceof Error &&
          (err as { code?: unknown }).code === "ENOENT";
        if (isMissingFile) {
          console.warn(` Warning: no output file found for ${lang}, skipping`);
        } else {
          const reason = err instanceof Error ? err.message : String(err);
          console.warn(
            ` Warning: could not read or parse ${filePath} (${reason}), skipping ${lang}`,
          );
        }
        continue;
      }

      console.log(
        ` Importing ${lang}: ${senses.length.toLocaleString()} senses...`,
      );

      // Track next available sense_index per (headword, pos) to handle
      // the same word appearing in multiple JSONL entries with the same POS.
      const senseIndexMap = new Map<string, number>();

      const importLang = db.transaction(() => {
        let entries = 0;
        let translations = 0;
        let skipped = 0;

        for (const sense of senses) {
          const key = `${sense.headword}|${sense.pos}`;
          const nextIndex = senseIndexMap.get(key) ?? 0;
          senseIndexMap.set(key, nextIndex + 1);

          const row = insertEntry.get(
            sense.headword,
            sense.language,
            sense.pos,
            nextIndex,
            sense.gloss ?? null,
            JSON.stringify(sense.examples),
          ) as { id: number } | undefined;

          // Without a returned id there is nothing to attach translations to.
          if (!row) {
            skipped++;
            continue;
          }

          entries++;

          for (const t of sense.translations) {
            insertTranslation.run(
              row.id,
              t.target_lang,
              t.word,
              t.sense_hint ?? null,
            );
            // Counts attempted inserts; rows dropped by DO NOTHING are
            // included in this number.
            translations++;
          }
        }

        return { entries, translations, skipped };
      });

      const counts = importLang();
      totalEntries += counts.entries;
      totalTranslations += counts.translations;
      totalSkipped += counts.skipped;

      console.log(
        ` entries: ${counts.entries.toLocaleString()}, translations: ${counts.translations.toLocaleString()}, skipped: ${counts.skipped.toLocaleString()}`,
      );
    }
  } finally {
    // Release the handle even when a statement or transaction throws.
    db.close();
  }

  console.log(`\nImport complete:`);
  console.log(` Total entries: ${totalEntries.toLocaleString()}`);
  console.log(` Total translations: ${totalTranslations.toLocaleString()}`);
  console.log(` Total skipped: ${totalSkipped.toLocaleString()}`);
}
|
|
|
|
// ── Check if already imported ─────────────────────────────────────────────────
|
|
|
|
/**
 * Reports whether the pipeline database already holds imported data.
 *
 * Opens the database, counts the rows in `entries`, and closes the handle
 * again. The close also happens when the query throws, so a failed check
 * does not leave the handle open.
 *
 * @returns `true` when `entries` contains at least one row.
 */
export function isImported(): boolean {
  const db = openDb();
  try {
    const row = db.prepare("SELECT COUNT(*) as count FROM entries").get() as {
      count: number;
    };
    return row.count > 0;
  } finally {
    db.close();
  }
}
|
|
|
|
// ── Main ─────────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Script entry point: imports the Kaikki data unless `entries` already has
 * rows.
 *
 * When rows are present, prints how many there are and how to start over,
 * then ends the process with status 0 instead of importing.
 */
async function main(): Promise<void> {
  const handle = openDb();
  const countStatement = handle.prepare(
    "SELECT COUNT(*) as count FROM entries",
  );
  const existing = countStatement.get() as { count: number };
  handle.close();

  const alreadyPopulated = existing.count > 0;
  if (alreadyPopulated) {
    const formattedCount = existing.count.toLocaleString();
    console.log(
      `pipeline.db already contains ${formattedCount} entries — skipping import.`,
    );
    console.log("Delete pipeline.db and re-run db:init to start fresh.");
    process.exit(0);
  }

  console.log("Importing Kaikki data into pipeline.db...");
  await importKaikki();
}
|
|
|
|
// Run main() only when this module is the script Node was started with.
// The comparison is done on filesystem paths: a `file://` URL built by string
// concatenation does not equal `import.meta.url` when the path needs
// percent-encoding (e.g. contains spaces) or is a Windows path, which would
// make the script silently do nothing.
const invokedScript = process.argv[1];
if (
  invokedScript !== undefined &&
  fileURLToPath(import.meta.url) === path.resolve(invokedScript)
) {
  main().catch((err: unknown) => {
    console.error(err);
    process.exit(1);
  });
}
|