- Add stage-1-extract/scripts/extract.ts — streams Kaikki JSONL, filters to supported POS and languages, skips abbreviations and senses with no translations in supported languages - Rewrite db/import.ts for Kaikki flat model — tracks sense_index offsets per headword+pos to handle duplicate JSONL entries - Rewrite db/schema.sql for Kaikki model — entries, translations, LLM vote tables, resolved tables - Add extract and db:import scripts to package.json - Sample mode hardcoded to 500 entries for development
137 lines
4.3 KiB
TypeScript
137 lines
4.3 KiB
TypeScript
import fs from "node:fs/promises";
|
|
import path from "node:path";
|
|
import { fileURLToPath } from "node:url";
|
|
import { openDb } from "./index.js";
|
|
import type { ExtractedSense } from "../stage-1-extract/scripts/extract.js";
|
|
|
|
// ── Paths ─────────────────────────────────────────────────────────────────────
|
|
|
|
// ES modules do not provide `__dirname`; derive it from this module's URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url));

// Absolute locations of the files this script reads.
const PATHS = {
  // JSON output of the stage-1 extract step, located relative to this module.
  extracted: path.join(
    __dirname,
    "..",
    "stage-1-extract",
    "output",
    "en.json",
  ),
};
|
|
|
|
// ── Import ────────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Imports the extracted senses from `PATHS.extracted` into the database
 * returned by `openDb()`.
 *
 * The file is read in full and parsed as a JSON array of `ExtractedSense`.
 * Each sense is upserted into `entries` (language is always "en") and each of
 * its translations is inserted into `translations`. All statements run inside
 * a single `db.transaction(...)` callback.
 *
 * The database handle is closed on every exit path, including failures.
 *
 * @throws Error if the file content is not a JSON array. Errors raised while
 *   reading the file, parsing JSON, or talking to the database propagate.
 */
export async function importKaikki(): Promise<void> {
  console.log("Loading extracted Kaikki data...");
  const raw = await fs.readFile(PATHS.extracted, "utf-8");

  // JSON.parse returns untyped data: check the top-level shape before trusting
  // it. NOTE(review): individual elements are still not validated.
  const parsed: unknown = JSON.parse(raw);
  if (!Array.isArray(parsed)) {
    throw new Error(`Expected a JSON array of senses in ${PATHS.extracted}`);
  }
  const senses = parsed as ExtractedSense[];
  console.log(` Loaded ${senses.length.toLocaleString()} senses`);

  const db = openDb();
  try {
    // Upsert: an existing (headword, language, pos, sense_index) row has its
    // gloss and examples overwritten; RETURNING yields the row id either way.
    const insertEntry = db.prepare(`
      INSERT INTO entries (headword, language, pos, sense_index, gloss, examples)
      VALUES (?, ?, ?, ?, ?, ?)
      ON CONFLICT (headword, language, pos, sense_index)
      DO UPDATE SET
        gloss = excluded.gloss,
        examples = excluded.examples
      RETURNING id
    `);

    // A translation already present for the entry is left untouched.
    const insertTranslation = db.prepare(`
      INSERT INTO translations (entry_id, target_lang, word, sense_hint)
      VALUES (?, ?, ?, ?)
      ON CONFLICT (entry_id, target_lang, word) DO NOTHING
    `);

    // Track next available sense_index per (headword, pos) to handle
    // the same word appearing in multiple JSONL entries with the same POS.
    const senseIndexMap = new Map<string, number>();

    console.log("\nImporting into pipeline.db...");

    const importAll = db.transaction(() => {
      let entries = 0;
      let translations = 0;
      let skipped = 0;

      for (const sense of senses) {
        // Use a running counter as sense_index so repeated (headword, pos)
        // pairs get consecutive, non-colliding indices.
        const key = `${sense.headword}|${sense.pos}`;
        const senseIndex = senseIndexMap.get(key) ?? 0;
        senseIndexMap.set(key, senseIndex + 1);

        const row = insertEntry.get(
          sense.headword,
          "en",
          sense.pos,
          senseIndex,
          sense.gloss ?? null,
          JSON.stringify(sense.examples),
        ) as { id: number } | undefined;

        // No row came back from the upsert: count it and move on.
        if (!row) {
          skipped++;
          continue;
        }

        entries++;

        for (const t of sense.translations) {
          insertTranslation.run(
            row.id,
            t.target_lang,
            t.word,
            t.sense_hint ?? null,
          );
          // Counts attempted inserts; rows ignored by ON CONFLICT DO NOTHING
          // are included in this number.
          translations++;
        }
      }

      return { entries, translations, skipped };
    });

    const counts = importAll();

    console.log(` entries: ${counts.entries.toLocaleString()}`);
    console.log(` translations: ${counts.translations.toLocaleString()}`);
    console.log(` skipped: ${counts.skipped.toLocaleString()}`);
  } finally {
    // Release the handle even when preparing or running the import throws.
    db.close();
  }

  console.log("\nImport complete.");
}
|
|
|
|
// ── Check if already imported ─────────────────────────────────────────────────
|
|
|
|
/**
 * Reports whether the `entries` table already contains at least one row.
 *
 * Opens its own handle via `openDb()` and closes it before returning, also
 * when the query throws (for example if the table does not exist yet).
 *
 * @returns `true` when the row count of `entries` is greater than zero.
 */
export function isImported(): boolean {
  const db = openDb();
  try {
    const row = db.prepare("SELECT COUNT(*) as count FROM entries").get() as {
      count: number;
    };
    return row.count > 0;
  } finally {
    db.close();
  }
}
|
|
|
|
// ── Main ─────────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Script entry point: runs the import only when `entries` is still empty.
 *
 * If rows already exist, prints how many and how to start fresh, then returns
 * without importing. Otherwise delegates to `importKaikki()`.
 */
async function main(): Promise<void> {
  const db = openDb();
  let existing: number;
  try {
    const row = db.prepare("SELECT COUNT(*) as count FROM entries").get() as {
      count: number;
    };
    existing = row.count;
  } finally {
    // Close the probe handle even if the count query throws.
    db.close();
  }

  if (existing > 0) {
    console.log(
      `pipeline.db already contains ${existing.toLocaleString()} entries — skipping import.`,
    );
    console.log("Delete pipeline.db and re-run db:init to start fresh.");
    // Return instead of calling process.exit(0): the process then ends on its
    // own with exit code 0 once pending stdout writes have been flushed.
    return;
  }

  await importKaikki();
}
|
|
|
|
// Run main() only when this module is the script Node was started with.
// File-system paths are compared instead of a hand-built `file://` string,
// which never matches on Windows or when the path contains characters that
// are percent-encoded in the module URL (e.g. spaces).
const entryScript = process.argv[1];
if (
  entryScript !== undefined &&
  fileURLToPath(import.meta.url) === path.resolve(entryScript)
) {
  main().catch((err: unknown) => {
    console.error(err);
    process.exit(1);
  });
}
|