/**
 * OMW seed script — packages/db/src/seeding-datafiles.ts
 *
 * Reads one JSON file per part of speech from `dataDir`, then inserts the
 * synsets it contains into the `terms`, `translations` and `term_glosses`
 * tables in batches. Every insert uses `onConflictDoNothing()`, so rows that
 * conflict with existing ones are counted as "skipped" instead of failing.
 * NOTE(review): this relies on the schema declaring the relevant unique
 * constraints, which are not visible from this file — confirm.
 */
import fs from "node:fs/promises";
import { and, count, eq, inArray } from "drizzle-orm";

import { SUPPORTED_LANGUAGE_CODES, SUPPORTED_POS } from "@glossa/shared";
import { db } from "@glossa/db";
import { terms, translations, term_glosses } from "@glossa/db/schema";

type POS = (typeof SUPPORTED_POS)[number];
type LanguageCode = (typeof SUPPORTED_LANGUAGE_CODES)[number];

/** Shape of one entry in an `omw-<pos>.json` data file, as consumed below. */
type SynsetRecord = {
  source_id: string;
  pos: POS;
  translations: Partial<Record<LanguageCode, string[]>>;
  glosses: Partial<Record<LanguageCode, string[]>>;
};

// Resolved relative to the current working directory of the process.
const dataDir = "./src/data/datafiles/";
// Number of rows sent per INSERT statement (synsets, translations, glosses).
const BATCH_SIZE = 500;
// A progress line is printed once per this many synset batches.
const PROGRESS_EVERY_BATCHES = 10;

// ────────────────────────────────────────────────────────────
// Helpers
// ────────────────────────────────────────────────────────────

/**
 * Splits `arr` into consecutive slices of at most `size` elements.
 *
 * @param arr - Items to split; not mutated.
 * @param size - Maximum slice length; must be a positive integer.
 * @returns The slices in original order (empty when `arr` is empty).
 * @throws RangeError when `size` is not a positive integer — a size of 0 or
 *   below would otherwise make the loop run forever.
 */
function chunk<T>(arr: T[], size: number): T[][] {
  if (!Number.isInteger(size) || size <= 0) {
    throw new RangeError(`chunk size must be a positive integer, got ${size}`);
  }
  const out: T[][] = [];
  for (let i = 0; i < arr.length; i += size) out.push(arr.slice(i, i + size));
  return out;
}

/** Formats a number with en-US grouping separators for log output. */
function fmt(n: number): string {
  return n.toLocaleString("en-US");
}

// ────────────────────────────────────────────────────────────
// Stats
// ────────────────────────────────────────────────────────────

// Running totals across all files. "inserted" counts rows returned by the
// INSERT; "skipped" counts rows that were submitted but not returned.
const stats = {
  terms: { inserted: 0, skipped: 0 },
  translations: { inserted: 0, skipped: 0 },
  glosses: { inserted: 0, skipped: 0 },
};

// ────────────────────────────────────────────────────────────
// Per-batch processing
// ────────────────────────────────────────────────────────────

/**
 * Inserts one batch of synsets together with their translations and glosses,
 * and updates the module-level `stats` counters.
 *
 * @param batch - Synset records to insert; an empty batch is a no-op.
 * @returns A promise that resolves once all inserts for the batch finished.
 *   Rejects with whatever error the database client raises.
 */
async function processBatch(batch: SynsetRecord[]): Promise<void> {
  // Nothing to insert; also avoids calling `.values([])`, which the insert
  // builder is not expected to accept.
  if (batch.length === 0) return;

  // 1. Insert terms — idempotency key: (source, source_id)
  const termValues = batch.map((r) => ({
    source: "omw" as const,
    source_id: r.source_id,
    pos: r.pos,
  }));

  const insertedTerms = await db
    .insert(terms)
    .values(termValues)
    .onConflictDoNothing()
    .returning({ id: terms.id });

  stats.terms.inserted += insertedTerms.length;
  stats.terms.skipped += batch.length - insertedTerms.length;

  // 2. Resolve ids for every source_id in this batch (new + pre-existing).
  //    We can't rely solely on the .returning() above because
  //    onConflictDoNothing returns nothing for rows that already existed.
  const sourceIds = batch.map((r) => r.source_id);
  const termRows = await db
    .select({ id: terms.id, source_id: terms.source_id })
    .from(terms)
    .where(and(eq(terms.source, "omw"), inArray(terms.source_id, sourceIds)));

  const sourceIdToTermId = new Map(termRows.map((r) => [r.source_id, r.id]));

  // 3. Build and insert translation rows: one row per (term, language, lemma).
  const translationRows = batch.flatMap((r) => {
    const termId = sourceIdToTermId.get(r.source_id);
    if (!termId) return [];
    // `?? {}` tolerates records that carry no `translations` key at all,
    // matching how glosses are handled below.
    return Object.entries(r.translations ?? {}).flatMap(([lang, lemmas]) =>
      (lemmas ?? []).map((text) => ({
        term_id: termId,
        language_code: lang as LanguageCode,
        text,
      })),
    );
  });

  // A batch of synsets can fan out into many more rows, so re-chunk.
  for (const tBatch of chunk(translationRows, BATCH_SIZE)) {
    const inserted = await db
      .insert(translations)
      .values(tBatch)
      .onConflictDoNothing()
      .returning({ id: translations.id });

    stats.translations.inserted += inserted.length;
    stats.translations.skipped += tBatch.length - inserted.length;
  }

  // 4. Build and insert gloss rows: one row per (term, language, gloss text).
  const glossRows = batch.flatMap((r) => {
    const termId = sourceIdToTermId.get(r.source_id);
    if (!termId) return [];
    return Object.entries(r.glosses ?? {}).flatMap(([lang, texts]) =>
      (texts ?? []).map((text) => ({
        term_id: termId,
        language_code: lang as LanguageCode,
        text,
      })),
    );
  });

  for (const gBatch of chunk(glossRows, BATCH_SIZE)) {
    const inserted = await db
      .insert(term_glosses)
      .values(gBatch)
      .onConflictDoNothing()
      .returning({ id: term_glosses.id });

    stats.glosses.inserted += inserted.length;
    stats.glosses.skipped += gBatch.length - inserted.length;
  }
}

// ────────────────────────────────────────────────────────────
// Main
// ────────────────────────────────────────────────────────────

/**
 * Entry point: seeds every `omw-<pos>.json` file, then prints a summary of
 * the insert counters and the per-language row totals found in the database.
 * Files that cannot be read or parsed are skipped with a warning.
 */
const main = async (): Promise<void> => {
  console.log("\n##########################################");
  console.log("Glossa — OMW seed");
  console.log("##########################################\n");

  // One file per POS — names are derived from SUPPORTED_POS so adding a new
  // constant value automatically picks up a new file on the next run.
  const posToFile = Object.fromEntries(
    SUPPORTED_POS.map((pos) => [pos, `omw-${pos}.json`]),
  ) as Record<POS, string>;

  for (const pos of SUPPORTED_POS) {
    const filename = posToFile[pos];
    const filepath = dataDir + filename;

    console.log(`📄 ${filename}`);

    let records: SynsetRecord[];
    try {
      const raw = await fs.readFile(filepath, "utf8");
      const parsed: unknown = JSON.parse(raw);
      // Only the top-level shape is validated; a non-array payload would
      // otherwise fail later with an unrelated TypeError.
      if (!Array.isArray(parsed)) {
        throw new Error("expected a top-level JSON array");
      }
      records = parsed as SynsetRecord[];
    } catch (e: unknown) {
      const reason = e instanceof Error ? e.message : String(e);
      console.warn(` ⚠️ Skipping — could not read file: ${reason}\n`);
      continue;
    }

    console.log(` Loaded ${fmt(records.length)} synsets`);

    const batches = chunk(records, BATCH_SIZE);

    for (const [i, batch] of batches.entries()) {
      // Progress every PROGRESS_EVERY_BATCHES batches
      // (5 000 synsets with the current constants).
      if (i > 0 && i % PROGRESS_EVERY_BATCHES === 0) {
        const processed = i * BATCH_SIZE;
        console.log(` ⏳ ${fmt(processed)} / ${fmt(records.length)}`);
      }
      await processBatch(batch);
    }

    console.log(` ✅ Done\n`);
  }

  // ── Summary ───────────────────────────────────────────────

  console.log("##########################################");
  console.log("Summary");
  console.log("##########################################\n");

  const pad = (label: string) => label.padEnd(14);

  console.log(
    `${pad("Terms:")}inserted ${fmt(stats.terms.inserted)}, skipped ${fmt(stats.terms.skipped)}`,
  );
  console.log(
    `${pad("Translations:")}inserted ${fmt(stats.translations.inserted)}, skipped ${fmt(stats.translations.skipped)}`,
  );
  console.log(
    `${pad("Glosses:")}inserted ${fmt(stats.glosses.inserted)}, skipped ${fmt(stats.glosses.skipped)}`,
  );

  // Query actual DB totals — insert-based counters show 0 on re-runs.
  console.log("\nCoverage per language (total in DB):");
  for (const lang of SUPPORTED_LANGUAGE_CODES) {
    const [tRow] = await db
      .select({ n: count() })
      .from(translations)
      .where(eq(translations.language_code, lang));
    const [gRow] = await db
      .select({ n: count() })
      .from(term_glosses)
      .where(eq(term_glosses.language_code, lang));
    console.log(
      ` ${lang}: ${fmt(tRow?.n ?? 0)} translations, ${fmt(gRow?.n ?? 0)} glosses`,
    );
  }
};

// Any unhandled failure is logged and turned into a non-zero exit code.
main().catch((err) => {
  console.error(err);
  process.exit(1);
});