updating seeding script

This commit is contained in:
lila 2026-04-06 17:01:17 +02:00
parent aa1a332226
commit 570dbff25e

View file

@ -0,0 +1,212 @@
import fs from "node:fs/promises";
import { and, count, eq, inArray } from "drizzle-orm";
import { SUPPORTED_LANGUAGE_CODES, SUPPORTED_POS } from "@glossa/shared";
import { db } from "@glossa/db";
import { terms, translations, term_glosses } from "@glossa/db/schema";
/** Part-of-speech tag: any element of the shared `SUPPORTED_POS` list. */
type POS = (typeof SUPPORTED_POS)[number];
/** Language code: any element of the shared `SUPPORTED_LANGUAGE_CODES` list. */
type LanguageCode = (typeof SUPPORTED_LANGUAGE_CODES)[number];
/**
 * One synset entry as this script expects to find it in the JSON data files.
 * The shape is asserted with a cast after `JSON.parse`, not validated at runtime.
 */
type SynsetRecord = {
// Synset identifier from the data file; written to `terms.source_id` alongside source "omw".
source_id: string;
// Written to `terms.pos`.
pos: POS;
// Per-language string lists; each string becomes one row in `translations`.
translations: Partial<Record<LanguageCode, string[]>>;
// Per-language string lists; each string becomes one row in `term_glosses`.
glosses: Partial<Record<LanguageCode, string[]>>;
};
// Directory prefix that is concatenated with each data file name before reading.
// NOTE(review): relative path — presumably resolved against the process working
// directory, so the script must be started from the package root; confirm.
const dataDir = "./src/data/datafiles/";
// Number of synsets handed to processBatch at a time, and also the maximum
// number of translation / gloss rows sent in a single insert statement.
const BATCH_SIZE = 500;
// ────────────────────────────────────────────────────────────
// Helpers
// ────────────────────────────────────────────────────────────
/**
 * Splits an array into consecutive slices of at most `size` elements.
 *
 * The input is not modified. The last slice may be shorter than `size`, and an
 * empty input yields an empty result.
 *
 * @param arr - Items to partition, in order.
 * @param size - Maximum slice length; must be a positive integer.
 * @returns The slices, in the original element order.
 * @throws RangeError if `size` is not a positive integer. A zero or negative
 *   step would never advance the cursor and the loop would not terminate.
 */
function chunk<T>(arr: readonly T[], size: number): T[][] {
  if (!Number.isInteger(size) || size <= 0) {
    throw new RangeError(`chunk size must be a positive integer, got ${size}`);
  }
  const out: T[][] = [];
  for (let start = 0; start < arr.length; start += size) {
    out.push(arr.slice(start, start + size));
  }
  return out;
}
/**
 * Renders a number with en-US conventions (comma thousands separators),
 * e.g. 1234567 becomes "1,234,567". Used for all console output of counts.
 */
function fmt(n: number): string {
  const enUsFormatter = new Intl.NumberFormat("en-US");
  return enUsFormatter.format(n);
}
// ────────────────────────────────────────────────────────────
// Stats
// ────────────────────────────────────────────────────────────
/**
 * Running totals for the whole seed run, one counter pair per target table.
 * `inserted` counts rows an insert statement returned; `skipped` counts rows
 * that were submitted but not returned. Each table gets its own counter object.
 */
const stats = (() => {
  const zeroed = () => ({ inserted: 0, skipped: 0 });
  return {
    terms: zeroed(),
    translations: zeroed(),
    glosses: zeroed(),
  };
})();
// ────────────────────────────────────────────────────────────
// Per-batch processing
// ────────────────────────────────────────────────────────────
/**
 * Seeds one batch of synsets: term rows first, then their translations and
 * glosses.
 *
 * Every insert uses `onConflictDoNothing`, so rows the database rejects as
 * conflicts are counted as skipped instead of failing the run. The
 * module-level `stats` totals are updated as a side effect.
 *
 * @param batch - Synset records to seed. An empty batch is a no-op.
 */
async function processBatch(batch: SynsetRecord[]): Promise<void> {
  // Nothing to do — and drizzle's `.values()` rejects an empty array.
  if (batch.length === 0) return;

  // 1. Insert terms — idempotency key: (source, source_id)
  const termValues = batch.map((record) => ({
    source: "omw" as const,
    source_id: record.source_id,
    pos: record.pos,
  }));
  const insertedTerms = await db
    .insert(terms)
    .values(termValues)
    .onConflictDoNothing()
    .returning({ id: terms.id });
  stats.terms.inserted += insertedTerms.length;
  stats.terms.skipped += batch.length - insertedTerms.length;

  // 2. Resolve ids for every source_id in this batch (new + pre-existing).
  //    The .returning() above is not enough on its own: onConflictDoNothing
  //    returns nothing for rows that already existed.
  const sourceIds = batch.map((record) => record.source_id);
  const termRows = await db
    .select({ id: terms.id, source_id: terms.source_id })
    .from(terms)
    .where(and(eq(terms.source, "omw"), inArray(terms.source_id, sourceIds)));
  const sourceIdToTermId = new Map(
    termRows.map((row) => [row.source_id, row.id]),
  );

  // Shared row builder for steps 3 and 4: one row per (language, text) pair of
  // every record whose term id was resolved. Records without an id add nothing.
  const rowsFor = (
    textsOf: (record: SynsetRecord) => Partial<Record<LanguageCode, string[]>>,
  ) =>
    batch.flatMap((record) => {
      const termId = sourceIdToTermId.get(record.source_id);
      if (!termId) return [];
      return Object.entries(textsOf(record)).flatMap(([lang, texts]) =>
        (texts ?? []).map((text) => ({
          term_id: termId,
          language_code: lang as LanguageCode,
          text,
        })),
      );
    });

  // 3. Build and insert translation rows.
  //    The `?? {}` matters at runtime: records come from unvalidated JSON, and
  //    Object.entries(undefined) would throw for a record with no translations.
  const translationRows = rowsFor((record) => record.translations ?? {});
  for (const tBatch of chunk(translationRows, BATCH_SIZE)) {
    const inserted = await db
      .insert(translations)
      .values(tBatch)
      .onConflictDoNothing()
      .returning({ id: translations.id });
    stats.translations.inserted += inserted.length;
    stats.translations.skipped += tBatch.length - inserted.length;
  }

  // 4. Build and insert gloss rows (same guard for records with no glosses).
  const glossRows = rowsFor((record) => record.glosses ?? {});
  for (const gBatch of chunk(glossRows, BATCH_SIZE)) {
    const inserted = await db
      .insert(term_glosses)
      .values(gBatch)
      .onConflictDoNothing()
      .returning({ id: term_glosses.id });
    stats.glosses.inserted += inserted.length;
    stats.glosses.skipped += gBatch.length - inserted.length;
  }
}
// ────────────────────────────────────────────────────────────
// Main
// ────────────────────────────────────────────────────────────
/**
 * Runs the whole seed: reads one JSON data file per supported part of speech,
 * feeds its synsets to `processBatch` in chunks of `BATCH_SIZE`, then prints
 * the insert/skip counters and the per-language row totals found in the DB.
 *
 * A data file that cannot be read, is not valid JSON, or does not contain a
 * JSON array is skipped with a warning so the remaining files are still
 * seeded. Errors from the database are not caught here and reject the promise.
 */
const main = async (): Promise<void> => {
  console.log("\n##########################################");
  console.log("Glossa — OMW seed");
  console.log("##########################################\n");

  // One file per POS — the name is derived from each SUPPORTED_POS value, so
  // adding a new value there automatically picks up a new file on the next run.
  for (const pos of SUPPORTED_POS) {
    const filename = `omw-${pos}.json`;
    const filepath = dataDir + filename;
    console.log(`📄 ${filename}`);

    let records: SynsetRecord[];
    try {
      const parsed: unknown = JSON.parse(await fs.readFile(filepath, "utf8"));
      // Without this check a non-array payload would crash the run further
      // down (at `records.length`) instead of being skipped like other bad files.
      if (!Array.isArray(parsed)) {
        throw new TypeError("expected a JSON array of synset records");
      }
      // Only the outer array is verified; element shape is trusted as-is.
      records = parsed as SynsetRecord[];
    } catch (e) {
      const reason = e instanceof Error ? e.message : String(e);
      console.warn(` ⚠️ Skipping — could not read file: ${reason}\n`);
      continue;
    }
    console.log(` Loaded ${fmt(records.length)} synsets`);

    const batches = chunk(records, BATCH_SIZE);
    for (const [i, batch] of batches.entries()) {
      // Progress line every 10 batches (5 000 synsets at a BATCH_SIZE of 500).
      if (i > 0 && i % 10 === 0) {
        const processed = i * BATCH_SIZE;
        console.log(`${fmt(processed)} / ${fmt(records.length)}`);
      }
      await processBatch(batch);
    }
    console.log(` ✅ Done\n`);
  }

  // ── Summary ───────────────────────────────────────────────
  console.log("##########################################");
  console.log("Summary");
  console.log("##########################################\n");
  const pad = (label: string) => label.padEnd(14);
  console.log(
    `${pad("Terms:")}inserted ${fmt(stats.terms.inserted)}, skipped ${fmt(stats.terms.skipped)}`,
  );
  console.log(
    `${pad("Translations:")}inserted ${fmt(stats.translations.inserted)}, skipped ${fmt(stats.translations.skipped)}`,
  );
  console.log(
    `${pad("Glosses:")}inserted ${fmt(stats.glosses.inserted)}, skipped ${fmt(stats.glosses.skipped)}`,
  );

  // Query actual DB totals — insert-based counters show 0 on re-runs.
  console.log("\nCoverage per language (total in DB):");
  for (const lang of SUPPORTED_LANGUAGE_CODES) {
    const [tRow] = await db
      .select({ n: count() })
      .from(translations)
      .where(eq(translations.language_code, lang));
    const [gRow] = await db
      .select({ n: count() })
      .from(term_glosses)
      .where(eq(term_glosses.language_code, lang));
    console.log(
      ` ${lang}: ${fmt(tRow?.n ?? 0)} translations, ${fmt(gRow?.n ?? 0)} glosses`,
    );
  }
};
// Script entry point: start the seed; if it rejects, print the failure and
// terminate the process with exit status 1.
main().then(undefined, (failure) => {
  console.error(failure);
  process.exit(1);
});