updating seeding script
This commit is contained in:
parent
aa1a332226
commit
570dbff25e
1 changed file with 212 additions and 0 deletions
|
|
@@ -0,0 +1,212 @@
|
|||
import fs from "node:fs/promises";
|
||||
import { and, count, eq, inArray } from "drizzle-orm";
|
||||
|
||||
import { SUPPORTED_LANGUAGE_CODES, SUPPORTED_POS } from "@glossa/shared";
|
||||
import { db } from "@glossa/db";
|
||||
import { terms, translations, term_glosses } from "@glossa/db/schema";
|
||||
|
||||
/** Union of the language codes listed in the shared SUPPORTED_LANGUAGE_CODES constant. */
type LanguageCode = (typeof SUPPORTED_LANGUAGE_CODES)[number];

/** Union of the part-of-speech values listed in the shared SUPPORTED_POS constant. */
type POS = (typeof SUPPORTED_POS)[number];

/** A value per language; any language may be absent. */
type PerLanguage<T> = Partial<Record<LanguageCode, T>>;

/** Shape this script expects for each entry of a seed JSON file. */
type SynsetRecord = {
  /** Identifier of the synset in its source; stored as the term's `source_id`. */
  source_id: string;
  /** Part of speech of the synset. */
  pos: POS;
  /** Texts inserted as translation rows, grouped by language. */
  translations: PerLanguage<string[]>;
  /** Texts inserted as gloss rows, grouped by language. */
  glosses: PerLanguage<string[]>;
};
|
||||
|
||||
/** Maximum number of items per batch: synsets per processing pass and rows per insert statement. */
const BATCH_SIZE = 500;

/** Directory prefix prepended to each seed file name (a relative path, trailing slash included). */
const dataDir = "./src/data/datafiles/";
|
||||
|
||||
// ────────────────────────────────────────────────────────────
|
||||
// Helpers
|
||||
// ────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Splits `arr` into consecutive slices of at most `size` elements.
 *
 * The last slice may be shorter than `size`; an empty input yields an empty
 * result. The input array is not modified.
 *
 * @param arr - Items to partition.
 * @param size - Maximum slice length; must be a positive integer.
 * @returns The slices, in input order.
 * @throws RangeError if `size` is not a positive integer. A zero or negative
 *   step would never advance the loop, and a fractional step would produce
 *   uneven or empty slices.
 */
function chunk<T>(arr: T[], size: number): T[][] {
  if (!Number.isInteger(size) || size <= 0) {
    throw new RangeError(`chunk size must be a positive integer, got ${size}`);
  }

  const out: T[][] = [];
  for (let start = 0; start < arr.length; start += size) {
    out.push(arr.slice(start, start + size));
  }
  return out;
}
|
||||
|
||||
/**
 * Renders a number using en-US conventions (comma thousands separators),
 * e.g. 1234567 → "1,234,567".
 *
 * @param n - The number to render.
 * @returns The formatted string.
 */
function fmt(n: number): string {
  const formatter = new Intl.NumberFormat("en-US");
  return formatter.format(n);
}
|
||||
|
||||
// ────────────────────────────────────────────────────────────
|
||||
// Stats
|
||||
// ────────────────────────────────────────────────────────────
|
||||
|
||||
/** Running totals for one table: rows actually inserted vs. rows skipped on conflict. */
type Tally = { inserted: number; skipped: number };

/** Creates a fresh zeroed tally so each table gets its own mutable counter object. */
const zeroTally = (): Tally => ({ inserted: 0, skipped: 0 });

/** Module-wide counters, mutated while batches are processed and printed in the final summary. */
const stats: Record<"terms" | "translations" | "glosses", Tally> = {
  terms: zeroTally(),
  translations: zeroTally(),
  glosses: zeroTally(),
};
|
||||
|
||||
// ────────────────────────────────────────────────────────────
|
||||
// Per-batch processing
|
||||
// ────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Expands one per-language field of every record in `batch` into flat rows,
 * one row per (term, language, text).
 *
 * Records whose `source_id` has no entry in `termIdBySourceId` contribute no
 * rows. The records come from unvalidated JSON, so a missing field or a
 * missing per-language list is treated as empty instead of throwing.
 */
function buildLocalizedRows<TId>(
  batch: SynsetRecord[],
  termIdBySourceId: ReadonlyMap<string, TId>,
  field: "translations" | "glosses",
): { term_id: TId; language_code: LanguageCode; text: string }[] {
  return batch.flatMap((record) => {
    const termId = termIdBySourceId.get(record.source_id);
    if (!termId) return [];
    return Object.entries(record[field] ?? {}).flatMap(([lang, texts]) =>
      (texts ?? []).map((text) => ({
        term_id: termId,
        language_code: lang as LanguageCode,
        text,
      })),
    );
  });
}

/**
 * Seeds one batch of synsets: inserts the terms, then their translations and
 * glosses, updating the module-level `stats` counters along the way.
 *
 * Every insert uses `onConflictDoNothing()`, so rows that already exist are
 * counted as skipped rather than failing the run.
 *
 * @param batch - Synset records to insert; an empty batch is a no-op.
 * @returns A promise that resolves once all inserts for the batch have run.
 */
async function processBatch(batch: SynsetRecord[]): Promise<void> {
  // Nothing to insert or look up; also avoids building an insert with an
  // empty values list and an IN () filter with no elements.
  if (batch.length === 0) return;

  // 1. Insert terms — idempotency key: (source, source_id)
  const termValues = batch.map((r) => ({
    source: "omw" as const,
    source_id: r.source_id,
    pos: r.pos,
  }));

  const insertedTerms = await db
    .insert(terms)
    .values(termValues)
    .onConflictDoNothing()
    .returning({ id: terms.id });

  stats.terms.inserted += insertedTerms.length;
  stats.terms.skipped += batch.length - insertedTerms.length;

  // 2. Resolve IDs for every source_id in this batch (new + pre-existing).
  //    We can't rely solely on the .returning() above because
  //    onConflictDoNothing returns nothing for rows that already existed.
  const sourceIds = batch.map((r) => r.source_id);
  const termRows = await db
    .select({ id: terms.id, source_id: terms.source_id })
    .from(terms)
    .where(and(eq(terms.source, "omw"), inArray(terms.source_id, sourceIds)));

  const sourceIdToTermId = new Map(termRows.map((r) => [r.source_id, r.id]));

  // 3. Build and insert translation rows
  const translationRows = buildLocalizedRows(
    batch,
    sourceIdToTermId,
    "translations",
  );

  for (const tBatch of chunk(translationRows, BATCH_SIZE)) {
    const inserted = await db
      .insert(translations)
      .values(tBatch)
      .onConflictDoNothing()
      .returning({ id: translations.id });

    stats.translations.inserted += inserted.length;
    stats.translations.skipped += tBatch.length - inserted.length;
  }

  // 4. Build and insert gloss rows
  const glossRows = buildLocalizedRows(batch, sourceIdToTermId, "glosses");

  for (const gBatch of chunk(glossRows, BATCH_SIZE)) {
    const inserted = await db
      .insert(term_glosses)
      .values(gBatch)
      .onConflictDoNothing()
      .returning({ id: term_glosses.id });

    stats.glosses.inserted += inserted.length;
    stats.glosses.skipped += gBatch.length - inserted.length;
  }
}
|
||||
|
||||
// ────────────────────────────────────────────────────────────
|
||||
// Main
|
||||
// ────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Entry point of the seed script.
 *
 * For each value in SUPPORTED_POS it reads `omw-<pos>.json` from `dataDir`,
 * feeds the records to `processBatch` in groups of BATCH_SIZE, then prints the
 * insert/skip counters and the per-language row totals queried from the
 * database.
 *
 * A file that cannot be read, is not valid JSON, or does not contain a
 * top-level array is reported with a warning and skipped; the remaining files
 * are still processed.
 */
const main = async (): Promise<void> => {
  console.log("\n##########################################");
  console.log("Glossa — OMW seed");
  console.log("##########################################\n");

  // Report progress once every this many batches (10 × 500 = 5 000 synsets).
  const progressEveryBatches = 10;

  // One file per POS — names are derived from SUPPORTED_POS so adding a new
  // constant value automatically picks up a new file on the next run.
  for (const pos of SUPPORTED_POS) {
    const filename = `omw-${pos}.json`;
    const filepath = dataDir + filename;

    console.log(`📄 ${filename}`);

    let records: SynsetRecord[];
    try {
      const raw = await fs.readFile(filepath, "utf8");
      const parsed: unknown = JSON.parse(raw);
      // Only the top-level shape is checked here; without it a non-array
      // file would escape this skip path and crash further down.
      if (!Array.isArray(parsed)) {
        throw new TypeError("expected a top-level JSON array of synsets");
      }
      records = parsed as SynsetRecord[];
    } catch (e) {
      const reason = e instanceof Error ? e.message : String(e);
      console.warn(` ⚠️ Skipping — could not read file: ${reason}\n`);
      continue;
    }

    console.log(` Loaded ${fmt(records.length)} synsets`);

    const batches = chunk(records, BATCH_SIZE);

    for (const [i, batch] of batches.entries()) {
      // Logged before batch i runs, so exactly i full batches are done.
      if (i > 0 && i % progressEveryBatches === 0) {
        const processed = i * BATCH_SIZE;
        console.log(` ⏳ ${fmt(processed)} / ${fmt(records.length)}`);
      }
      await processBatch(batch);
    }

    console.log(` ✅ Done\n`);
  }

  // ── Summary ───────────────────────────────────────────────

  console.log("##########################################");
  console.log("Summary");
  console.log("##########################################\n");

  const pad = (label: string) => label.padEnd(14);

  console.log(
    `${pad("Terms:")}inserted ${fmt(stats.terms.inserted)}, skipped ${fmt(stats.terms.skipped)}`,
  );
  console.log(
    `${pad("Translations:")}inserted ${fmt(stats.translations.inserted)}, skipped ${fmt(stats.translations.skipped)}`,
  );
  console.log(
    `${pad("Glosses:")}inserted ${fmt(stats.glosses.inserted)}, skipped ${fmt(stats.glosses.skipped)}`,
  );

  // Query actual DB totals — insert-based counters show 0 on re-runs.
  console.log("\nCoverage per language (total in DB):");
  for (const lang of SUPPORTED_LANGUAGE_CODES) {
    const [tRow] = await db
      .select({ n: count() })
      .from(translations)
      .where(eq(translations.language_code, lang));
    const [gRow] = await db
      .select({ n: count() })
      .from(term_glosses)
      .where(eq(term_glosses.language_code, lang));
    console.log(
      ` ${lang}: ${fmt(tRow?.n ?? 0)} translations, ${fmt(gRow?.n ?? 0)} glosses`,
    );
  }
};
|
||||
|
||||
// Run the seed. If it rejects, log the failure and exit with status 1.
const abortOnFailure = (failure: unknown) => {
  console.error(failure);
  process.exit(1);
};

main().catch(abortOnFailure);
|
||||
Loading…
Add table
Add a link
Reference in a new issue