updating seeding script
This commit is contained in:
parent
aa1a332226
commit
570dbff25e
1 changed file with 212 additions and 0 deletions
|
|
@ -0,0 +1,212 @@
|
||||||
|
import fs from "node:fs/promises";
|
||||||
|
import { and, count, eq, inArray } from "drizzle-orm";
|
||||||
|
|
||||||
|
import { SUPPORTED_LANGUAGE_CODES, SUPPORTED_POS } from "@glossa/shared";
|
||||||
|
import { db } from "@glossa/db";
|
||||||
|
import { terms, translations, term_glosses } from "@glossa/db/schema";
|
||||||
|
|
||||||
|
/** Part-of-speech tag: one of the values listed in `SUPPORTED_POS`. */
type POS = (typeof SUPPORTED_POS)[number];

/** Language code: one of the values listed in `SUPPORTED_LANGUAGE_CODES`. */
type LanguageCode = (typeof SUPPORTED_LANGUAGE_CODES)[number];

/**
 * Shape this script expects for each element of a per-POS JSON data file.
 *
 * NOTE: the files are parsed with `JSON.parse` and cast to this type; the
 * individual fields are not validated at runtime.
 */
type SynsetRecord = {
  /** Identifier of the synset in the source data; written to `terms.source_id`. */
  source_id: string;
  /** Part of speech; written to `terms.pos`. */
  pos: POS;
  /** Strings keyed by language code; each string becomes one `translations` row. */
  translations: Partial<Record<LanguageCode, string[]>>;
  /** Strings keyed by language code; each string becomes one `term_glosses` row. */
  glosses: Partial<Record<LanguageCode, string[]>>;
};
|
||||||
|
|
||||||
|
/**
 * Directory holding the per-POS JSON data files. File names are appended to
 * it by plain string concatenation, so the trailing slash is required.
 */
const dataDir = "./src/data/datafiles/";

/**
 * Maximum number of elements per chunk. Used both for slicing a file's
 * synsets into batches and for slicing the derived translation/gloss rows
 * into individual INSERT statements.
 */
const BATCH_SIZE = 500;
|
||||||
|
|
||||||
|
// ────────────────────────────────────────────────────────────
|
||||||
|
// Helpers
|
||||||
|
// ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Splits `arr` into consecutive slices of at most `size` elements each.
 * The input is not modified; the last slice may be shorter than `size`.
 *
 * @param arr Items to split. An empty array yields an empty result.
 * @param size Maximum slice length; must be a positive integer.
 * @returns The slices, in the original order.
 * @throws {RangeError} If `size` is not a positive integer. Without this
 *   guard a size of 0 or a negative size would never terminate.
 */
function chunk<T>(arr: readonly T[], size: number): T[][] {
  if (!Number.isInteger(size) || size < 1) {
    throw new RangeError(`chunk size must be a positive integer, got ${size}`);
  }

  const out: T[][] = [];
  for (let start = 0; start < arr.length; start += size) {
    out.push(arr.slice(start, start + size));
  }
  return out;
}
|
||||||
|
|
||||||
|
/**
 * Formats a number for log output using the "en-US" locale
 * (e.g. 1234567 becomes "1,234,567").
 */
function fmt(n: number): string {
  return n.toLocaleString("en-US");
}
|
||||||
|
|
||||||
|
// ────────────────────────────────────────────────────────────
|
||||||
|
// Stats
|
||||||
|
// ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Running tallies for the whole run, one counter pair per target table.
 * `inserted` counts rows returned by an INSERT; `skipped` counts rows that
 * were submitted but not returned (i.e. dropped by `onConflictDoNothing`).
 */
const stats = {
  terms: { inserted: 0, skipped: 0 },
  translations: { inserted: 0, skipped: 0 },
  glosses: { inserted: 0, skipped: 0 },
};
|
||||||
|
|
||||||
|
// ────────────────────────────────────────────────────────────
|
||||||
|
// Per-batch processing
|
||||||
|
// ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Seeds one batch of synsets: inserts the terms, resolves their ids, then
 * inserts the translations and glosses that hang off those terms.
 *
 * Every INSERT uses `onConflictDoNothing()`, so rows that already exist are
 * skipped rather than failing the run. The module-level `stats` counters are
 * updated after each statement. The statements are not wrapped in a
 * transaction.
 *
 * @param batch Synset records to write. An empty batch is a no-op.
 */
async function processBatch(batch: SynsetRecord[]): Promise<void> {
  // Nothing to write, and the query builder rejects an empty `values()` list.
  if (batch.length === 0) return;

  // 1. Insert terms — idempotency key: (source, source_id)
  const termValues = batch.map((r) => ({
    source: "omw" as const,
    source_id: r.source_id,
    pos: r.pos,
  }));

  const insertedTerms = await db
    .insert(terms)
    .values(termValues)
    .onConflictDoNothing()
    .returning({ id: terms.id });

  stats.terms.inserted += insertedTerms.length;
  stats.terms.skipped += batch.length - insertedTerms.length;

  // 2. Resolve ids for every source_id in this batch (new + pre-existing).
  //    We can't rely solely on the .returning() above because
  //    onConflictDoNothing returns nothing for rows that already existed.
  const sourceIds = batch.map((r) => r.source_id);
  const termRows = await db
    .select({ id: terms.id, source_id: terms.source_id })
    .from(terms)
    .where(and(eq(terms.source, "omw"), inArray(terms.source_id, sourceIds)));

  const sourceIdToTermId = new Map(termRows.map((r) => [r.source_id, r.id]));

  // Flattens one per-language string map of every record into insertable
  // rows. Records whose term id could not be resolved contribute no rows.
  const toRows = (
    pick: (record: SynsetRecord) => Partial<Record<LanguageCode, string[]>>,
  ) =>
    batch.flatMap((record) => {
      const termId = sourceIdToTermId.get(record.source_id);
      if (!termId) return [];
      return Object.entries(pick(record)).flatMap(([lang, texts]) =>
        (texts ?? []).map((text) => ({
          term_id: termId,
          // Keys come from the data file and are not checked against
          // SUPPORTED_LANGUAGE_CODES here.
          language_code: lang as LanguageCode,
          text,
        })),
      );
    });

  // 3. Build and insert translation rows.
  //    The data is unvalidated JSON, so guard against a missing map the same
  //    way as for glosses below.
  const translationRows = toRows((r) => r.translations ?? {});

  for (const tBatch of chunk(translationRows, BATCH_SIZE)) {
    const inserted = await db
      .insert(translations)
      .values(tBatch)
      .onConflictDoNothing()
      .returning({ id: translations.id });

    stats.translations.inserted += inserted.length;
    stats.translations.skipped += tBatch.length - inserted.length;
  }

  // 4. Build and insert gloss rows
  const glossRows = toRows((r) => r.glosses ?? {});

  for (const gBatch of chunk(glossRows, BATCH_SIZE)) {
    const inserted = await db
      .insert(term_glosses)
      .values(gBatch)
      .onConflictDoNothing()
      .returning({ id: term_glosses.id });

    stats.glosses.inserted += inserted.length;
    stats.glosses.skipped += gBatch.length - inserted.length;
  }
}
|
||||||
|
|
||||||
|
// ────────────────────────────────────────────────────────────
|
||||||
|
// Main
|
||||||
|
// ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Entry point of the seed script.
 *
 * For each value in `SUPPORTED_POS` it reads `omw-<pos>.json` from `dataDir`,
 * feeds the records to `processBatch` in chunks of `BATCH_SIZE`, and finally
 * prints the insert/skip counters plus per-language row counts queried from
 * the database. A file that cannot be read or parsed, or whose top-level
 * value is not an array, is skipped with a warning instead of aborting.
 */
const main = async () => {
  console.log("\n##########################################");
  console.log("Glossa — OMW seed");
  console.log("##########################################\n");

  for (const pos of SUPPORTED_POS) {
    // One file per POS — names are derived from SUPPORTED_POS so adding a new
    // constant value automatically picks up a new file on the next run.
    const filename = `omw-${pos}.json`;
    const filepath = dataDir + filename;

    console.log(`📄 ${filename}`);

    let records: SynsetRecord[];
    try {
      const raw = await fs.readFile(filepath, "utf8");
      const parsed: unknown = JSON.parse(raw);
      // Reject a non-array payload here so it takes the skip path below
      // instead of crashing later on `records.length`.
      if (!Array.isArray(parsed)) {
        throw new TypeError("expected a JSON array of synset records");
      }
      // Individual elements are trusted to match SynsetRecord.
      records = parsed as SynsetRecord[];
    } catch (e) {
      const reason = e instanceof Error ? e.message : String(e);
      console.warn(` ⚠️ Skipping — could not read file: ${reason}\n`);
      continue;
    }

    console.log(` Loaded ${fmt(records.length)} synsets`);

    const batches = chunk(records, BATCH_SIZE);

    for (const [i, batch] of batches.entries()) {
      // Progress every 10 batches (5 000 synsets when BATCH_SIZE is 500)
      if (i > 0 && i % 10 === 0) {
        const processed = i * BATCH_SIZE;
        console.log(` ⏳ ${fmt(processed)} / ${fmt(records.length)}`);
      }
      await processBatch(batch);
    }

    console.log(` ✅ Done\n`);
  }

  // ── Summary ───────────────────────────────────────────────

  console.log("##########################################");
  console.log("Summary");
  console.log("##########################################\n");

  // Left-align the counters by padding every label to the same width.
  const pad = (label: string) => label.padEnd(14);

  console.log(
    `${pad("Terms:")}inserted ${fmt(stats.terms.inserted)}, skipped ${fmt(stats.terms.skipped)}`,
  );
  console.log(
    `${pad("Translations:")}inserted ${fmt(stats.translations.inserted)}, skipped ${fmt(stats.translations.skipped)}`,
  );
  console.log(
    `${pad("Glosses:")}inserted ${fmt(stats.glosses.inserted)}, skipped ${fmt(stats.glosses.skipped)}`,
  );

  // Query actual DB totals — insert-based counters show 0 on re-runs.
  console.log("\nCoverage per language (total in DB):");
  for (const lang of SUPPORTED_LANGUAGE_CODES) {
    const [tRow] = await db
      .select({ n: count() })
      .from(translations)
      .where(eq(translations.language_code, lang));
    const [gRow] = await db
      .select({ n: count() })
      .from(term_glosses)
      .where(eq(term_glosses.language_code, lang));
    console.log(
      ` ${lang}: ${fmt(tRow?.n ?? 0)} translations, ${fmt(gRow?.n ?? 0)} glosses`,
    );
  }
};
|
||||||
|
|
||||||
|
// Run the seed. Any error that escapes main() is logged and turned into a
// non-zero exit code so callers (CI, shell scripts) can detect the failure.
main().catch((err: unknown) => {
  console.error(err);
  process.exit(1);
});
|
||||||
Loading…
Add table
Add a link
Reference in a new issue