updating seeding pipeline
This commit is contained in:
parent
c49c2fe2c3
commit
dfeb6a4cb0
2 changed files with 149 additions and 203 deletions
|
|
@ -1,203 +0,0 @@
|
|||
import fs from "node:fs/promises";
|
||||
import { eq } from "drizzle-orm";
|
||||
|
||||
import { SUPPORTED_LANGUAGE_CODES, SUPPORTED_POS } from "@glossa/shared";
|
||||
import { db } from "@glossa/db";
|
||||
import { terms, translations } from "@glossa/db/schema";
|
||||
|
||||
// The two aliases below are unions built from the const arrays imported from
// "@glossa/shared": indexing an array type with `[number]` yields the union of
// its element types.

/** Part-of-speech tag: any single element of `SUPPORTED_POS`. */
type POS = (typeof SUPPORTED_POS)[number];

/** Language code: any single element of `SUPPORTED_LANGUAGE_CODES`. */
type LANGUAGE_CODE = (typeof SUPPORTED_LANGUAGE_CODES)[number];
|
||||
|
||||
/**
 * One entry of a seed data file, as consumed by `uploadSynsetToDB`.
 */
type Synset = {
  /** Identifier written to `terms.synset_id` and used to look up existing terms. */
  synset_id: string;
  /** Part of speech written to `terms.pos`. */
  pos: POS;
  /**
   * Lemmas keyed by language code. A language may be absent; every lemma that
   * is present becomes one candidate row in `translations`.
   */
  translations: Partial<Record<LANGUAGE_CODE, string[]>>;
};
|
||||
|
||||
/**
 * The three components encoded in a data file name of the form
 * `sourcelang-targetlang-pos.json`, after validation by `parseFilename`.
 */
type FileName = {
  /** First filename segment. */
  sourceLang: LANGUAGE_CODE;
  /** Second filename segment. */
  targetLang: LANGUAGE_CODE;
  /** Third filename segment. */
  pos: POS;
};
|
||||
|
||||
// Directory scanned for seed JSON files. It is a relative path, so it depends
// on the directory the script is started from. The trailing slash matters:
// file paths are built with plain string concatenation (`dataDir + filename`).
const dataDir = "./src/data/datafiles/";
|
||||
|
||||
/** Narrows an arbitrary string to a supported language code. */
const isSupportedLanguageCode = (value: string): value is LANGUAGE_CODE =>
  (SUPPORTED_LANGUAGE_CODES as readonly string[]).includes(value);

/** Narrows an arbitrary string to a supported part-of-speech tag. */
const isSupportedPos = (value: string): value is POS =>
  (SUPPORTED_POS as readonly string[]).includes(value);

/**
 * Parses a data file name of the form `sourcelang-targetlang-pos.json`.
 *
 * @param filename - Bare file name (no directory part).
 * @returns The validated source language, target language and part of speech.
 * @throws Error if the name does not have exactly three `-`-separated
 *   segments, or if any segment is not a supported language code / POS.
 */
const parseFilename = (filename: string): FileName => {
  // Strip only a trailing ".json". `String.prototype.replace` with a string
  // pattern removes the FIRST occurrence, wherever it is, which would let a
  // name such as "en.json-it-noun" pass validation.
  const extension = ".json";
  const stem = filename.endsWith(extension)
    ? filename.slice(0, -extension.length)
    : filename;

  // `split` never yields undefined elements, so this is exactly a
  // "segment count is 3" check that also narrows the three names to `string`.
  const [sourceLang, targetLang, pos, ...extra] = stem.split("-");
  if (
    sourceLang === undefined ||
    targetLang === undefined ||
    pos === undefined ||
    extra.length > 0
  ) {
    throw new Error(
      `Invalid filename format: ${filename}. Expected: sourcelang-targetlang-pos.json`,
    );
  }

  if (!isSupportedLanguageCode(sourceLang)) {
    throw new Error(`Unsupported language code: ${sourceLang}`);
  }
  if (!isSupportedLanguageCode(targetLang)) {
    throw new Error(`Unsupported language code: ${targetLang}`);
  }
  if (!isSupportedPos(pos)) {
    throw new Error(`Unsupported POS: ${pos}`);
  }

  return { sourceLang, targetLang, pos };
};
|
||||
|
||||
/**
 * Structural check for one parsed JSON entry: an object with a string
 * `synset_id`, a string `pos`, and a `translations` object whose values are
 * all arrays of strings. Membership of `pos` and of the language keys in the
 * supported sets is NOT verified here.
 */
const isSynsetShaped = (value: unknown): boolean => {
  if (typeof value !== "object" || value === null) return false;

  const { synset_id, pos, translations } = value as Record<string, unknown>;
  if (typeof synset_id !== "string" || typeof pos !== "string") return false;
  if (
    typeof translations !== "object" ||
    translations === null ||
    Array.isArray(translations)
  ) {
    return false;
  }

  return Object.values(translations).every(
    (lemmas) =>
      Array.isArray(lemmas) &&
      lemmas.every((lemma) => typeof lemma === "string"),
  );
};

/**
 * Reads a seed data file and returns the synsets it contains.
 *
 * @param filepath - Path of the JSON file to read (UTF-8).
 * @returns The parsed entries, in file order.
 * @throws Error if the file cannot be read, is not valid JSON, does not hold
 *   a top-level array, or contains an entry that is not shaped like a synset.
 */
const readFromJsonFile = async (filepath: string): Promise<Synset[]> => {
  const raw = await fs.readFile(filepath, "utf8");
  const parsed: unknown = JSON.parse(raw);

  if (!Array.isArray(parsed)) {
    throw new Error(`Expected a JSON array in ${filepath}`);
  }

  // Fail early with the file and index instead of crashing later, mid-insert,
  // on a malformed entry (e.g. a `null` lemma list).
  const entries: unknown[] = parsed;
  entries.forEach((entry, index) => {
    if (!isSynsetShaped(entry)) {
      throw new Error(`Invalid synset at index ${index} in ${filepath}`);
    }
  });

  // Shape has been checked above; value-level membership is left to callers.
  return entries as Synset[];
};
|
||||
|
||||
/**
 * Inserts one synset: its term row (if not already present) and one
 * translation row per lemma.
 *
 * Both inserts use `onConflictDoNothing()`, so rows that conflict with
 * existing ones are skipped rather than raising. The two statements are NOT
 * wrapped in a transaction.
 *
 * @param synset - The entry to store.
 * @param _fileInfo - Metadata of the originating file; currently unused.
 * @returns Whether a new term row was created, and how many translation rows
 *   were actually inserted (conflicting rows are not counted).
 * @throws Error if the term insert was skipped but no term with the same
 *   `synset_id` can then be found.
 */
const uploadSynsetToDB = async (
  synset: Synset,
  _fileInfo: FileName,
): Promise<{ termInserted: boolean; translationsInserted: number }> => {
  // 1. Try to insert the term. An empty `returning` result means the insert
  //    was skipped because of a conflict.
  const [created] = await db
    .insert(terms)
    .values({ synset_id: synset.synset_id, pos: synset.pos })
    .onConflictDoNothing()
    .returning({ id: terms.id });

  const termInserted = created !== undefined;

  let termId: string;
  if (created !== undefined) {
    termId = created.id;
  } else {
    // Term already exists: fetch its real DB id for the foreign key.
    const [existing] = await db
      .select({ id: terms.id })
      .from(terms)
      .where(eq(terms.synset_id, synset.synset_id))
      .limit(1);

    if (!existing) {
      throw new Error(`Term not found after conflict: ${synset.synset_id}`);
    }
    termId = existing.id;
  }

  // 2. Build one row per (language, lemma) pair. A missing lemma list is
  //    treated as empty instead of being force-unwrapped.
  // NOTE(review): relies on the global `crypto` object; confirm the target
  // Node version exposes it without an import.
  const translationRows = Object.entries(synset.translations).flatMap(
    ([lang, lemmas]) =>
      (lemmas ?? []).map((lemma) => ({
        id: crypto.randomUUID(),
        term_id: termId,
        language_code: lang as LANGUAGE_CODE,
        text: lemma,
      })),
  );

  if (translationRows.length === 0) {
    return { termInserted, translationsInserted: 0 };
  }

  // 3. Insert the rows; `returning` lists only the rows actually written.
  const insertedTranslations = await db
    .insert(translations)
    .values(translationRows)
    .onConflictDoNothing()
    .returning({ id: translations.id });

  return { termInserted, translationsInserted: insertedTranslations.length };
};
|
||||
|
||||
/** Prints the two blank separators and the framed title that open each step. */
const logStepBanner = (title: string): void => {
  console.log("\n");
  console.log("\n");
  console.log("##########################################");
  console.log(title);
  console.log("##########################################");
};

/**
 * Seeds the database from every JSON file in `dataDir`.
 *
 * Steps: list the directory, keep the `.json` files whose names pass
 * `parseFilename` (invalid names are logged and skipped), upload every synset
 * of every remaining file one at a time, then print totals.
 *
 * Returns early, without error, when no JSON file or no valid file is found.
 * A failure while reading or uploading a file is not caught here and rejects
 * the returned promise.
 */
const main = async () => {
  // step 1: discovering files
  logStepBanner("step 1: discovering files");

  console.log("🔍 Scanning datafiles directory...");
  const allFiles = await fs.readdir(dataDir);
  const jsonFiles = allFiles.filter((f) => f.endsWith(".json"));

  if (jsonFiles.length === 0) {
    console.warn("⚠️ No JSON files found in", dataDir);
    return;
  }
  console.log(`📁 Found ${jsonFiles.length} file(s)\n`);

  // step 2: validating filenames
  logStepBanner("step 2: validating filenames");

  const validFiles: { filename: string; fileInfo: FileName }[] = [];
  for (const filename of jsonFiles) {
    try {
      const fileInfo = parseFilename(filename);
      validFiles.push({ filename, fileInfo });
      console.log(
        ` ✅ ${filename} — ${fileInfo.sourceLang} → ${fileInfo.targetLang} (${fileInfo.pos})`,
      );
    } catch (e: unknown) {
      // Narrow instead of asserting: a non-Error throw would otherwise log "undefined".
      const reason = e instanceof Error ? e.message : String(e);
      console.warn(` ⚠️ Skipping ${filename}: ${reason}`);
    }
  }

  if (validFiles.length === 0) {
    console.error("❌ No valid files to process. Exiting.");
    return;
  }

  // step 3: processing each file
  logStepBanner("step 3: processing each file");

  let totalTermsInserted = 0;
  let totalTranslationsInserted = 0;

  for (const [i, { filename, fileInfo }] of validFiles.entries()) {
    const prefix = `[${i + 1}/${validFiles.length}]`;

    console.log(`\n${prefix} 📄 ${filename}`);

    const synsets = await readFromJsonFile(dataDir + filename);
    console.log(`${prefix} Loaded ${synsets.length} synsets`);

    let fileTermsInserted = 0;
    let fileTranslationsInserted = 0;

    // Synsets are uploaded sequentially, one awaited call at a time.
    for (const [j, synset] of synsets.entries()) {
      // Progress line every 500 synsets (never for the very first one).
      if (j > 0 && j % 500 === 0) {
        console.log(`${prefix} ⏳ ${j}/${synsets.length} synsets processed...`);
      }

      const { termInserted, translationsInserted } = await uploadSynsetToDB(
        synset,
        fileInfo,
      );
      if (termInserted) fileTermsInserted++;
      fileTranslationsInserted += translationsInserted;
    }

    console.log(
      `${prefix} ✅ Done — ${fileTermsInserted} new terms, ${fileTranslationsInserted} new translations`,
    );
    totalTermsInserted += fileTermsInserted;
    totalTranslationsInserted += fileTranslationsInserted;
  }

  // step 4: Final summary
  logStepBanner("step 4: final summary");

  console.log(`\n🎉 Seeding complete!`);
  console.log(` Terms inserted: ${totalTermsInserted}`);
  console.log(` Translations inserted: ${totalTranslationsInserted}`);
};
|
||||
|
||||
// Script entry point: run the seeding and, on any unhandled failure, log the
// error and terminate the process with a non-zero exit code.
main().catch((error: unknown) => {
  console.error(error);
  process.exit(1);
});
|
||||
Loading…
Add table
Add a link
Reference in a new issue