archiving old seeding scripts, removing them from package.json scripts

This commit is contained in:
lila 2026-04-20 10:10:28 +02:00
parent a3d19d36f6
commit e718d188d5
10 changed files with 7 additions and 1057435 deletions

View file

@ -6,9 +6,7 @@
"scripts": {
"build": "tsc",
"generate": "drizzle-kit generate",
"migrate": "drizzle-kit migrate",
"db:seed": "npx tsx src/seeding-datafiles.ts",
"db:build-deck": "npx tsx src/generating-deck.ts"
"migrate": "drizzle-kit migrate"
},
"dependencies": {
"@lila/shared": "workspace:*",

View file

@ -1,183 +0,0 @@
/*
This script cross-references two data sets:
- The "target" list: the {language}-merged.json file (e.g. en-merged.json). This is the vocabulary you want to have CEFR levels for.
- The "source of truth": the translations table in the database. This is the vocabulary you currently have in your app.
What it calculates:
For all the words in the merged JSON file, how many actually exist in the database.
Matched: the word from the JSON file was found in the DB (ready for enrichment).
Unmatched: the word from the JSON file was not found in the DB (it will be skipped during enrichment).
*/
import fs from "node:fs/promises";
import { eq } from "drizzle-orm";
import {
SUPPORTED_LANGUAGE_CODES,
SUPPORTED_POS,
CEFR_LEVELS,
DIFFICULTY_LEVELS,
} from "@lila/shared";
import { db } from "@lila/db";
import { terms, translations } from "@lila/db/schema";
// Element types of the constant arrays exported by @lila/shared.
type POS = (typeof SUPPORTED_POS)[number];
type LanguageCode = (typeof SUPPORTED_LANGUAGE_CODES)[number];
type CEFRLevel = (typeof CEFR_LEVELS)[number];
type Difficulty = (typeof DIFFICULTY_LEVELS)[number];
/** Shape that entries of a `{language}-merged.json` file are cast to after parsing (not validated at runtime). */
type MergedRecord = {
word: string;
pos: POS;
cefr: CEFRLevel;
difficulty: Difficulty;
sources: string[];
};
/** Tallies collected by checkCoverage for one language. */
type CoverageStats = {
total: number;
matched: number;
unmatched: number;
byCefr: Record<CEFRLevel, { total: number; matched: number }>;
byDifficulty: Record<Difficulty, { total: number; matched: number }>;
// Holds only a capped sample of the unmatched entries, not all of them.
unmatchedWords: Array<{ word: string; pos: POS; cefr: CEFRLevel }>;
};
// Directory prefix of the merged JSON files; relative, so it is resolved against the current working directory.
const dataDir = "./src/data/";
/**
 * Prints a coverage report for one language: how many entries of
 * `{language}-merged.json` have a matching translation in the database.
 *
 * An entry is "matched" when a translation row for `language` has the same
 * text (compared in lower case) and is joined to a term with the same part of
 * speech. The function only reads from the database and writes to the console.
 *
 * @param language - Language code selecting both the JSON file and the DB rows.
 * @returns Resolves after the report is printed, or early (after a warning)
 *   when the file cannot be read, is not valid JSON, or is not a JSON array.
 */
async function checkCoverage(language: LanguageCode): Promise<void> {
  type Tally = { total: number; matched: number };

  // Maximum number of unmatched entries kept and printed as a sample.
  const SAMPLE_LIMIT = 20;

  // One-decimal percentage; an empty denominator yields "0.0" rather than "NaN".
  const pct = (part: number, whole: number): string =>
    (whole === 0 ? 0 : (part / whole) * 100).toFixed(1);

  const filename = `${language}-merged.json`;
  const filepath = dataDir + filename;
  console.log(`\n📄 Checking ${filename}...`);

  // Load merged data. A non-array payload is rejected here so that it is
  // reported like any other unreadable file instead of crashing further down.
  let records: MergedRecord[];
  try {
    const raw = await fs.readFile(filepath, "utf8");
    const parsed: unknown = JSON.parse(raw);
    if (!Array.isArray(parsed)) {
      throw new Error("expected a JSON array of records");
    }
    records = parsed as MergedRecord[];
  } catch (e) {
    console.warn(` ⚠️ Could not read file: ${(e as Error).message}`);
    return;
  }
  console.log(` Loaded ${records.length.toLocaleString("en-US")} entries`);

  // Initialize stats with one empty tally per supported level.
  const stats: CoverageStats = {
    total: records.length,
    matched: 0,
    unmatched: 0,
    byCefr: {} as Record<CEFRLevel, Tally>,
    byDifficulty: {} as Record<Difficulty, Tally>,
    unmatchedWords: [],
  };
  for (const level of CEFR_LEVELS) {
    stats.byCefr[level] = { total: 0, matched: 0 };
  }
  for (const diff of DIFFICULTY_LEVELS) {
    stats.byDifficulty[diff] = { total: 0, matched: 0 };
  }

  // ── BATCHED LOOKUP: build a Set of existing (word, pos) pairs in the DB ──
  console.log(` 🔍 Querying database for existing translations...`);
  // One query for every translation of this language (all parts of speech).
  const existingRows = await db
    .select({ text: translations.text, pos: terms.pos })
    .from(translations)
    .innerJoin(terms, eq(translations.term_id, terms.id))
    .where(eq(translations.language_code, language));

  // Keys have the form "lowercased word|pos" for constant-time membership tests.
  const existingSet = new Set(
    existingRows.map((row) => `${row.text.toLowerCase()}|${row.pos}`),
  );

  // ── Process records against the in-memory Set ──
  // Records whose cefr/difficulty is not one of the supported values have no
  // tally to update; they still count towards the overall totals.
  let outsideSupportedLevels = 0;
  for (const record of records) {
    const cefrTally: Tally | undefined = stats.byCefr[record.cefr];
    const difficultyTally: Tally | undefined =
      stats.byDifficulty[record.difficulty];
    if (!cefrTally || !difficultyTally) outsideSupportedLevels++;

    if (cefrTally) cefrTally.total++;
    if (difficultyTally) difficultyTally.total++;

    const key = `${record.word.toLowerCase()}|${record.pos}`;
    if (existingSet.has(key)) {
      stats.matched++;
      if (cefrTally) cefrTally.matched++;
      if (difficultyTally) difficultyTally.matched++;
    } else {
      stats.unmatched++;
      if (stats.unmatchedWords.length < SAMPLE_LIMIT) {
        stats.unmatchedWords.push({
          word: record.word,
          pos: record.pos,
          cefr: record.cefr,
        });
      }
    }
  }

  // ── Print results ──
  console.log(`\n📊 Coverage for ${language}:`);
  console.log(` Total entries: ${stats.total.toLocaleString("en-US")}`);
  console.log(
    ` Matched in DB: ${stats.matched.toLocaleString("en-US")} (${pct(stats.matched, stats.total)}%)`,
  );
  console.log(
    ` Unmatched: ${stats.unmatched.toLocaleString("en-US")} (${pct(stats.unmatched, stats.total)}%)`,
  );

  console.log(`\n By CEFR level:`);
  for (const level of CEFR_LEVELS) {
    const { total, matched } = stats.byCefr[level];
    if (total > 0) {
      console.log(
        ` ${level}: ${matched.toLocaleString("en-US")}/${total.toLocaleString("en-US")} (${pct(matched, total)}%)`,
      );
    }
  }

  console.log(`\n By difficulty:`);
  for (const diff of DIFFICULTY_LEVELS) {
    const { total, matched } = stats.byDifficulty[diff];
    if (total > 0) {
      console.log(
        ` ${diff}: ${matched.toLocaleString("en-US")}/${total.toLocaleString("en-US")} (${pct(matched, total)}%)`,
      );
    }
  }

  if (outsideSupportedLevels > 0) {
    console.warn(
      `\n⚠ ${outsideSupportedLevels.toLocaleString("en-US")} entries have a CEFR level or difficulty outside the supported values and are missing from the breakdowns above`,
    );
  }

  if (stats.unmatchedWords.length > 0) {
    console.log(`\n⚠ Sample unmatched words (first ${SAMPLE_LIMIT}):`);
    for (const { word, pos, cefr } of stats.unmatchedWords) {
      console.log(` "${word}" (${pos}, ${cefr})`);
    }
    if (stats.unmatched > SAMPLE_LIMIT) {
      console.log(
        ` ... and ${stats.unmatched - stats.unmatchedWords.length} more`,
      );
    }
  }
}
/**
 * Script entry point: prints a banner, runs checkCoverage for each supported
 * language one after another, then prints a closing banner.
 */
const main = async () => {
  const rule = "##########################################";

  console.log(rule);
  console.log("lila — CEFR Coverage Check");
  console.log(rule);

  // Sequential on purpose: each language's report is printed as one block.
  for (const code of SUPPORTED_LANGUAGE_CODES) {
    await checkCoverage(code);
  }

  console.log(`\n${rule}`);
  console.log("Done");
  console.log(rule);
};

// Any uncaught failure is logged and turned into a non-zero exit code.
main().catch((failure) => {
  console.error(failure);
  process.exit(1);
});

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -1,211 +0,0 @@
import fs from "node:fs/promises";
import { db } from "@lila/db";
import { translations, terms, decks, deck_terms } from "@lila/db/schema";
import { inArray, and, eq, ne, countDistinct } from "drizzle-orm";
// Type of the handle that db.transaction passes to its callback.
type DbOrTx = Parameters<Parameters<typeof db.transaction>[0]>[0];
// Fixed inputs of this script: which word list to read and how the resulting deck is labelled.
const config = {
// Relative path, so it is resolved against the current working directory.
pathToWordlist: "./src/data/wordlists/top1000englishnouns",
deckName: "top english nouns",
deckDescription: "Most frequently used English nouns for vocabulary practice",
// Language and part of speech used to filter translations when resolving words to terms.
sourceLanguage: "en",
sourcePOS: "noun",
} as const;
/**
 * Reads the configured word list file and returns its lines trimmed and
 * lower-cased, with empty lines dropped and duplicates removed
 * (first occurrence wins, original order kept).
 */
const readWordList = async () => {
  const contents = await fs.readFile(config.pathToWordlist, "utf8");

  const seen = new Set<string>();
  for (const line of contents.split("\n")) {
    const normalized = line.trim().toLowerCase();
    if (normalized) {
      seen.add(normalized);
    }
  }

  return Array.from(seen);
};
/**
 * Looks up which of the given words exist as translations in the configured
 * source language and part of speech.
 *
 * @param words - Words to look up.
 * @returns `termIds`: the distinct term IDs of all matching translations;
 *   `missingWords`: the input words for which no translation row was found.
 */
const resolveSourceTerms = async (
  words: string[],
): Promise<{ termIds: string[]; missingWords: string[] }> => {
  // Nothing to look up; also avoids handing an empty list to inArray, which
  // some drizzle-orm versions reject.
  if (words.length === 0) {
    return { termIds: [], missingWords: [] };
  }

  // NOTE(review): the SQL comparison uses the words as given, while the map
  // below is keyed by the lower-cased DB text. If the column comparison is
  // case-sensitive, translations stored with capital letters are never
  // returned for lower-cased input — confirm whether that is intended.
  const rows = await db
    .select({ text: translations.text, termId: translations.term_id })
    .from(translations)
    .innerJoin(terms, eq(translations.term_id, terms.id))
    .where(
      and(
        inArray(translations.text, words),
        eq(translations.language_code, config.sourceLanguage),
        eq(terms.pos, config.sourcePOS),
      ),
    );

  // Group term IDs by lower-cased word; one word can belong to several terms.
  const wordToTermIds = new Map<string, string[]>();
  for (const row of rows) {
    const word = row.text.toLowerCase();
    const bucket = wordToTermIds.get(word);
    if (bucket) {
      bucket.push(row.termId);
    } else {
      wordToTermIds.set(word, [row.termId]);
    }
  }

  // Deduplicate: multiple words can map to the same term ID (e.g. via synonyms)
  const termIds = [...new Set(Array.from(wordToTermIds.values()).flat())];
  const missingWords = words.filter((w) => !wordToTermIds.has(w));
  return { termIds, missingWords };
};
/**
 * Writes the given words, one per line, to a file next to the word list
 * whose name is the word list path with "-missing" appended.
 */
const writeMissingWordsToFile = async (missingWords: string[]) => {
  const contents = missingWords.join("\n");
  const target = config.pathToWordlist + "-missing";
  await fs.writeFile(target, contents, "utf8");
};
/**
 * Determines which non-source languages have a translation for every one of
 * the given terms.
 *
 * @param sourceLanguage - Language code excluded from the check.
 * @param termIds - Term IDs that a language must fully cover to be validated.
 * @returns `coverage`: per language, the number of distinct given terms that
 *   have a translation; `validatedLanguages`: the languages whose count equals
 *   `termIds.length`. Both are empty when `termIds` is empty.
 */
const validateLanguages = async (sourceLanguage: string, termIds: string[]) => {
  // With no terms there is nothing to cover; skipping the query also avoids
  // handing an empty list to inArray, which some drizzle-orm versions reject.
  const coverage =
    termIds.length === 0
      ? []
      : await db
          .select({
            language: translations.language_code,
            coveredCount: countDistinct(translations.term_id),
          })
          .from(translations)
          .where(
            and(
              inArray(translations.term_id, termIds),
              ne(translations.language_code, sourceLanguage),
            ),
          )
          .groupBy(translations.language_code);

  // Number(...) normalises the count in case the driver returns it as a string.
  const validatedLanguages = coverage
    .filter((row) => Number(row.coveredCount) === termIds.length)
    .map((row) => row.language);

  return { coverage, validatedLanguages };
};
/**
 * Returns the first deck whose name and source language match the script
 * configuration, or `null` when there is none.
 *
 * @param tx - Transaction handle the query runs on.
 */
const findExistingDeck = async (tx: DbOrTx) => {
  const matchesConfig = and(
    eq(decks.name, config.deckName),
    eq(decks.source_language, config.sourceLanguage),
  );

  const [firstMatch] = await tx
    .select({ id: decks.id, validatedForLanguages: decks.validated_languages })
    .from(decks)
    .where(matchesConfig);

  return firstMatch ?? null;
};
/**
 * Inserts a new deck of type "core" using the configured name, description
 * and source language, and returns the new deck's ID.
 *
 * @param tx - Transaction handle the insert runs on.
 * @param validatedLanguages - Stored in the deck's validated_languages column.
 * @throws Error when the insert returns no row.
 */
const createDeck = async (tx: DbOrTx, validatedLanguages: string[]) => {
  const [newDeck] = await tx
    .insert(decks)
    .values({
      name: config.deckName,
      description: config.deckDescription,
      source_language: config.sourceLanguage,
      validated_languages: validatedLanguages,
      type: "core",
    })
    .returning({ id: decks.id });

  if (newDeck) {
    return newDeck.id;
  }
  throw new Error("Failed to create deck: no row returned");
};
/**
 * Links the given terms to a deck, skipping links that already exist.
 *
 * @param tx - Transaction handle the insert runs on.
 * @param deckId - Deck to add the terms to.
 * @param termIds - Terms to link; an empty list is a no-op.
 * @returns The number of links actually inserted. Rows skipped by the
 *   conflict clause are not counted, so callers can derive how many terms
 *   were already in the deck as `termIds.length - result`.
 */
const addTermsToDeck = async (
  tx: DbOrTx,
  deckId: string,
  termIds: string[],
): Promise<number> => {
  if (termIds.length === 0) return 0;

  // returning() yields only the rows that were really inserted; rows dropped
  // by onConflictDoNothing produce no result row.
  const insertedRows = await tx
    .insert(deck_terms)
    .values(termIds.map((termId) => ({ deck_id: deckId, term_id: termId })))
    .onConflictDoNothing()
    .returning({ termId: deck_terms.term_id });

  return insertedRows.length;
};
/**
 * Overwrites the validated_languages column of one deck.
 *
 * @param tx - Transaction handle the update runs on.
 * @param deckId - Deck to update.
 * @param validatedLanguages - New value for the column.
 */
const updateValidatedLanguages = async (
  tx: DbOrTx,
  deckId: string,
  validatedLanguages: string[],
): Promise<void> => {
  const thisDeck = eq(decks.id, deckId);
  const patch = { validated_languages: validatedLanguages };
  await tx.update(decks).set(patch).where(thisDeck);
};
/**
 * Script entry point. Steps, in order:
 * 1. read the word list,
 * 2. resolve the words to term IDs and write the unresolved words to a file,
 * 3. work out which languages cover every resolved term,
 * 4. inside one transaction: reuse or create the deck, add the terms, and
 *    update the deck's validated languages if they differ,
 * 5. print a summary.
 */
const main = async () => {
  console.log("📖 Reading word list...");
  const sourceWords = await readWordList();
  console.log(` ${sourceWords.length} words loaded\n`);

  console.log("🔍 Checking against database...");
  const resolved = await resolveSourceTerms(sourceWords);
  const { termIds, missingWords } = resolved;
  console.log(` ${termIds.length} terms found`);
  console.log(` ${missingWords.length} words not found in DB\n`);

  console.log("🖊️ Writing missing words to file...\n");
  await writeMissingWordsToFile(missingWords);

  console.log("✅ Validating languages...");
  const validation = await validateLanguages(config.sourceLanguage, termIds);
  const { coverage, validatedLanguages } = validation;
  console.log(
    ` Validated languages: ${JSON.stringify(validatedLanguages)}\n`,
  );

  console.log("🔬 Language coverage breakdown...");
  for (const entry of coverage) {
    console.log(
      ` ${entry.language}: ${entry.coveredCount} / ${termIds.length} terms covered`,
    );
  }

  console.log("🃏 Looking for existing deck...");
  const addedCount = await db.transaction(async (tx) => {
    const existingDeck = await findExistingDeck(tx);

    let deckId;
    if (existingDeck) {
      deckId = existingDeck.id;
    } else {
      deckId = await createDeck(tx, validatedLanguages);
    }

    const inserted = await addTermsToDeck(tx, deckId, termIds);

    // Compare the two language lists independent of order by sorting copies.
    const storedSorted = [...(existingDeck?.validatedForLanguages ?? [])].sort();
    const computedSorted = [...validatedLanguages].sort();
    if (JSON.stringify(storedSorted) !== JSON.stringify(computedSorted)) {
      await updateValidatedLanguages(tx, deckId, validatedLanguages);
    }

    return inserted;
  });

  const alreadyPresentCount = termIds.length - addedCount;
  const matchedWordCount = sourceWords.length - missingWords.length;
  const languageSummary =
    validatedLanguages.length > 0 ? validatedLanguages.join(", ") : "none";
  const rule = "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━";

  console.log(rule);
  console.log("📊 Summary");
  console.log(rule);
  console.log(` Words loaded from wordlist : ${sourceWords.length}`);
  console.log(` Words matched in DB : ${matchedWordCount}`);
  console.log(` Words not found in DB : ${missingWords.length}`);
  console.log(` Term IDs resolved : ${termIds.length}`);
  console.log(` Terms added to deck : ${addedCount}`);
  console.log(` Terms already in deck : ${alreadyPresentCount}`);
  console.log(` Validated languages : ${languageSummary}`);
  console.log(rule);
};

// Any uncaught failure is logged and turned into a non-zero exit code.
main().catch((failure) => {
  console.error(failure);
  process.exit(1);
});

View file

@ -1,148 +0,0 @@
import fs from "node:fs/promises";
import { eq, inArray } from "drizzle-orm";
import {
SUPPORTED_LANGUAGE_CODES,
SUPPORTED_POS,
CEFR_LEVELS,
DIFFICULTY_LEVELS,
} from "@lila/shared";
import { db } from "@lila/db";
import { translations, terms } from "@lila/db/schema";
// Element types of the constant arrays exported by @lila/shared.
type POS = (typeof SUPPORTED_POS)[number];
type LanguageCode = (typeof SUPPORTED_LANGUAGE_CODES)[number];
type CEFRLevel = (typeof CEFR_LEVELS)[number];
type Difficulty = (typeof DIFFICULTY_LEVELS)[number];
/** Shape that entries of a `{language}-merged.json` file are cast to after parsing (not validated at runtime). */
type MergedRecord = {
word: string;
pos: POS;
cefr: CEFRLevel;
difficulty: Difficulty;
sources: string[];
};
// Directory prefix of the merged JSON files; relative, so it is resolved against the current working directory.
const dataDir = "./src/data/";
// Maximum number of translation IDs sent in a single UPDATE statement.
const BATCH_SIZE = 500;
// ────────────────────────────────────────────────────────────
// Helpers
// ────────────────────────────────────────────────────────────
/**
 * Splits an array into consecutive slices of at most `size` elements.
 * The last slice may be shorter; an empty input yields an empty result.
 *
 * @param arr - Array to split (not modified).
 * @param size - Maximum slice length; must be a positive integer.
 * @returns The slices, in input order.
 * @throws RangeError when `size` is not a positive integer. Without this
 *   check a size of 0 or below would loop forever and NaN would silently
 *   produce a single empty slice.
 */
function chunk<T>(arr: T[], size: number): T[][] {
  if (!Number.isInteger(size) || size <= 0) {
    throw new RangeError(`chunk size must be a positive integer, got ${size}`);
  }
  const out: T[][] = [];
  for (let start = 0; start < arr.length; start += size) {
    out.push(arr.slice(start, start + size));
  }
  return out;
}
/** Renders a number with en-US formatting (e.g. comma thousands separators) for log output. */
function fmt(n: number): string {
  const grouped = n.toLocaleString("en-US");
  return grouped;
}
// ────────────────────────────────────────────────────────────
// Enrichment per language
// ────────────────────────────────────────────────────────────
/**
 * Copies CEFR level and difficulty from `{language}-merged.json` onto the
 * matching translation rows of that language.
 *
 * A record matches every translation of `language` whose text equals the
 * record's word (compared in lower case) and whose term has the same part of
 * speech. Matching rows are updated in batches of BATCH_SIZE IDs; records
 * without a match are only reported.
 *
 * @param language - Language code selecting both the JSON file and the DB rows.
 * @returns Resolves after the summary is printed, or early (after a warning)
 *   when the file cannot be read, is not valid JSON, or is not a JSON array.
 */
async function enrichLanguage(language: LanguageCode): Promise<void> {
  // All translation IDs that should receive one particular (cefr, difficulty) pair.
  type UpdateGroup = { cefr: CEFRLevel; difficulty: Difficulty; ids: Set<string> };

  const filename = `${language}-merged.json`;
  const filepath = dataDir + filename;
  console.log(`\n📝 Enriching ${filename}...`);

  // A non-array payload is rejected here so that it is reported like any
  // other unreadable file instead of crashing further down.
  let records: MergedRecord[];
  try {
    const raw = await fs.readFile(filepath, "utf8");
    const parsed: unknown = JSON.parse(raw);
    if (!Array.isArray(parsed)) {
      throw new Error("expected a JSON array of records");
    }
    records = parsed as MergedRecord[];
  } catch (e) {
    console.warn(` ⚠️ Could not read file: ${(e as Error).message}`);
    return;
  }
  console.log(` Loaded ${fmt(records.length)} entries`);

  // 1. Bulk fetch existing translations for this language
  console.log(` 🔍 Fetching existing translations from DB...`);
  const existingTranslations = await db
    .select({ id: translations.id, text: translations.text, pos: terms.pos })
    .from(translations)
    .innerJoin(terms, eq(translations.term_id, terms.id))
    .where(eq(translations.language_code, language));

  // 2. Build lookup map: "lowercase_word|pos" -> translation IDs
  const translationMap = new Map<string, string[]>();
  for (const t of existingTranslations) {
    const key = `${t.text.toLowerCase()}|${t.pos}`;
    const bucket = translationMap.get(key);
    if (bucket) {
      bucket.push(t.id);
    } else {
      translationMap.set(key, [t.id]);
    }
  }

  // 3. Match records to DB IDs and group them by target (cefr, difficulty).
  //    The Set removes IDs that several records resolve to.
  const groups = new Map<string, UpdateGroup>();
  const unmatchedWords: Array<{ word: string; pos: POS; cefr: CEFRLevel }> = [];
  for (const rec of records) {
    const ids = translationMap.get(`${rec.word.toLowerCase()}|${rec.pos}`);
    if (!ids || ids.length === 0) {
      unmatchedWords.push({ word: rec.word, pos: rec.pos, cefr: rec.cefr });
      continue;
    }
    const groupKey = `${rec.cefr}|${rec.difficulty}`;
    let group = groups.get(groupKey);
    if (!group) {
      group = { cefr: rec.cefr, difficulty: rec.difficulty, ids: new Set() };
      groups.set(groupKey, group);
    }
    for (const id of ids) group.ids.add(id);
  }

  // 4. Batch updates grouped by (cefr, difficulty)
  let totalUpdated = 0;
  for (const { cefr, difficulty, ids } of groups.values()) {
    for (const idBatch of chunk(Array.from(ids), BATCH_SIZE)) {
      await db
        .update(translations)
        .set({ cefr_level: cefr, difficulty })
        .where(inArray(translations.id, idBatch));
      // Counts the IDs sent, not the row count reported by the database.
      totalUpdated += idBatch.length;
    }
  }

  // 5. Summary
  console.log(`\n ✅ Updated ${fmt(totalUpdated)} translations`);
  console.log(` ⚠️ Unmatched: ${fmt(unmatchedWords.length)}`);
  if (unmatchedWords.length > 0) {
    console.log(`\n Sample unmatched words (first 20):`);
    for (const { word, pos, cefr } of unmatchedWords.slice(0, 20)) {
      console.log(` "${word}" (${pos}, ${cefr})`);
    }
    if (unmatchedWords.length > 20) {
      console.log(` ... and ${fmt(unmatchedWords.length - 20)} more`);
    }
  }
}
// ────────────────────────────────────────────────────────────
// Main
// ────────────────────────────────────────────────────────────
/**
 * Script entry point: prints a banner, runs enrichLanguage for each supported
 * language one after another, then prints a closing banner.
 */
const main = async () => {
  const rule = "##########################################";

  console.log(rule);
  console.log("lila — CEFR Enrichment");
  console.log(`${rule}\n`);

  // Sequential on purpose: each language's output is printed as one block.
  for (const code of SUPPORTED_LANGUAGE_CODES) {
    await enrichLanguage(code);
  }

  console.log(`\n${rule}`);
  console.log("Done");
  console.log(rule);
};

// Any uncaught failure is logged and turned into a non-zero exit code.
main().catch((failure) => {
  console.error(failure);
  process.exit(1);
});

View file

@ -1,212 +0,0 @@
import fs from "node:fs/promises";
import { and, count, eq, inArray } from "drizzle-orm";
import { SUPPORTED_LANGUAGE_CODES, SUPPORTED_POS } from "@lila/shared";
import { db } from "@lila/db";
import { terms, translations, term_glosses } from "@lila/db/schema";
// Element types of the constant arrays exported by @lila/shared.
type POS = (typeof SUPPORTED_POS)[number];
type LanguageCode = (typeof SUPPORTED_LANGUAGE_CODES)[number];
/** Shape that entries of an `omw-{pos}.json` file are cast to after parsing (not validated at runtime). */
type SynsetRecord = {
// Stored in terms.source_id; together with the source it identifies a term.
source_id: string;
pos: POS;
// Per language, the texts inserted as translation rows for the term.
translations: Partial<Record<LanguageCode, string[]>>;
// Per language, the texts inserted as gloss rows for the term.
glosses: Partial<Record<LanguageCode, string[]>>;
};
// Directory prefix of the input files; relative, so it is resolved against the current working directory.
const dataDir = "./src/data/";
// Number of synsets per processBatch call and maximum rows per insert statement.
const BATCH_SIZE = 500;
// ────────────────────────────────────────────────────────────
// Helpers
// ────────────────────────────────────────────────────────────
/**
 * Splits an array into consecutive slices of at most `size` elements.
 * The last slice may be shorter; an empty input yields an empty result.
 *
 * @param arr - Array to split (not modified).
 * @param size - Maximum slice length; must be a positive integer.
 * @returns The slices, in input order.
 * @throws RangeError when `size` is not a positive integer. Without this
 *   check a size of 0 or below would loop forever and NaN would silently
 *   produce a single empty slice.
 */
function chunk<T>(arr: T[], size: number): T[][] {
  if (!Number.isInteger(size) || size <= 0) {
    throw new RangeError(`chunk size must be a positive integer, got ${size}`);
  }
  const out: T[][] = [];
  for (let start = 0; start < arr.length; start += size) {
    out.push(arr.slice(start, start + size));
  }
  return out;
}
/** Formats a count for log output using en-US conventions (e.g. 12,345). */
function fmt(n: number): string {
  const locale = "en-US";
  return n.toLocaleString(locale);
}
// ────────────────────────────────────────────────────────────
// Stats
// ────────────────────────────────────────────────────────────
// Module-level counters accumulated by processBatch across all files and printed by main.
// "inserted" counts rows returned by an insert; "skipped" counts rows submitted but not returned.
const stats = {
terms: { inserted: 0, skipped: 0 },
translations: { inserted: 0, skipped: 0 },
glosses: { inserted: 0, skipped: 0 },
};
// ────────────────────────────────────────────────────────────
// Per-batch processing
// ────────────────────────────────────────────────────────────
/**
 * Inserts one batch of synsets: first the terms, then their translations and
 * glosses. Every insert uses onConflictDoNothing, and the module-level
 * `stats` counters are updated with the inserted/skipped row counts.
 *
 * @param batch - Synset records to insert; an empty batch is a no-op.
 */
async function processBatch(batch: SynsetRecord[]): Promise<void> {
  // Nothing to do, and an insert or inArray over an empty list is best avoided.
  if (batch.length === 0) return;

  // 1. Insert terms — idempotency key: (source, source_id)
  const termValues = batch.map((r) => ({
    source: "omw" as const,
    source_id: r.source_id,
    pos: r.pos,
  }));
  const insertedTerms = await db
    .insert(terms)
    .values(termValues)
    .onConflictDoNothing()
    .returning({ id: terms.id });
  stats.terms.inserted += insertedTerms.length;
  stats.terms.skipped += batch.length - insertedTerms.length;

  // 2. Resolve IDs for every source_id in this batch (new + pre-existing).
  //    We can't rely solely on the .returning() above because onConflictDoNothing
  //    returns nothing for rows that already existed.
  const sourceIds = batch.map((r) => r.source_id);
  const termRows = await db
    .select({ id: terms.id, source_id: terms.source_id })
    .from(terms)
    .where(and(eq(terms.source, "omw"), inArray(terms.source_id, sourceIds)));
  const sourceIdToTermId = new Map(termRows.map((r) => [r.source_id, r.id]));

  // Flattens one per-language field of every record into insertable rows.
  // Records whose term could not be resolved contribute nothing, and a
  // missing field or language entry is treated as empty for both fields.
  const rowsFor = (field: "translations" | "glosses") =>
    batch.flatMap((r) => {
      const termId = sourceIdToTermId.get(r.source_id);
      if (!termId) return [];
      return Object.entries(r[field] ?? {}).flatMap(([lang, texts]) =>
        (texts ?? []).map((text) => ({
          term_id: termId,
          language_code: lang as LanguageCode,
          text,
        })),
      );
    });

  // 3. Build and insert translation rows
  for (const tBatch of chunk(rowsFor("translations"), BATCH_SIZE)) {
    const inserted = await db
      .insert(translations)
      .values(tBatch)
      .onConflictDoNothing()
      .returning({ id: translations.id });
    stats.translations.inserted += inserted.length;
    stats.translations.skipped += tBatch.length - inserted.length;
  }

  // 4. Build and insert gloss rows
  for (const gBatch of chunk(rowsFor("glosses"), BATCH_SIZE)) {
    const inserted = await db
      .insert(term_glosses)
      .values(gBatch)
      .onConflictDoNothing()
      .returning({ id: term_glosses.id });
    stats.glosses.inserted += inserted.length;
    stats.glosses.skipped += gBatch.length - inserted.length;
  }
}
// ────────────────────────────────────────────────────────────
// Main
// ────────────────────────────────────────────────────────────
/**
 * Script entry point. For each supported part of speech it loads
 * `omw-{pos}.json`, feeds the synsets to processBatch in batches of
 * BATCH_SIZE, and finally prints the insert counters plus the per-language
 * row totals currently in the database. A file that cannot be read or parsed
 * is skipped with a warning.
 */
const main = async () => {
  const rule = "##########################################";

  console.log(`\n${rule}`);
  console.log("lila — OMW seed");
  console.log(`${rule}\n`);

  // One file per POS — the name is derived from the POS value, so a new
  // entry in SUPPORTED_POS automatically picks up a new file on the next run.
  for (const pos of SUPPORTED_POS) {
    const filename = `omw-${pos}.json`;
    const filepath = dataDir + filename;
    console.log(`📄 ${filename}`);

    let records: SynsetRecord[];
    try {
      const raw = await fs.readFile(filepath, "utf8");
      records = JSON.parse(raw) as SynsetRecord[];
    } catch (e) {
      console.warn(
        ` ⚠️ Skipping — could not read file: ${(e as Error).message}\n`,
      );
      continue;
    }
    console.log(` Loaded ${fmt(records.length)} synsets`);

    for (const [batchIndex, batch] of chunk(records, BATCH_SIZE).entries()) {
      // Progress line before every 10th batch (except the very first one).
      const atProgressStep = batchIndex > 0 && batchIndex % 10 === 0;
      if (atProgressStep) {
        console.log(
          `${fmt(batchIndex * BATCH_SIZE)} / ${fmt(records.length)}`,
        );
      }
      await processBatch(batch);
    }
    console.log(` ✅ Done\n`);
  }

  // ── Summary ───────────────────────────────────────────────
  console.log(rule);
  console.log("Summary");
  console.log(`${rule}\n`);

  const summaryRows = [
    ["Terms:", stats.terms],
    ["Translations:", stats.translations],
    ["Glosses:", stats.glosses],
  ] as const;
  for (const [label, counter] of summaryRows) {
    // Labels are padded to 14 characters so the columns line up.
    console.log(
      `${label.padEnd(14)}inserted ${fmt(counter.inserted)}, skipped ${fmt(counter.skipped)}`,
    );
  }

  // Query actual DB totals — insert-based counters show 0 on re-runs.
  console.log("\nCoverage per language (total in DB):");
  for (const lang of SUPPORTED_LANGUAGE_CODES) {
    const [translationTotal] = await db
      .select({ n: count() })
      .from(translations)
      .where(eq(translations.language_code, lang));
    const [glossTotal] = await db
      .select({ n: count() })
      .from(term_glosses)
      .where(eq(term_glosses.language_code, lang));
    console.log(
      ` ${lang}: ${fmt(translationTotal?.n ?? 0)} translations, ${fmt(glossTotal?.n ?? 0)} glosses`,
    );
  }
};

// Any uncaught failure is logged and turned into a non-zero exit code.
main().catch((failure) => {
  console.error(failure);
  process.exit(1);
});

View file

@ -5,7 +5,11 @@
"moduleResolution": "NodeNext",
"outDir": "./dist",
"resolveJsonModule": true,
"types": ["vitest/globals"]
"types": ["vitest/globals"],
},
"include": ["src", "vitest.config.ts"]
"include": [
"src",
"vitest.config.ts",
"../../data-pipeline/archive/packages-db-src-old-seeding-scripts/data",
],
}