archiving old seeding scripts, removing them from package.json scripts

2026-04-20 10:10:28 +02:00 · 2026-04-20 10:10:28 +02:00 · e718d188d5
commit e718d188d5
parent a3d19d36f6
10 changed files with 7 additions and 1057435 deletions
--- a/packages/db/package.json
+++ b/packages/db/package.json
@@ -6,9 +6,7 @@
  "scripts": {
    "build": "tsc",
    "generate": "drizzle-kit generate",
-    "migrate": "drizzle-kit migrate",
-    "db:seed": "npx tsx src/seeding-datafiles.ts",
-    "db:build-deck": "npx tsx src/generating-deck.ts"
+    "migrate": "drizzle-kit migrate"
  },
  "dependencies": {
    "@lila/shared": "workspace:*",
--- a/packages/db/src/checking-cefr-coverage.ts
+++ b/packages/db/src/checking-cefr-coverage.ts
@@ -1,183 +0,0 @@
-/*
-
-This script performs a cross-reference check between two specific data sets:
-
-    - The "Target" List: It reads the {language}-merged.json file (e.g., en-merged.json). This represents the vocabulary you want to have CEFR levels for.
-    - The "Source of Truth": It queries your Database (translations table). This represents the vocabulary you currently have in your app.
-
-What it calculates:
-It tells you: "Of all the words in my merged JSON file, how many actually exist in my database?"
-
-    Matched: The word from the JSON file was found in the DB. (Ready for enrichment).
-    Unmatched: The word from the JSON file was not found in the DB. (These will be skipped during enrichment).
-
-*/
-
-import fs from "node:fs/promises";
-import { eq } from "drizzle-orm";
-
-import {
-  SUPPORTED_LANGUAGE_CODES,
-  SUPPORTED_POS,
-  CEFR_LEVELS,
-  DIFFICULTY_LEVELS,
-} from "@lila/shared";
-import { db } from "@lila/db";
-import { terms, translations } from "@lila/db/schema";
-
-type POS = (typeof SUPPORTED_POS)[number];
-type LanguageCode = (typeof SUPPORTED_LANGUAGE_CODES)[number];
-type CEFRLevel = (typeof CEFR_LEVELS)[number];
-type Difficulty = (typeof DIFFICULTY_LEVELS)[number];
-
-type MergedRecord = {
-  word: string;
-  pos: POS;
-  cefr: CEFRLevel;
-  difficulty: Difficulty;
-  sources: string[];
-};
-
-type CoverageStats = {
-  total: number;
-  matched: number;
-  unmatched: number;
-  byCefr: Record<CEFRLevel, { total: number; matched: number }>;
-  byDifficulty: Record<Difficulty, { total: number; matched: number }>;
-  unmatchedWords: Array<{ word: string; pos: POS; cefr: CEFRLevel }>;
-};
-
-const dataDir = "./src/data/";
-
-async function checkCoverage(language: LanguageCode): Promise<void> {
-  const filename = `${language}-merged.json`;
-  const filepath = dataDir + filename;
-
-  console.log(`\n📄 Checking ${filename}...`);
-
-  // Load merged data
-  let records: MergedRecord[];
-  try {
-    const raw = await fs.readFile(filepath, "utf8");
-    records = JSON.parse(raw) as MergedRecord[];
-  } catch (e) {
-    console.warn(`   ⚠️  Could not read file: ${(e as Error).message}`);
-    return;
-  }
-
-  console.log(`   Loaded ${records.length.toLocaleString("en-US")} entries`);
-
-  // Initialize stats
-  const stats: CoverageStats = {
-    total: records.length,
-    matched: 0,
-    unmatched: 0,
-    byCefr: {} as Record<CEFRLevel, { total: number; matched: number }>,
-    byDifficulty: {} as Record<Difficulty, { total: number; matched: number }>,
-    unmatchedWords: [],
-  };
-
-  for (const level of CEFR_LEVELS)
-    stats.byCefr[level] = { total: 0, matched: 0 };
-  for (const diff of DIFFICULTY_LEVELS)
-    stats.byDifficulty[diff] = { total: 0, matched: 0 };
-
-  // ── BATCHED LOOKUP: Build a Set of existing (word, pos) pairs in DB ──
-  console.log(`   🔍 Querying database for existing translations...`);
-
-  // Get all existing translations for this language + POS combo
-  const existingRows = await db
-    .select({ text: translations.text, pos: terms.pos })
-    .from(translations)
-    .innerJoin(terms, eq(translations.term_id, terms.id))
-    .where(eq(translations.language_code, language));
-
-  // Create a Set for O(1) lookup: "word|pos" -> true
-  const existingSet = new Set(
-    existingRows.map((row) => `${row.text.toLowerCase()}|${row.pos}`),
-  );
-
-  // ── Process records against the in-memory Set ──
-  for (const record of records) {
-    stats.byCefr[record.cefr].total++;
-    stats.byDifficulty[record.difficulty].total++;
-
-    const key = `${record.word.toLowerCase()}|${record.pos}`;
-
-    if (existingSet.has(key)) {
-      stats.matched++;
-      stats.byCefr[record.cefr].matched++;
-      stats.byDifficulty[record.difficulty].matched++;
-    } else {
-      stats.unmatched++;
-      if (stats.unmatchedWords.length < 20) {
-        stats.unmatchedWords.push({
-          word: record.word,
-          pos: record.pos,
-          cefr: record.cefr,
-        });
-      }
-    }
-  }
-
-  // ── Print results (same as your draft) ──
-  console.log(`\n📊 Coverage for ${language}:`);
-  console.log(`   Total entries:    ${stats.total.toLocaleString("en-US")}`);
-  console.log(
-    `   Matched in DB:    ${stats.matched.toLocaleString("en-US")} (${((stats.matched / stats.total) * 100).toFixed(1)}%)`,
-  );
-  console.log(
-    `   Unmatched:        ${stats.unmatched.toLocaleString("en-US")} (${((stats.unmatched / stats.total) * 100).toFixed(1)}%)`,
-  );
-
-  console.log(`\n   By CEFR level:`);
-  for (const level of CEFR_LEVELS) {
-    const { total, matched } = stats.byCefr[level];
-    if (total > 0) {
-      const pct = ((matched / total) * 100).toFixed(1);
-      console.log(
-        `      ${level}: ${matched.toLocaleString("en-US")}/${total.toLocaleString("en-US")} (${pct}%)`,
-      );
-    }
-  }
-
-  console.log(`\n   By difficulty:`);
-  for (const diff of DIFFICULTY_LEVELS) {
-    const { total, matched } = stats.byDifficulty[diff];
-    if (total > 0) {
-      const pct = ((matched / total) * 100).toFixed(1);
-      console.log(
-        `      ${diff}: ${matched.toLocaleString("en-US")}/${total.toLocaleString("en-US")} (${pct}%)`,
-      );
-    }
-  }
-
-  if (stats.unmatchedWords.length > 0) {
-    console.log(`\n⚠️  Sample unmatched words (first 20):`);
-    for (const { word, pos, cefr } of stats.unmatchedWords) {
-      console.log(`      "${word}" (${pos}, ${cefr})`);
-    }
-    if (stats.unmatched > 20) {
-      console.log(`      ... and ${stats.unmatched - 20} more`);
-    }
-  }
-}
-
-const main = async () => {
-  console.log("##########################################");
-  console.log("lila — CEFR Coverage Check");
-  console.log("##########################################");
-
-  for (const language of SUPPORTED_LANGUAGE_CODES) {
-    await checkCoverage(language);
-  }
-
-  console.log("\n##########################################");
-  console.log("Done");
-  console.log("##########################################");
-};
-
-main().catch((err) => {
-  console.error(err);
-  process.exit(1);
-});
--- a/packages/db/src/data/en-merged.json
+++ b/packages/db/src/data/en-merged.json
--- a/packages/db/src/data/it-merged.json
+++ b/packages/db/src/data/it-merged.json
--- a/packages/db/src/data/omw-noun.json
+++ b/packages/db/src/data/omw-noun.json
--- a/packages/db/src/data/omw-verb.json
+++ b/packages/db/src/data/omw-verb.json
--- a/packages/db/src/generating-deck.ts
+++ b/packages/db/src/generating-deck.ts
@@ -1,211 +0,0 @@
-import fs from "node:fs/promises";
-import { db } from "@lila/db";
-import { translations, terms, decks, deck_terms } from "@lila/db/schema";
-import { inArray, and, eq, ne, countDistinct } from "drizzle-orm";
-
-type DbOrTx = Parameters<Parameters<typeof db.transaction>[0]>[0];
-
-const config = {
-  pathToWordlist: "./src/data/wordlists/top1000englishnouns",
-  deckName: "top english nouns",
-  deckDescription: "Most frequently used English nouns for vocabulary practice",
-  sourceLanguage: "en",
-  sourcePOS: "noun",
-} as const;
-
-const readWordList = async () => {
-  const raw = await fs.readFile(config.pathToWordlist, "utf8");
-  const words = [
-    ...new Set(
-      raw
-        .split("\n")
-        .map((w) => w.trim().toLowerCase())
-        .filter(Boolean),
-    ),
-  ];
-  return words;
-};
-
-const resolveSourceTerms = async (words: string[]) => {
-  const rows = await db
-    .select({ text: translations.text, termId: translations.term_id })
-    .from(translations)
-    .innerJoin(terms, eq(translations.term_id, terms.id))
-    .where(
-      and(
-        inArray(translations.text, words),
-        eq(translations.language_code, config.sourceLanguage),
-        eq(terms.pos, config.sourcePOS),
-      ),
-    );
-
-  const wordToTermIds = new Map<string, string[]>();
-  for (const row of rows) {
-    const word = row.text.toLowerCase();
-
-    if (!wordToTermIds.has(word)) {
-      wordToTermIds.set(word, []);
-    }
-    wordToTermIds.get(word)!.push(row.termId);
-  }
-  // Deduplicate: multiple words can map to the same term ID (e.g. via synonyms)
-  const termIds = [...new Set(Array.from(wordToTermIds.values()).flat())];
-  const missingWords = words.filter((w) => !wordToTermIds.has(w));
-
-  return { termIds, missingWords };
-};
-
-const writeMissingWordsToFile = async (missingWords: string[]) => {
-  const outputPath = `${config.pathToWordlist}-missing`;
-  await fs.writeFile(outputPath, missingWords.join("\n"), "utf8");
-};
-
-const validateLanguages = async (sourceLanguage: string, termIds: string[]) => {
-  const coverage = await db
-    .select({
-      language: translations.language_code,
-      coveredCount: countDistinct(translations.term_id),
-    })
-    .from(translations)
-    .where(
-      and(
-        inArray(translations.term_id, termIds),
-        ne(translations.language_code, sourceLanguage),
-      ),
-    )
-    .groupBy(translations.language_code);
-
-  const validatedLanguages = coverage
-    .filter((row) => Number(row.coveredCount) === termIds.length)
-    .map((row) => row.language);
-
-  return { coverage, validatedLanguages };
-};
-
-const findExistingDeck = async (tx: DbOrTx) => {
-  const existing = await tx
-    .select({ id: decks.id, validatedForLanguages: decks.validated_languages })
-    .from(decks)
-    .where(
-      and(
-        eq(decks.name, config.deckName),
-        eq(decks.source_language, config.sourceLanguage),
-      ),
-    );
-  return existing[0] ?? null;
-};
-
-const createDeck = async (tx: DbOrTx, validatedLanguages: string[]) => {
-  const result = await tx
-    .insert(decks)
-    .values({
-      name: config.deckName,
-      description: config.deckDescription,
-      source_language: config.sourceLanguage,
-      validated_languages: validatedLanguages,
-      type: "core",
-    })
-    .returning({ id: decks.id });
-  const created = result[0];
-  if (!created) throw new Error("Failed to create deck: no row returned");
-  return created.id;
-};
-
-const addTermsToDeck = async (
-  tx: DbOrTx,
-  deckId: string,
-  termIds: string[],
-): Promise<number> => {
-  if (termIds.length === 0) return 0;
-
-  await tx
-    .insert(deck_terms)
-    .values(termIds.map((termId) => ({ deck_id: deckId, term_id: termId })))
-    .onConflictDoNothing();
-
-  return termIds.length;
-};
-
-const updateValidatedLanguages = async (
-  tx: DbOrTx,
-  deckId: string,
-  validatedLanguages: string[],
-): Promise<void> => {
-  await tx
-    .update(decks)
-    .set({ validated_languages: validatedLanguages })
-    .where(eq(decks.id, deckId));
-};
-
-const main = async () => {
-  console.log("📖 Reading word list...");
-  const sourceWords = await readWordList();
-  console.log(`   ${sourceWords.length} words loaded\n`);
-
-  console.log("🔍 Checking against database...");
-  const { termIds, missingWords } = await resolveSourceTerms(sourceWords);
-  console.log(`   ${termIds.length} terms found`);
-  console.log(`   ${missingWords.length} words not found in DB\n`);
-
-  console.log("🖊️ Writing missing words to file...\n");
-  await writeMissingWordsToFile(missingWords);
-
-  console.log("✅ Validating languages...");
-  const { coverage, validatedLanguages } = await validateLanguages(
-    config.sourceLanguage,
-    termIds,
-  );
-  console.log(
-    `   Validated languages: ${JSON.stringify(validatedLanguages)}\n`,
-  );
-
-  console.log("🔬 Language coverage breakdown...");
-  for (const row of coverage) {
-    console.log(
-      `  ${row.language}: ${row.coveredCount} / ${termIds.length} terms covered`,
-    );
-  }
-
-  console.log("🃏 Looking for existing deck...");
-  const addedCount = await db.transaction(async (tx) => {
-    const existingDeck = await findExistingDeck(tx);
-    const deckId = existingDeck
-      ? existingDeck.id
-      : await createDeck(tx, validatedLanguages);
-
-    const addedCount = await addTermsToDeck(tx, deckId, termIds);
-
-    const currentLanguages = existingDeck?.validatedForLanguages ?? [];
-    const hasChanged =
-      JSON.stringify([...currentLanguages].sort()) !==
-      JSON.stringify([...validatedLanguages].sort());
-
-    if (hasChanged) {
-      await updateValidatedLanguages(tx, deckId, validatedLanguages);
-    }
-
-    return addedCount;
-  });
-  const alreadyPresentCount = termIds.length - addedCount;
-
-  console.log("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
-  console.log("📊 Summary");
-  console.log("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
-  console.log(`   Words loaded from wordlist : ${sourceWords.length}`);
-  console.log(
-    `   Words matched in DB        : ${sourceWords.length - missingWords.length}`,
-  );
-  console.log(`   Words not found in DB      : ${missingWords.length}`);
-  console.log(`   Term IDs resolved          : ${termIds.length}`);
-  console.log(`   Terms added to deck        : ${addedCount}`);
-  console.log(`   Terms already in deck      : ${alreadyPresentCount}`);
-  console.log(
-    `   Validated languages        : ${validatedLanguages.length > 0 ? validatedLanguages.join(", ") : "none"}`,
-  );
-  console.log("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
-};
-
-main().catch((error) => {
-  console.error(error);
-  process.exit(1);
-});
--- a/packages/db/src/seeding-cefr-levels.ts
+++ b/packages/db/src/seeding-cefr-levels.ts
@@ -1,148 +0,0 @@
-import fs from "node:fs/promises";
-import { eq, inArray } from "drizzle-orm";
-
-import {
-  SUPPORTED_LANGUAGE_CODES,
-  SUPPORTED_POS,
-  CEFR_LEVELS,
-  DIFFICULTY_LEVELS,
-} from "@lila/shared";
-import { db } from "@lila/db";
-import { translations, terms } from "@lila/db/schema";
-
-type POS = (typeof SUPPORTED_POS)[number];
-type LanguageCode = (typeof SUPPORTED_LANGUAGE_CODES)[number];
-type CEFRLevel = (typeof CEFR_LEVELS)[number];
-type Difficulty = (typeof DIFFICULTY_LEVELS)[number];
-
-type MergedRecord = {
-  word: string;
-  pos: POS;
-  cefr: CEFRLevel;
-  difficulty: Difficulty;
-  sources: string[];
-};
-
-const dataDir = "./src/data/";
-const BATCH_SIZE = 500;
-
-// ────────────────────────────────────────────────────────────
-// Helpers
-// ────────────────────────────────────────────────────────────
-
-function chunk<T>(arr: T[], size: number): T[][] {
-  const out: T[][] = [];
-  for (let i = 0; i < arr.length; i += size) out.push(arr.slice(i, i + size));
-  return out;
-}
-
-function fmt(n: number): string {
-  return n.toLocaleString("en-US");
-}
-
-// ────────────────────────────────────────────────────────────
-// Enrichment per language
-// ────────────────────────────────────────────────────────────
-
-async function enrichLanguage(language: LanguageCode): Promise<void> {
-  const filename = `${language}-merged.json`;
-  const filepath = dataDir + filename;
-
-  console.log(`\n📝 Enriching ${filename}...`);
-
-  let records: MergedRecord[];
-  try {
-    const raw = await fs.readFile(filepath, "utf8");
-    records = JSON.parse(raw) as MergedRecord[];
-  } catch (e) {
-    console.warn(`   ⚠️  Could not read file: ${(e as Error).message}`);
-    return;
-  }
-
-  console.log(`   Loaded ${fmt(records.length)} entries`);
-
-  // 1. Bulk fetch existing translations for this language
-  console.log(`   🔍 Fetching existing translations from DB...`);
-  const existingTranslations = await db
-    .select({ id: translations.id, text: translations.text, pos: terms.pos })
-    .from(translations)
-    .innerJoin(terms, eq(translations.term_id, terms.id))
-    .where(eq(translations.language_code, language));
-
-  // 2. Build lookup map: "lowercase_word|pos" -> translation IDs
-  const translationMap = new Map<string, string[]>();
-  for (const t of existingTranslations) {
-    const key = `${t.text.toLowerCase()}|${t.pos}`;
-    if (!translationMap.has(key)) translationMap.set(key, []);
-    translationMap.get(key)!.push(t.id);
-  }
-
-  // 3. Match records to DB IDs and group by target (cefr, difficulty)
-  const updatesByValue = new Map<string, string[]>();
-  const unmatchedWords: Array<{ word: string; pos: POS; cefr: CEFRLevel }> = [];
-
-  for (const rec of records) {
-    const key = `${rec.word.toLowerCase()}|${rec.pos}`;
-    const ids = translationMap.get(key);
-
-    if (ids && ids.length > 0) {
-      const valueKey = `${rec.cefr}|${rec.difficulty}`;
-      if (!updatesByValue.has(valueKey)) updatesByValue.set(valueKey, []);
-      updatesByValue.get(valueKey)!.push(...ids);
-    } else {
-      unmatchedWords.push({ word: rec.word, pos: rec.pos, cefr: rec.cefr });
-    }
-  }
-
-  // 4. Batch updates grouped by (cefr, difficulty)
-  let totalUpdated = 0;
-  for (const [valueKey, ids] of updatesByValue.entries()) {
-    const [cefr, difficulty] = valueKey.split("|") as [CEFRLevel, Difficulty];
-    const uniqueIds = [...new Set(ids)]; // Deduplicate synonyms/duplicates
-
-    for (const idBatch of chunk(uniqueIds, BATCH_SIZE)) {
-      await db
-        .update(translations)
-        .set({ cefr_level: cefr, difficulty })
-        .where(inArray(translations.id, idBatch));
-      totalUpdated += idBatch.length;
-    }
-  }
-
-  // 5. Summary
-  console.log(`\n   ✅ Updated ${fmt(totalUpdated)} translations`);
-  console.log(`   ⚠️  Unmatched: ${fmt(unmatchedWords.length)}`);
-
-  if (unmatchedWords.length > 0) {
-    console.log(`\n   Sample unmatched words (first 20):`);
-    for (const { word, pos, cefr } of unmatchedWords.slice(0, 20)) {
-      console.log(`      "${word}" (${pos}, ${cefr})`);
-    }
-    if (unmatchedWords.length > 20) {
-      console.log(`      ... and ${fmt(unmatchedWords.length - 20)} more`);
-    }
-  }
-}
-
-// ────────────────────────────────────────────────────────────
-// Main
-// ────────────────────────────────────────────────────────────
-
-const main = async () => {
-  console.log("##########################################");
-  console.log("lila — CEFR Enrichment");
-  console.log("##########################################\n");
-
-  for (const lang of SUPPORTED_LANGUAGE_CODES) {
-    await enrichLanguage(lang);
-  }
-
-  console.log("\n##########################################");
-  console.log("Done");
-  console.log("##########################################");
-};
-
-main().catch((err) => {
-  console.error(err);
-  process.exit(1);
-});
--- a/packages/db/src/seeding-datafiles.ts
+++ b/packages/db/src/seeding-datafiles.ts
@@ -1,212 +0,0 @@
-import fs from "node:fs/promises";
-import { and, count, eq, inArray } from "drizzle-orm";
-
-import { SUPPORTED_LANGUAGE_CODES, SUPPORTED_POS } from "@lila/shared";
-import { db } from "@lila/db";
-import { terms, translations, term_glosses } from "@lila/db/schema";
-
-type POS = (typeof SUPPORTED_POS)[number];
-type LanguageCode = (typeof SUPPORTED_LANGUAGE_CODES)[number];
-
-type SynsetRecord = {
-  source_id: string;
-  pos: POS;
-  translations: Partial<Record<LanguageCode, string[]>>;
-  glosses: Partial<Record<LanguageCode, string[]>>;
-};
-
-const dataDir = "./src/data/";
-const BATCH_SIZE = 500;
-
-// ────────────────────────────────────────────────────────────
-// Helpers
-// ────────────────────────────────────────────────────────────
-
-function chunk<T>(arr: T[], size: number): T[][] {
-  const out: T[][] = [];
-  for (let i = 0; i < arr.length; i += size) out.push(arr.slice(i, i + size));
-  return out;
-}
-
-function fmt(n: number): string {
-  return n.toLocaleString("en-US");
-}
-
-// ────────────────────────────────────────────────────────────
-// Stats
-// ────────────────────────────────────────────────────────────
-
-const stats = {
-  terms: { inserted: 0, skipped: 0 },
-  translations: { inserted: 0, skipped: 0 },
-  glosses: { inserted: 0, skipped: 0 },
-};
-
-// ────────────────────────────────────────────────────────────
-// Per-batch processing
-// ────────────────────────────────────────────────────────────
-
-async function processBatch(batch: SynsetRecord[]): Promise<void> {
-  // 1. Insert terms — idempotency key: (source, source_id)
-  const termValues = batch.map((r) => ({
-    source: "omw" as const,
-    source_id: r.source_id,
-    pos: r.pos,
-  }));
-
-  const insertedTerms = await db
-    .insert(terms)
-    .values(termValues)
-    .onConflictDoNothing()
-    .returning({ id: terms.id });
-
-  stats.terms.inserted += insertedTerms.length;
-  stats.terms.skipped += batch.length - insertedTerms.length;
-
-  // 2. Resolve UUIDs for every source_id in this batch (new + pre-existing).
-  //    We can't rely solely on the .returning() above because onConflictDoNothing
-  //    returns nothing for rows that already existed.
-  const sourceIds = batch.map((r) => r.source_id);
-  const termRows = await db
-    .select({ id: terms.id, source_id: terms.source_id })
-    .from(terms)
-    .where(and(eq(terms.source, "omw"), inArray(terms.source_id, sourceIds)));
-
-  const sourceIdToTermId = new Map(termRows.map((r) => [r.source_id, r.id]));
-
-  // 3. Build and insert translation rows
-  const translationRows = batch.flatMap((r) => {
-    const termId = sourceIdToTermId.get(r.source_id);
-    if (!termId) return [];
-    return Object.entries(r.translations).flatMap(([lang, lemmas]) =>
-      (lemmas ?? []).map((text) => ({
-        term_id: termId,
-        language_code: lang as LanguageCode,
-        text,
-      })),
-    );
-  });
-
-  for (const tBatch of chunk(translationRows, BATCH_SIZE)) {
-    const inserted = await db
-      .insert(translations)
-      .values(tBatch)
-      .onConflictDoNothing()
-      .returning({ id: translations.id });
-
-    stats.translations.inserted += inserted.length;
-    stats.translations.skipped += tBatch.length - inserted.length;
-  }
-
-  // 4. Build and insert gloss rows
-  const glossRows = batch.flatMap((r) => {
-    const termId = sourceIdToTermId.get(r.source_id);
-    if (!termId) return [];
-    return Object.entries(r.glosses ?? {}).flatMap(([lang, texts]) =>
-      (texts ?? []).map((text) => ({
-        term_id: termId,
-        language_code: lang as LanguageCode,
-        text,
-      })),
-    );
-  });
-
-  for (const gBatch of chunk(glossRows, BATCH_SIZE)) {
-    const inserted = await db
-      .insert(term_glosses)
-      .values(gBatch)
-      .onConflictDoNothing()
-      .returning({ id: term_glosses.id });
-
-    stats.glosses.inserted += inserted.length;
-    stats.glosses.skipped += gBatch.length - inserted.length;
-  }
-}
-
-// ────────────────────────────────────────────────────────────
-// Main
-// ────────────────────────────────────────────────────────────
-
-const main = async () => {
-  console.log("\n##########################################");
-  console.log("lila — OMW seed");
-  console.log("##########################################\n");
-
-  // One file per POS — names are derived from SUPPORTED_POS so adding a new
-  // constant value automatically picks up a new file on the next run.
-  const posToFile = Object.fromEntries(
-    SUPPORTED_POS.map((pos) => [pos, `omw-${pos}.json`]),
-  ) as Record<POS, string>;
-
-  for (const pos of SUPPORTED_POS) {
-    const filename = posToFile[pos];
-    const filepath = dataDir + filename;
-
-    console.log(`📄 ${filename}`);
-
-    let records: SynsetRecord[];
-    try {
-      const raw = await fs.readFile(filepath, "utf8");
-      records = JSON.parse(raw) as SynsetRecord[];
-    } catch (e) {
-      console.warn(
-        `   ⚠️  Skipping — could not read file: ${(e as Error).message}\n`,
-      );
-      continue;
-    }
-
-    console.log(`   Loaded ${fmt(records.length)} synsets`);
-
-    const batches = chunk(records, BATCH_SIZE);
-
-    for (const [i, batch] of batches.entries()) {
-      // Progress every 5 000 synsets
-      if (i > 0 && i % 10 === 0) {
-        const processed = i * BATCH_SIZE;
-        console.log(`   ⏳ ${fmt(processed)} / ${fmt(records.length)}`);
-      }
-      await processBatch(batch);
-    }
-
-    console.log(`   ✅ Done\n`);
-  }
-
-  // ── Summary ───────────────────────────────────────────────
-
-  console.log("##########################################");
-  console.log("Summary");
-  console.log("##########################################\n");
-
-  const pad = (label: string) => label.padEnd(14);
-
-  console.log(
-    `${pad("Terms:")}inserted ${fmt(stats.terms.inserted)}, skipped ${fmt(stats.terms.skipped)}`,
-  );
-  console.log(
-    `${pad("Translations:")}inserted ${fmt(stats.translations.inserted)}, skipped ${fmt(stats.translations.skipped)}`,
-  );
-  console.log(
-    `${pad("Glosses:")}inserted ${fmt(stats.glosses.inserted)}, skipped ${fmt(stats.glosses.skipped)}`,
-  );
-
-  // Query actual DB totals — insert-based counters show 0 on re-runs.
-  console.log("\nCoverage per language (total in DB):");
-  for (const lang of SUPPORTED_LANGUAGE_CODES) {
-    const [tRow] = await db
-      .select({ n: count() })
-      .from(translations)
-      .where(eq(translations.language_code, lang));
-    const [gRow] = await db
-      .select({ n: count() })
-      .from(term_glosses)
-      .where(eq(term_glosses.language_code, lang));
-    console.log(
-      `  ${lang}: ${fmt(tRow?.n ?? 0)} translations, ${fmt(gRow?.n ?? 0)} glosses`,
-    );
-  }
-};
-
-main().catch((err) => {
-  console.error(err);
-  process.exit(1);
-});
--- a/packages/db/tsconfig.json
+++ b/packages/db/tsconfig.json
@@ -5,7 +5,11 @@
    "moduleResolution": "NodeNext",
    "outDir": "./dist",
    "resolveJsonModule": true,
-    "types": ["vitest/globals"]
+    "types": ["vitest/globals"],
  },
-  "include": ["src", "vitest.config.ts"]
+  "include": [
+    "src",
+    "vitest.config.ts",
+    "../../data-pipeline/archive/packages-db-src-old-seeding-scripts/data",
+  ],
 }