Add a script to check CEFR coverage between the JSON files and the database, and a script to write CEFR levels from the JSON files to the database

2026-04-09 10:25:20 +02:00 · 2026-04-09 10:25:20 +02:00 · 13cc709b09
commit 13cc709b09
parent 3374bd8b20
7 changed files with 279296 additions and 1 deletions
--- a/packages/db/src/seeding-cefr-levels.ts
+++ b/packages/db/src/seeding-cefr-levels.ts
@ -0,0 +1,148 @@
+import fs from "node:fs/promises";
+import { eq, inArray } from "drizzle-orm";
+
+import {
+  SUPPORTED_LANGUAGE_CODES,
+  SUPPORTED_POS,
+  CEFR_LEVELS,
+  DIFFICULTY_LEVELS,
+} from "@glossa/shared";
+import { db } from "@glossa/db";
+import { translations, terms } from "@glossa/db/schema";
+
+// Union types derived from the constant lists exported by @glossa/shared, so
+// the allowed values are declared in one place only.
+type POS = (typeof SUPPORTED_POS)[number];
+type LanguageCode = (typeof SUPPORTED_LANGUAGE_CODES)[number];
+type CEFRLevel = (typeof CEFR_LEVELS)[number];
+type Difficulty = (typeof DIFFICULTY_LEVELS)[number];
+
+/** One entry of a `<language>-merged.json` file, as consumed by enrichLanguage. */
+type MergedRecord = {
+  word: string; // compared case-insensitively with translations.text
+  pos: POS; // compared with terms.pos of the translation's term
+  cefr: CEFRLevel; // value written to translations.cefr_level
+  difficulty: Difficulty; // value written to translations.difficulty
+  sources: string[]; // not read by this script; presumably the origin datasets of the entry (TODO confirm)
+};
+
+// Directory containing the per-language merged JSON files. The path is
+// relative, so it is resolved against the process's current working directory.
+const dataDir = "./src/data/";
+// Upper bound on the number of translation IDs passed to one `inArray` update.
+const BATCH_SIZE = 500;
+
+// ────────────────────────────────────────────────────────────
+// Helpers
+// ────────────────────────────────────────────────────────────
+
+/**
+ * Splits `arr` into consecutive slices of at most `size` elements, preserving
+ * order. The last slice may be shorter; an empty input yields an empty result.
+ * The input array is not modified.
+ *
+ * @param arr  Items to split.
+ * @param size Maximum number of items per slice; must be a positive integer.
+ * @returns The slices, in input order.
+ * @throws RangeError if `size` is not a positive integer. Without this guard a
+ *   size of 0 or below would never advance the loop index and loop forever.
+ */
+function chunk<T>(arr: readonly T[], size: number): T[][] {
+  if (!Number.isInteger(size) || size <= 0) {
+    throw new RangeError(`chunk size must be a positive integer, got ${size}`);
+  }
+
+  const out: T[][] = [];
+  for (let i = 0; i < arr.length; i += size) out.push(arr.slice(i, i + size));
+  return out;
+}
+
+/** Renders `n` with en-US number formatting, e.g. 1234567 becomes "1,234,567". */
+function fmt(n: number): string {
+  const usFormatter = new Intl.NumberFormat("en-US");
+  return usFormatter.format(n);
+}
+
+// ────────────────────────────────────────────────────────────
+// Enrichment per language
+// ────────────────────────────────────────────────────────────
+
+/**
+ * Runtime check that `value` really has the shape declared by MergedRecord:
+ * a string `word` plus `pos`, `cefr` and `difficulty` values that appear in
+ * the corresponding shared constant lists. `sources` is not checked because
+ * this script never reads it.
+ */
+function isMergedRecord(value: unknown): value is MergedRecord {
+  if (typeof value !== "object" || value === null) return false;
+  const rec = value as Record<string, unknown>;
+  return (
+    typeof rec.word === "string" &&
+    (SUPPORTED_POS as readonly unknown[]).includes(rec.pos) &&
+    (CEFR_LEVELS as readonly unknown[]).includes(rec.cefr) &&
+    (DIFFICULTY_LEVELS as readonly unknown[]).includes(rec.difficulty)
+  );
+}
+
+/**
+ * Copies CEFR level and difficulty from `<language>-merged.json` onto the
+ * matching rows of the `translations` table.
+ *
+ * A JSON entry matches a translation when the lower-cased word equals the
+ * lower-cased translation text and the entry's `pos` equals the `pos` of the
+ * translation's term. One entry may match several translations; all of them
+ * are updated. If several entries match the same translation with different
+ * values, the entry that appears last in the file wins and the collision is
+ * reported, so every translation is written exactly once.
+ *
+ * Progress, the number of updated rows and a sample of unmatched entries are
+ * printed to the console. A missing, unreadable or non-array JSON file is
+ * reported as a warning and the language is skipped.
+ *
+ * @param language Language whose merged JSON file and translations to process.
+ * @returns Resolves once all update batches for the language have completed.
+ *   Database errors are not caught here and reject the returned promise.
+ */
+async function enrichLanguage(language: LanguageCode): Promise<void> {
+  const filename = `${language}-merged.json`;
+  const filepath = dataDir + filename;
+
+  console.log(`\n📝 Enriching ${filename}...`);
+
+  let parsed: unknown;
+  try {
+    const raw = await fs.readFile(filepath, "utf8");
+    parsed = JSON.parse(raw);
+  } catch (e) {
+    // Anything can be thrown in JavaScript, so do not assume an Error instance.
+    const reason = e instanceof Error ? e.message : String(e);
+    console.warn(`   ⚠️  Could not read file: ${reason}`);
+    return;
+  }
+
+  if (!Array.isArray(parsed)) {
+    console.warn(`   ⚠️  Could not read file: expected a JSON array of records`);
+    return;
+  }
+
+  // Keep only well-formed entries; a malformed one would otherwise throw in
+  // the matching loop or push an invalid value into the database.
+  const entries: unknown[] = parsed;
+  const records: MergedRecord[] = [];
+  let invalidCount = 0;
+  for (const entry of entries) {
+    if (isMergedRecord(entry)) records.push(entry);
+    else invalidCount += 1;
+  }
+
+  console.log(`   Loaded ${fmt(records.length)} entries`);
+  if (invalidCount > 0) {
+    console.warn(`   ⚠️  Skipped ${fmt(invalidCount)} malformed entries`);
+  }
+
+  // 1. Bulk fetch existing translations for this language
+  console.log(`   🔍 Fetching existing translations from DB...`);
+  const existingTranslations = await db
+    .select({ id: translations.id, text: translations.text, pos: terms.pos })
+    .from(translations)
+    .innerJoin(terms, eq(translations.term_id, terms.id))
+    .where(eq(translations.language_code, language));
+
+  // 2. Build lookup map: "lowercase_word|pos" -> translation IDs
+  const translationMap = new Map<string, string[]>();
+  for (const t of existingTranslations) {
+    const key = `${t.text.toLowerCase()}|${t.pos}`;
+    const bucket = translationMap.get(key);
+    if (bucket) bucket.push(t.id);
+    else translationMap.set(key, [t.id]);
+  }
+
+  // 3. Resolve exactly one target (cefr, difficulty) per translation ID.
+  //    Later records overwrite earlier ones, so the outcome no longer depends
+  //    on the order in which value groups happen to be created.
+  const targetById = new Map<string, { cefr: CEFRLevel; difficulty: Difficulty }>();
+  const unmatchedWords: Array<{ word: string; pos: POS; cefr: CEFRLevel }> = [];
+  let conflictCount = 0;
+
+  for (const rec of records) {
+    const key = `${rec.word.toLowerCase()}|${rec.pos}`;
+    const ids = translationMap.get(key);
+
+    if (!ids || ids.length === 0) {
+      unmatchedWords.push({ word: rec.word, pos: rec.pos, cefr: rec.cefr });
+      continue;
+    }
+
+    for (const id of ids) {
+      const previous = targetById.get(id);
+      if (
+        previous &&
+        (previous.cefr !== rec.cefr || previous.difficulty !== rec.difficulty)
+      ) {
+        conflictCount += 1;
+      }
+      targetById.set(id, { cefr: rec.cefr, difficulty: rec.difficulty });
+    }
+  }
+
+  // 4. Group IDs by target value so each distinct (cefr, difficulty) pair
+  //    needs only a handful of UPDATE statements. The typed values travel with
+  //    the group, so the string key is never parsed back.
+  const groups = new Map<
+    string,
+    { cefr: CEFRLevel; difficulty: Difficulty; ids: string[] }
+  >();
+  for (const [id, target] of targetById) {
+    const valueKey = `${target.cefr}|${target.difficulty}`;
+    const group = groups.get(valueKey);
+    if (group) group.ids.push(id);
+    else groups.set(valueKey, { ...target, ids: [id] });
+  }
+
+  // 5. Batched updates; every ID belongs to exactly one group, so the counter
+  //    equals the number of distinct translations written.
+  let totalUpdated = 0;
+  for (const { cefr, difficulty, ids } of groups.values()) {
+    for (const idBatch of chunk(ids, BATCH_SIZE)) {
+      await db
+        .update(translations)
+        .set({ cefr_level: cefr, difficulty })
+        .where(inArray(translations.id, idBatch));
+      totalUpdated += idBatch.length;
+    }
+  }
+
+  // 6. Summary
+  console.log(`\n   ✅ Updated ${fmt(totalUpdated)} translations`);
+  console.log(`   ⚠️  Unmatched: ${fmt(unmatchedWords.length)}`);
+  if (conflictCount > 0) {
+    console.warn(
+      `   ⚠️  Conflicting values for the same translation: ${fmt(conflictCount)} (last entry in file wins)`,
+    );
+  }
+
+  if (unmatchedWords.length > 0) {
+    console.log(`\n   Sample unmatched words (first 20):`);
+    for (const { word, pos, cefr } of unmatchedWords.slice(0, 20)) {
+      console.log(`      "${word}" (${pos}, ${cefr})`);
+    }
+    if (unmatchedWords.length > 20) {
+      console.log(`      ... and ${fmt(unmatchedWords.length - 20)} more`);
+    }
+  }
+}
+
+// ────────────────────────────────────────────────────────────
+// Main
+// ────────────────────────────────────────────────────────────
+
+/**
+ * Script entry point: prints a banner, enriches every supported language one
+ * after another (never in parallel), then prints a closing banner.
+ */
+async function main(): Promise<void> {
+  console.log("##########################################");
+  console.log("Glossa — CEFR Enrichment");
+  console.log("##########################################\n");
+
+  // Sequential on purpose: each language is awaited before the next starts.
+  for (const languageCode of SUPPORTED_LANGUAGE_CODES) {
+    await enrichLanguage(languageCode);
+  }
+
+  console.log("\n##########################################");
+  console.log("Done");
+  console.log("##########################################");
+}
+
+// Run immediately; any rejection is printed and turned into exit code 1.
+main().catch((failure) => {
+  console.error(failure);
+  process.exit(1);
+});