adding script to check cefr coverage between json files and database, adding script to write cefr levels from json to db
This commit is contained in:
parent
3374bd8b20
commit
13cc709b09
7 changed files with 279296 additions and 1 deletions
183
packages/db/src/checking-cefr-coverage.ts
Normal file
183
packages/db/src/checking-cefr-coverage.ts
Normal file
|
|
@ -0,0 +1,183 @@
|
|||
/*
|
||||
|
||||
This script performs a cross-reference check between two specific data sets:
|
||||
|
||||
- The "Target" List: It reads the {language}-merged.json file (e.g., en-merged.json). This represents the vocabulary you want to have CEFR levels for.
|
||||
- The "Source of Truth": It queries your database (the translations table joined to terms, filtered to the language being checked). This represents the vocabulary you currently have in your app.
|
||||
|
||||
What it calculates:
|
||||
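It tells you: "Of all the words in my merged JSON file, how many actually exist in my database?" A word counts as existing when its lowercased text and its part of speech both match a translation row.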
|
||||
|
||||
Matched: The word from the JSON file was found in the DB. (Ready for enrichment).
|
||||
Unmatched: The word from the JSON file was not found in the DB. (These will be skipped during enrichment).
|
||||
|
||||
*/
|
||||
|
||||
import fs from "node:fs/promises";
|
||||
import { eq } from "drizzle-orm";
|
||||
|
||||
import {
|
||||
SUPPORTED_LANGUAGE_CODES,
|
||||
SUPPORTED_POS,
|
||||
CEFR_LEVELS,
|
||||
DIFFICULTY_LEVELS,
|
||||
} from "@glossa/shared";
|
||||
import { db } from "@glossa/db";
|
||||
import { terms, translations } from "@glossa/db/schema";
|
||||
|
||||
type POS = (typeof SUPPORTED_POS)[number];
|
||||
type LanguageCode = (typeof SUPPORTED_LANGUAGE_CODES)[number];
|
||||
type CEFRLevel = (typeof CEFR_LEVELS)[number];
|
||||
type Difficulty = (typeof DIFFICULTY_LEVELS)[number];
|
||||
|
||||
type MergedRecord = {
|
||||
word: string;
|
||||
pos: POS;
|
||||
cefr: CEFRLevel;
|
||||
difficulty: Difficulty;
|
||||
sources: string[];
|
||||
};
|
||||
|
||||
type CoverageStats = {
|
||||
total: number;
|
||||
matched: number;
|
||||
unmatched: number;
|
||||
byCefr: Record<CEFRLevel, { total: number; matched: number }>;
|
||||
byDifficulty: Record<Difficulty, { total: number; matched: number }>;
|
||||
unmatchedWords: Array<{ word: string; pos: POS; cefr: CEFRLevel }>;
|
||||
};
|
||||
|
||||
const dataDir = "./src/data/";
|
||||
|
||||
/**
 * Reports how many entries of `{language}-merged.json` have a matching
 * translation row in the database, and prints the breakdown to the console.
 *
 * A JSON entry is "matched" when a translation row for `language` exists whose
 * lowercased text and term part of speech equal the entry's lowercased word
 * and `pos`. Nothing is written to the database.
 *
 * The JSON file is resolved relative to `dataDir`, which is itself a relative
 * path, so the result depends on the directory the script is started from.
 *
 * @param language - Language code selecting both the JSON file and the
 *   translation rows to compare against.
 * @returns Resolves once the report has been printed. A missing, unreadable
 *   or non-array JSON file is reported as a warning and skipped, not thrown.
 */
async function checkCoverage(language: LanguageCode): Promise<void> {
  // Maximum number of unmatched entries kept for the sample listing.
  const SAMPLE_LIMIT = 20;

  const filename = `${language}-merged.json`;
  const filepath = dataDir + filename;

  console.log(`\n📄 Checking ${filename}...`);

  // Load merged data
  let records: MergedRecord[];
  try {
    const raw = await fs.readFile(filepath, "utf8");
    const parsed: unknown = JSON.parse(raw);
    if (!Array.isArray(parsed)) {
      throw new Error("expected a JSON array of records");
    }
    records = parsed as MergedRecord[];
  } catch (e) {
    // Anything can be thrown, so narrow before reading `.message`.
    const reason = e instanceof Error ? e.message : String(e);
    console.warn(` ⚠️ Could not read file: ${reason}`);
    return;
  }

  console.log(` Loaded ${records.length.toLocaleString("en-US")} entries`);

  // Initialize stats
  const stats: CoverageStats = {
    total: records.length,
    matched: 0,
    unmatched: 0,
    byCefr: {} as Record<CEFRLevel, { total: number; matched: number }>,
    byDifficulty: {} as Record<Difficulty, { total: number; matched: number }>,
    unmatchedWords: [],
  };

  for (const level of CEFR_LEVELS) {
    stats.byCefr[level] = { total: 0, matched: 0 };
  }
  for (const diff of DIFFICULTY_LEVELS) {
    stats.byDifficulty[diff] = { total: 0, matched: 0 };
  }

  // The file content is unvalidated JSON: only count records whose fields can
  // be used safely (a string word and a known CEFR level / difficulty).
  const isUsable = (r: MergedRecord): boolean =>
    r != null &&
    typeof r.word === "string" &&
    CEFR_LEVELS.includes(r.cefr) &&
    DIFFICULTY_LEVELS.includes(r.difficulty);

  // Percentage with one decimal; an empty denominator yields "0.0", not "NaN".
  const pct = (part: number, whole: number): string =>
    whole === 0 ? "0.0" : ((part / whole) * 100).toFixed(1);

  // ── BATCHED LOOKUP: Build a Set of existing (word, pos) pairs in DB ──
  console.log(` 🔍 Querying database for existing translations...`);

  // One query for every translation of this language, with its term's POS.
  const existingRows = await db
    .select({ text: translations.text, pos: terms.pos })
    .from(translations)
    .innerJoin(terms, eq(translations.term_id, terms.id))
    .where(eq(translations.language_code, language));

  // Set of "lowercased word|pos" keys for constant-time membership checks.
  const existingSet = new Set(
    existingRows.map((row) => `${row.text.toLowerCase()}|${row.pos}`),
  );

  // ── Process records against the in-memory Set ──
  let skipped = 0;
  for (const record of records) {
    if (!isUsable(record)) {
      skipped++;
      continue;
    }

    stats.byCefr[record.cefr].total++;
    stats.byDifficulty[record.difficulty].total++;

    const key = `${record.word.toLowerCase()}|${record.pos}`;

    if (existingSet.has(key)) {
      stats.matched++;
      stats.byCefr[record.cefr].matched++;
      stats.byDifficulty[record.difficulty].matched++;
    } else {
      stats.unmatched++;
      if (stats.unmatchedWords.length < SAMPLE_LIMIT) {
        stats.unmatchedWords.push({
          word: record.word,
          pos: record.pos,
          cefr: record.cefr,
        });
      }
    }
  }

  // ── Print results ──
  console.log(`\n📊 Coverage for ${language}:`);
  console.log(` Total entries: ${stats.total.toLocaleString("en-US")}`);
  console.log(
    ` Matched in DB: ${stats.matched.toLocaleString("en-US")} (${pct(stats.matched, stats.total)}%)`,
  );
  console.log(
    ` Unmatched: ${stats.unmatched.toLocaleString("en-US")} (${pct(stats.unmatched, stats.total)}%)`,
  );
  if (skipped > 0) {
    console.log(
      ` Skipped (invalid record): ${skipped.toLocaleString("en-US")}`,
    );
  }

  console.log(`\n By CEFR level:`);
  for (const level of CEFR_LEVELS) {
    const { total, matched } = stats.byCefr[level];
    if (total > 0) {
      console.log(
        ` ${level}: ${matched.toLocaleString("en-US")}/${total.toLocaleString("en-US")} (${pct(matched, total)}%)`,
      );
    }
  }

  console.log(`\n By difficulty:`);
  for (const diff of DIFFICULTY_LEVELS) {
    const { total, matched } = stats.byDifficulty[diff];
    if (total > 0) {
      console.log(
        ` ${diff}: ${matched.toLocaleString("en-US")}/${total.toLocaleString("en-US")} (${pct(matched, total)}%)`,
      );
    }
  }

  if (stats.unmatchedWords.length > 0) {
    console.log(`\n⚠️ Sample unmatched words (first ${SAMPLE_LIMIT}):`);
    for (const { word, pos, cefr } of stats.unmatchedWords) {
      console.log(` "${word}" (${pos}, ${cefr})`);
    }
    if (stats.unmatched > SAMPLE_LIMIT) {
      console.log(` ... and ${stats.unmatched - SAMPLE_LIMIT} more`);
    }
  }
}
|
||||
|
||||
/** Prints a banner, then checks coverage for each supported language in turn. */
const main = async () => {
  const header = [
    "##########################################",
    "Glossa — CEFR Coverage Check",
    "##########################################",
  ];
  for (const line of header) console.log(line);

  // Languages are processed one after another so their reports do not interleave.
  for (const code of SUPPORTED_LANGUAGE_CODES) {
    await checkCoverage(code);
  }

  const footer = [
    "\n##########################################",
    "Done",
    "##########################################",
  ];
  for (const line of footer) console.log(line);
};

// Script entry point: log any failure and exit with a non-zero status.
main().catch((failure) => {
  console.error(failure);
  process.exit(1);
});
|
||||
165296
packages/db/src/data/en-merged.json
Normal file
165296
packages/db/src/data/en-merged.json
Normal file
File diff suppressed because it is too large
Load diff
113668
packages/db/src/data/it-merged.json
Normal file
113668
packages/db/src/data/it-merged.json
Normal file
File diff suppressed because it is too large
Load diff
148
packages/db/src/seeding-cefr-levels.ts
Normal file
148
packages/db/src/seeding-cefr-levels.ts
Normal file
|
|
@ -0,0 +1,148 @@
|
|||
import fs from "node:fs/promises";
|
||||
import { eq, inArray } from "drizzle-orm";
|
||||
|
||||
import {
|
||||
SUPPORTED_LANGUAGE_CODES,
|
||||
SUPPORTED_POS,
|
||||
CEFR_LEVELS,
|
||||
DIFFICULTY_LEVELS,
|
||||
} from "@glossa/shared";
|
||||
import { db } from "@glossa/db";
|
||||
import { translations, terms } from "@glossa/db/schema";
|
||||
|
||||
type POS = (typeof SUPPORTED_POS)[number];
|
||||
type LanguageCode = (typeof SUPPORTED_LANGUAGE_CODES)[number];
|
||||
type CEFRLevel = (typeof CEFR_LEVELS)[number];
|
||||
type Difficulty = (typeof DIFFICULTY_LEVELS)[number];
|
||||
|
||||
type MergedRecord = {
|
||||
word: string;
|
||||
pos: POS;
|
||||
cefr: CEFRLevel;
|
||||
difficulty: Difficulty;
|
||||
sources: string[];
|
||||
};
|
||||
|
||||
const dataDir = "./src/data/";
|
||||
const BATCH_SIZE = 500;
|
||||
|
||||
// ────────────────────────────────────────────────────────────
|
||||
// Helpers
|
||||
// ────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Splits an array into consecutive slices of at most `size` elements.
 *
 * @param arr - Items to partition; the array itself is not modified.
 * @param size - Maximum length of each slice; must be a positive integer.
 * @returns The slices in their original order, or an empty array when `arr`
 *   is empty. Only the last slice may be shorter than `size`.
 * @throws RangeError when `size` is not a positive integer. Without this
 *   check a zero or negative size never advances the loop, and `NaN` returns
 *   a single empty slice, silently dropping every element.
 */
function chunk<T>(arr: T[], size: number): T[][] {
  if (!Number.isInteger(size) || size <= 0) {
    throw new RangeError(`chunk size must be a positive integer, got ${size}`);
  }

  const out: T[][] = [];
  for (let i = 0; i < arr.length; i += size) {
    out.push(arr.slice(i, i + size));
  }
  return out;
}
|
||||
|
||||
/**
 * Formats a number for console output using en-US conventions
 * (for example, comma-separated thousands).
 */
function fmt(n: number): string {
  const enUs = new Intl.NumberFormat("en-US");
  return enUs.format(n);
}
|
||||
|
||||
// ────────────────────────────────────────────────────────────
|
||||
// Enrichment per language
|
||||
// ────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Copies CEFR level and difficulty from `{language}-merged.json` onto the
 * matching translation rows in the database.
 *
 * A JSON entry matches every translation row for `language` whose lowercased
 * text and term part of speech equal the entry's lowercased word and `pos`.
 * Matching is case-insensitive, so two entries (e.g. "March" and "march") can
 * hit the same rows. Each row is assigned exactly once: the first entry in
 * file order wins, and later entries asking for a different value are counted
 * as conflicts instead of overwriting it. This keeps the final value
 * independent of update order and keeps the "updated" count equal to the
 * number of distinct rows written.
 *
 * Updates are issued in batches of `BATCH_SIZE` ids, one group per
 * (cefr, difficulty) pair. They are not wrapped in a transaction here, so a
 * failure part-way through leaves earlier batches applied.
 *
 * @param language - Language code selecting both the JSON file and the
 *   translation rows to update.
 * @returns Resolves once all batches have run and the summary is printed. A
 *   missing, unreadable or non-array JSON file is reported as a warning and
 *   skipped, not thrown.
 */
async function enrichLanguage(language: LanguageCode): Promise<void> {
  // Maximum number of unmatched entries shown in the summary.
  const SAMPLE_LIMIT = 20;

  const filename = `${language}-merged.json`;
  const filepath = dataDir + filename;

  console.log(`\n📝 Enriching ${filename}...`);

  let records: MergedRecord[];
  try {
    const raw = await fs.readFile(filepath, "utf8");
    const parsed: unknown = JSON.parse(raw);
    if (!Array.isArray(parsed)) {
      throw new Error("expected a JSON array of records");
    }
    records = parsed as MergedRecord[];
  } catch (e) {
    // Anything can be thrown, so narrow before reading `.message`.
    const reason = e instanceof Error ? e.message : String(e);
    console.warn(` ⚠️ Could not read file: ${reason}`);
    return;
  }

  console.log(` Loaded ${fmt(records.length)} entries`);

  // 1. Bulk fetch existing translations for this language
  console.log(` 🔍 Fetching existing translations from DB...`);
  const existingTranslations = await db
    .select({ id: translations.id, text: translations.text, pos: terms.pos })
    .from(translations)
    .innerJoin(terms, eq(translations.term_id, terms.id))
    .where(eq(translations.language_code, language));

  // 2. Build lookup map: "lowercase_word|pos" -> translation IDs
  const translationMap = new Map<string, string[]>();
  for (const t of existingTranslations) {
    const key = `${t.text.toLowerCase()}|${t.pos}`;
    const bucket = translationMap.get(key);
    if (bucket) {
      bucket.push(t.id);
    } else {
      translationMap.set(key, [t.id]);
    }
  }

  // 3. Match records to DB IDs; give each ID a single (cefr, difficulty) target
  const targetById = new Map<string, string>();
  const groups = new Map<
    string,
    { cefr: CEFRLevel; difficulty: Difficulty; ids: string[] }
  >();
  const unmatchedWords: Array<{ word: string; pos: POS; cefr: CEFRLevel }> = [];
  let conflicts = 0;
  let skipped = 0;

  for (const rec of records) {
    // The file content is unvalidated JSON: never write values outside the
    // known CEFR / difficulty lists, and never call string methods on a
    // non-string word.
    const usable =
      rec != null &&
      typeof rec.word === "string" &&
      CEFR_LEVELS.includes(rec.cefr) &&
      DIFFICULTY_LEVELS.includes(rec.difficulty);
    if (!usable) {
      skipped++;
      continue;
    }

    const ids = translationMap.get(`${rec.word.toLowerCase()}|${rec.pos}`);
    if (!ids || ids.length === 0) {
      unmatchedWords.push({ word: rec.word, pos: rec.pos, cefr: rec.cefr });
      continue;
    }

    const valueKey = `${rec.cefr}|${rec.difficulty}`;
    for (const id of ids) {
      const prior = targetById.get(id);
      if (prior !== undefined) {
        // Already claimed by an earlier entry; only a differing value is a conflict.
        if (prior !== valueKey) conflicts++;
        continue;
      }
      targetById.set(id, valueKey);

      let group = groups.get(valueKey);
      if (!group) {
        group = { cefr: rec.cefr, difficulty: rec.difficulty, ids: [] };
        groups.set(valueKey, group);
      }
      group.ids.push(id);
    }
  }

  // 4. Batch updates grouped by (cefr, difficulty)
  let totalUpdated = 0;
  for (const { cefr, difficulty, ids } of groups.values()) {
    for (const idBatch of chunk(ids, BATCH_SIZE)) {
      await db
        .update(translations)
        .set({ cefr_level: cefr, difficulty })
        .where(inArray(translations.id, idBatch));
      totalUpdated += idBatch.length;
    }
  }

  // 5. Summary
  console.log(`\n ✅ Updated ${fmt(totalUpdated)} translations`);
  console.log(` ⚠️ Unmatched: ${fmt(unmatchedWords.length)}`);
  if (conflicts > 0) {
    console.log(
      ` ⚠️ Conflicting assignments ignored (first entry kept): ${fmt(conflicts)}`,
    );
  }
  if (skipped > 0) {
    console.log(` ⚠️ Skipped (invalid record): ${fmt(skipped)}`);
  }

  if (unmatchedWords.length > 0) {
    console.log(`\n Sample unmatched words (first ${SAMPLE_LIMIT}):`);
    for (const { word, pos, cefr } of unmatchedWords.slice(0, SAMPLE_LIMIT)) {
      console.log(` "${word}" (${pos}, ${cefr})`);
    }
    if (unmatchedWords.length > SAMPLE_LIMIT) {
      console.log(` ... and ${fmt(unmatchedWords.length - SAMPLE_LIMIT)} more`);
    }
  }
}
|
||||
|
||||
// ────────────────────────────────────────────────────────────
|
||||
// Main
|
||||
// ────────────────────────────────────────────────────────────
|
||||
|
||||
/** Prints a banner, then enriches each supported language in turn. */
const main = async () => {
  const header = [
    "##########################################",
    "Glossa — CEFR Enrichment",
    "##########################################\n",
  ];
  for (const line of header) console.log(line);

  // Languages are processed one after another so their output does not interleave.
  for (const code of SUPPORTED_LANGUAGE_CODES) {
    await enrichLanguage(code);
  }

  const footer = [
    "\n##########################################",
    "Done",
    "##########################################",
  ];
  for (const line of footer) console.log(line);
};

// Script entry point: log any failure and exit with a non-zero status.
main().catch((failure) => {
  console.error(failure);
  process.exit(1);
});
|
||||
|
|
@ -15,7 +15,7 @@ type SynsetRecord = {
|
|||
glosses: Partial<Record<LanguageCode, string[]>>;
|
||||
};
|
||||
|
||||
const dataDir = "./src/data/datafiles/";
|
||||
const dataDir = "./src/data/";
|
||||
const BATCH_SIZE = 500;
|
||||
|
||||
// ────────────────────────────────────────────────────────────
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue