- Update all package names from @glossa/* to @lila/* - Update all imports, container names, volume names - Update documentation references - Recreate database with new credentials
148 lines
5.4 KiB
TypeScript
148 lines
5.4 KiB
TypeScript
import fs from "node:fs/promises";
|
|
import { eq, inArray } from "drizzle-orm";
|
|
|
|
import {
|
|
SUPPORTED_LANGUAGE_CODES,
|
|
SUPPORTED_POS,
|
|
CEFR_LEVELS,
|
|
DIFFICULTY_LEVELS,
|
|
} from "@lila/shared";
|
|
import { db } from "@lila/db";
|
|
import { translations, terms } from "@lila/db/schema";
|
|
|
|
/** One member of the `SUPPORTED_POS` tuple exported by `@lila/shared`. */
type POS = (typeof SUPPORTED_POS)[number];

/** One member of the `SUPPORTED_LANGUAGE_CODES` tuple exported by `@lila/shared`. */
type LanguageCode = (typeof SUPPORTED_LANGUAGE_CODES)[number];

/** One member of the `CEFR_LEVELS` tuple exported by `@lila/shared`. */
type CEFRLevel = (typeof CEFR_LEVELS)[number];

/** One member of the `DIFFICULTY_LEVELS` tuple exported by `@lila/shared`. */
type Difficulty = (typeof DIFFICULTY_LEVELS)[number];
|
|
|
|
/**
 * Shape of one entry in a `<language>-merged.json` data file, as consumed by
 * this script.
 */
type MergedRecord = {
  /** Word text; compared case-insensitively against `translations.text`. */
  word: string;
  /** Part of speech; combined with `word` to form the lookup key. */
  pos: POS;
  /** Value written to `translations.cefr_level` for matching rows. */
  cefr: CEFRLevel;
  /** Value written to `translations.difficulty` for matching rows. */
  difficulty: Difficulty;
  /** Present in the data file but not read anywhere in this script. */
  sources: string[];
};
|
|
|
|
/**
 * Directory holding the `<language>-merged.json` files. The filename is
 * appended by plain string concatenation, so the trailing slash is required.
 */
const dataDir = "./src/data/";

/** Maximum number of translation IDs sent in a single UPDATE statement. */
const BATCH_SIZE = 500;
|
|
|
|
// ────────────────────────────────────────────────────────────
|
|
// Helpers
|
|
// ────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Splits `arr` into consecutive slices of at most `size` elements.
 *
 * @param arr - Items to split; the array is not mutated.
 * @param size - Maximum slice length; must be a positive integer.
 * @returns The slices in original order. The last slice may be shorter than
 *   `size`; an empty input yields an empty result.
 * @throws RangeError if `size` is not a positive integer. Without this guard a
 *   zero or negative size would never advance the loop index (infinite loop),
 *   and `NaN` would silently return `[[]]`, dropping every element.
 */
function chunk<T>(arr: readonly T[], size: number): T[][] {
  if (!Number.isInteger(size) || size <= 0) {
    throw new RangeError(`chunk size must be a positive integer, got ${size}`);
  }

  const out: T[][] = [];
  for (let i = 0; i < arr.length; i += size) {
    out.push(arr.slice(i, i + size));
  }
  return out;
}
|
|
|
|
/**
 * Renders a number with en-US locale formatting for log output.
 *
 * @param n - Number to render.
 * @returns The en-US formatted string (e.g. `1234567` becomes `"1,234,567"`).
 */
function fmt(n: number): string {
  const usFormatter = new Intl.NumberFormat("en-US");
  return usFormatter.format(n);
}
|
|
|
|
// ────────────────────────────────────────────────────────────
|
|
// Enrichment per language
|
|
// ────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Applies the CEFR level and difficulty from `<language>-merged.json` to the
 * matching translation rows of one language.
 *
 * Steps:
 *  1. Read and parse the merged JSON file under `dataDir`.
 *  2. Load every translation of `language`, joined with its term's POS.
 *  3. Match records to translation IDs through a case-insensitive
 *     `word|pos` key and group the IDs by target (cefr, difficulty).
 *  4. Update the rows in batches of at most `BATCH_SIZE` IDs.
 *  5. Log the totals and a sample of the unmatched records.
 *
 * A file that is missing, unreadable, not valid JSON, or whose top-level value
 * is not an array is reported with a warning and the language is skipped.
 * Database errors are not caught here and reject the returned promise.
 *
 * @param language - Language code whose translations are enriched.
 */
async function enrichLanguage(language: LanguageCode): Promise<void> {
  const filename = `${language}-merged.json`;
  const filepath = dataDir + filename;

  console.log(`\n📝 Enriching ${filename}...`);

  let records: MergedRecord[];
  try {
    const raw = await fs.readFile(filepath, "utf8");
    const parsed: unknown = JSON.parse(raw);
    // Only the top-level shape is checked; a non-array would otherwise crash
    // the whole run later when the records are iterated.
    if (!Array.isArray(parsed)) {
      throw new TypeError("expected a JSON array of merged records");
    }
    records = parsed as MergedRecord[];
  } catch (e) {
    // Anything can be thrown, so narrow instead of asserting `Error`.
    const reason = e instanceof Error ? e.message : String(e);
    console.warn(` ⚠️ Could not read file: ${reason}`);
    return;
  }

  console.log(` Loaded ${fmt(records.length)} entries`);

  // 1. Bulk fetch existing translations for this language
  console.log(` 🔍 Fetching existing translations from DB...`);
  const existingTranslations = await db
    .select({ id: translations.id, text: translations.text, pos: terms.pos })
    .from(translations)
    .innerJoin(terms, eq(translations.term_id, terms.id))
    .where(eq(translations.language_code, language));

  // 2. Build lookup map: "lowercase_word|pos" -> translation IDs
  const translationMap = new Map<string, string[]>();
  for (const t of existingTranslations) {
    const key = `${t.text.toLowerCase()}|${t.pos}`;
    let idsForKey = translationMap.get(key);
    if (idsForKey === undefined) {
      idsForKey = [];
      translationMap.set(key, idsForKey);
    }
    idsForKey.push(t.id);
  }

  // 3. Match records to DB IDs and group by target (cefr, difficulty).
  //    Each group keeps its typed values next to the IDs so the key never has
  //    to be parsed back; the Set drops duplicate IDs while keeping
  //    first-seen order.
  type UpdateGroup = {
    cefr: CEFRLevel;
    difficulty: Difficulty;
    ids: Set<string>;
  };
  const updatesByValue = new Map<string, UpdateGroup>();
  const unmatchedWords: Array<{ word: string; pos: POS; cefr: CEFRLevel }> = [];

  for (const rec of records) {
    const key = `${rec.word.toLowerCase()}|${rec.pos}`;
    const ids = translationMap.get(key);

    if (ids === undefined || ids.length === 0) {
      unmatchedWords.push({ word: rec.word, pos: rec.pos, cefr: rec.cefr });
      continue;
    }

    const valueKey = `${rec.cefr}|${rec.difficulty}`;
    let group = updatesByValue.get(valueKey);
    if (group === undefined) {
      group = { cefr: rec.cefr, difficulty: rec.difficulty, ids: new Set() };
      updatesByValue.set(valueKey, group);
    }
    for (const id of ids) group.ids.add(id);
  }

  // 4. Batch updates grouped by (cefr, difficulty)
  let totalUpdated = 0;
  for (const { cefr, difficulty, ids } of updatesByValue.values()) {
    for (const idBatch of chunk([...ids], BATCH_SIZE)) {
      await db
        .update(translations)
        .set({ cefr_level: cefr, difficulty })
        .where(inArray(translations.id, idBatch));
      // Counts the IDs submitted, not the rows the database reports as changed.
      totalUpdated += idBatch.length;
    }
  }

  // 5. Summary
  console.log(`\n ✅ Updated ${fmt(totalUpdated)} translations`);
  console.log(` ⚠️ Unmatched: ${fmt(unmatchedWords.length)}`);

  const sampleLimit = 20;
  if (unmatchedWords.length > 0) {
    console.log(`\n Sample unmatched words (first ${sampleLimit}):`);
    for (const { word, pos, cefr } of unmatchedWords.slice(0, sampleLimit)) {
      console.log(` "${word}" (${pos}, ${cefr})`);
    }
    if (unmatchedWords.length > sampleLimit) {
      console.log(` ... and ${fmt(unmatchedWords.length - sampleLimit)} more`);
    }
  }
}
|
|
|
|
// ────────────────────────────────────────────────────────────
|
|
// Main
|
|
// ────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Script entry point: prints a header banner, enriches every supported
 * language, then prints a completion banner.
 */
const main = async () => {
  const header = [
    "##########################################",
    "lila — CEFR Enrichment",
    "##########################################\n",
  ];
  for (const line of header) {
    console.log(line);
  }

  // Languages are handled one at a time; each run is awaited before the next.
  for (const languageCode of SUPPORTED_LANGUAGE_CODES) {
    await enrichLanguage(languageCode);
  }

  const footer = [
    "\n##########################################",
    "Done",
    "##########################################",
  ];
  for (const line of footer) {
    console.log(line);
  }
};

// Any rejection from main is printed and the process exits with status 1.
main().catch((error) => {
  console.error(error);
  process.exit(1);
});
|