feat: migrate production schema from OMW to Kaikki flat vocabulary model

- Replace terms/translations/term_glosses/term_examples with vocabulary_entries
  and entry_translations
- Remove decks, topics and related tables (deferred)
- Add cefr_level and difficulty to entry_translations for game query filtering
- Update termModel.ts for new schema — getDistractors now takes sourceLanguage
- Update gameService.ts and multiplayerGameService.ts for entryId rename
- Update all test fixtures from termId to entryId
- Generate and apply migration 0011
This commit is contained in:
lila 2026-05-05 17:39:25 +02:00
parent 38d8b85228
commit 963bff4eb8
10 changed files with 949 additions and 215 deletions

View file

@ -1,25 +1,27 @@
import { db } from "@lila/db";
import { eq, and, isNotNull, sql, ne } from "drizzle-orm";
import { terms, translations, term_glosses } from "@lila/db/schema";
import { eq, and, ne, sql, isNotNull } from "drizzle-orm";
import { vocabulary_entries, entry_translations } from "@lila/db/schema";
import { alias } from "drizzle-orm/pg-core";
import type {
SupportedLanguageCode,
SupportedPos,
DifficultyLevel,
} from "@lila/shared";
// ── Types ─────────────────────────────────────────────────────────────────────
export type TranslationPairRow = {
termId: string;
entryId: string;
sourceText: string;
targetText: string;
sourceGloss: string | null;
};
// Note: difficulty filter is intentionally asymmetric. We filter on the target
// (answer) side only — a word can be A2 in Italian but B1 in English, and what
// matters for the learner is the difficulty of the word they're being taught.
// ── Queries ───────────────────────────────────────────────────────────────────
// Note: difficulty is intentionally matched on the target (translation) side
// only; source entries just need a non-null difficulty. A word can be A2 in one
// language but B1 in another — what matters for the learner is the difficulty
// of the word they are being tested on.
export const getGameTerms = async (
sourceLanguage: SupportedLanguageCode,
targetLanguage: SupportedLanguageCode,
@ -27,53 +29,36 @@ export const getGameTerms = async (
difficulty: DifficultyLevel,
rounds: number,
): Promise<TranslationPairRow[]> => {
const sourceTranslations = alias(translations, "source_translations");
const targetTranslations = alias(translations, "target_translations");
const sourceEntries = alias(vocabulary_entries, "source_entries");
const targetTranslations = alias(entry_translations, "target_translations");
const rows = await db
.select({
termId: terms.id,
sourceText: sourceTranslations.text,
targetText: targetTranslations.text,
sourceGloss: term_glosses.text,
entryId: sourceEntries.id,
sourceText: sourceEntries.headword,
targetText: targetTranslations.translation,
sourceGloss: sourceEntries.gloss,
})
.from(terms)
.innerJoin(
sourceTranslations,
and(
eq(sourceTranslations.term_id, terms.id),
eq(sourceTranslations.language_code, sourceLanguage), // Filter here!
),
)
.from(sourceEntries)
.innerJoin(
targetTranslations,
and(
eq(targetTranslations.term_id, terms.id),
eq(targetTranslations.language_code, targetLanguage), // Filter here!
),
)
.leftJoin(
term_glosses,
and(
eq(term_glosses.term_id, terms.id),
eq(term_glosses.language_code, sourceLanguage),
eq(targetTranslations.entry_id, sourceEntries.id),
eq(targetTranslations.target_language_code, targetLanguage),
eq(targetTranslations.difficulty, difficulty),
isNotNull(targetTranslations.translation),
),
)
.where(
and(
eq(terms.pos, pos),
eq(targetTranslations.difficulty, difficulty),
isNotNull(sourceTranslations.difficulty), // Good data quality check!
eq(sourceEntries.language_code, sourceLanguage),
eq(sourceEntries.pos, pos),
isNotNull(sourceEntries.difficulty),
),
)
// TODO(post-mvp): ORDER BY RANDOM() sorts the entire filtered result set before
// applying LIMIT, which is fine at current data volumes (low thousands of rows
// after POS + difficulty filters) but degrades as the terms table grows. Once
// the database is fully populated and tagged, replace with one of:
// - TABLESAMPLE BERNOULLI(n) for approximate sampling on large tables
// - Random offset: SELECT ... OFFSET floor(random() * (SELECT count(*) ...))
// - Pre-computed random column with a btree index, reshuffled periodically
// Benchmark first — don't optimise until it actually hurts.
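// TODO(post-mvp): ORDER BY RANDOM() sorts the entire filtered result set
// before applying LIMIT, which is fine at current data volumes but degrades
// as the table grows. Options (detailed in the pre-migration termModel.ts in
// git history): TABLESAMPLE BERNOULLI, a random OFFSET, or a pre-computed
// indexed random column. Benchmark first — don't optimise until it hurts.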
.orderBy(sql`RANDOM()`)
.limit(rounds);
@ -81,32 +66,33 @@ export const getGameTerms = async (
};
export const getDistractors = async (
excludeTermId: string,
excludeEntryId: string,
excludeText: string,
sourceLanguage: SupportedLanguageCode,
targetLanguage: SupportedLanguageCode,
pos: SupportedPos,
difficulty: DifficultyLevel,
count: number,
): Promise<string[]> => {
const rows = await db
.select({ text: translations.text })
.from(terms)
.select({ text: entry_translations.translation })
.from(vocabulary_entries)
.innerJoin(
translations,
entry_translations,
and(
eq(translations.term_id, terms.id),
eq(translations.language_code, targetLanguage),
eq(entry_translations.entry_id, vocabulary_entries.id),
eq(entry_translations.target_language_code, targetLanguage),
eq(entry_translations.difficulty, difficulty),
),
)
.where(
and(
eq(terms.pos, pos),
eq(translations.difficulty, difficulty),
ne(terms.id, excludeTermId),
ne(translations.text, excludeText),
eq(vocabulary_entries.language_code, sourceLanguage),
eq(vocabulary_entries.pos, pos),
ne(vocabulary_entries.id, excludeEntryId),
ne(entry_translations.translation, excludeText),
),
)
// TODO(post-mvp): same ORDER BY RANDOM() concern as getGameTerms — see comment there.
.orderBy(sql`RANDOM()`)
.limit(count);