feat: migrate production schema from OMW to Kaikki flat vocabulary model

- Replace terms/translations/term_glosses/term_examples with vocabulary_entries
  and entry_translations
- Remove decks, topics and related tables (deferred)
- Add cefr_level and difficulty to entry_translations for game query filtering
- Update termModel.ts for new schema — getDistractors now takes sourceLanguage
- Update gameService.ts and multiplayerGameService.ts for entryId rename
- Update all test fixtures from termId to entryId
- Generate and apply migration 0011
This commit is contained in:
lila 2026-05-05 17:39:25 +02:00
parent 38d8b85228
commit 963bff4eb8
10 changed files with 949 additions and 215 deletions

View file

@ -10,6 +10,7 @@ import {
index,
boolean,
integer,
smallint,
} from "drizzle-orm/pg-core";
import { sql, relations } from "drizzle-orm";
@ -18,182 +19,100 @@ import {
SUPPORTED_POS,
SUPPORTED_LANGUAGE_CODES,
CEFR_LEVELS,
SUPPORTED_DECK_TYPES,
DIFFICULTY_LEVELS,
LOBBY_STATUSES,
} from "@lila/shared";
export const terms = pgTable(
"terms",
// ── Vocabulary ────────────────────────────────────────────────────────────────
export const vocabulary_entries = pgTable(
"vocabulary_entries",
{
id: uuid().primaryKey().defaultRandom(),
source: varchar({ length: 50 }), // 'omw', 'wiktionary', null for manual
source_id: text(), // synset_id value for omw, wiktionary QID, etc.
headword: text().notNull(),
language_code: varchar({ length: 10 }).notNull(),
pos: varchar({ length: 20 }).notNull(),
sense_index: smallint().notNull().default(0),
gloss: text(),
examples: text().array().notNull().default([]),
cefr_level: varchar({ length: 2 }),
difficulty: varchar({ length: 20 }),
source: varchar({ length: 50 }).notNull().default("kaikki"),
created_at: timestamp({ withTimezone: true }).defaultNow().notNull(),
},
(table) => [
unique("unique_entry").on(
table.headword,
table.language_code,
table.pos,
table.sense_index,
),
check(
"language_code_check",
sql`${table.language_code} IN (${sql.raw(SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", "))})`,
),
check(
"pos_check",
sql`${table.pos} IN (${sql.raw(SUPPORTED_POS.map((p) => `'${p}'`).join(", "))})`,
),
unique("unique_source_id").on(table.source, table.source_id),
index("idx_terms_source_pos").on(table.source, table.pos),
],
);
export const term_glosses = pgTable(
"term_glosses",
{
id: uuid().primaryKey().defaultRandom(),
term_id: uuid()
.notNull()
.references(() => terms.id, { onDelete: "cascade" }),
language_code: varchar({ length: 10 }).notNull(),
text: text().notNull(),
description: text(),
created_at: timestamp({ withTimezone: true }).defaultNow().notNull(),
},
(table) => [
unique("unique_term_gloss").on(table.term_id, table.language_code),
check(
"language_code_check",
sql`${table.language_code} IN (${sql.raw(SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", "))})`,
),
],
);
export const term_examples = pgTable(
"term_examples",
{
id: uuid().primaryKey().defaultRandom(),
term_id: uuid()
.notNull()
.references(() => terms.id, { onDelete: "cascade" }),
language_code: varchar({ length: 10 }).notNull(),
text: text().notNull(),
created_at: timestamp({ withTimezone: true }).defaultNow().notNull(),
},
(table) => [
unique("unique_term_example").on(
table.term_id,
table.language_code,
table.text,
),
check(
"language_code_check",
sql`${table.language_code} IN (${sql.raw(SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", "))})`,
),
index("idx_term_examples_term_id").on(table.term_id, table.language_code),
],
);
export const translations = pgTable(
"translations",
{
id: uuid().primaryKey().defaultRandom(),
term_id: uuid()
.notNull()
.references(() => terms.id, { onDelete: "cascade" }),
language_code: varchar({ length: 10 }).notNull(),
text: text().notNull(),
cefr_level: varchar({ length: 2 }),
difficulty: varchar({ length: 20 }),
created_at: timestamp({ withTimezone: true }).defaultNow().notNull(),
},
(table) => [
unique("unique_translations").on(
table.term_id,
table.language_code,
table.text,
),
check(
"language_code_check",
sql`${table.language_code} IN (${sql.raw(SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", "))})`,
),
check(
"cefr_check",
sql`${table.cefr_level} IN (${sql.raw(CEFR_LEVELS.map((l) => `'${l}'`).join(", "))})`,
sql`${table.cefr_level} IS NULL OR ${table.cefr_level} IN (${sql.raw(CEFR_LEVELS.map((l) => `'${l}'`).join(", "))})`,
),
check(
"difficulty_check",
sql`${table.difficulty} IN (${sql.raw(DIFFICULTY_LEVELS.map((d) => `'${d}'`).join(", "))})`,
sql`${table.difficulty} IS NULL OR ${table.difficulty} IN (${sql.raw(DIFFICULTY_LEVELS.map((d) => `'${d}'`).join(", "))})`,
),
index("idx_translations_lang").on(
index("idx_entries_lang_pos").on(
table.language_code,
table.pos,
table.difficulty,
table.cefr_level,
table.term_id,
),
],
);
export const decks = pgTable(
"decks",
export const entry_translations = pgTable(
"entry_translations",
{
id: uuid().primaryKey().defaultRandom(),
name: text().notNull(),
description: text(),
source_language: varchar({ length: 10 }).notNull(),
validated_languages: varchar({ length: 10 }).array().notNull().default([]),
type: varchar({ length: 20 }).notNull(),
entry_id: uuid()
.notNull()
.references(() => vocabulary_entries.id, { onDelete: "cascade" }),
target_language_code: varchar({ length: 10 }).notNull(),
translation: text().notNull(),
sense_hint: text(),
cefr_level: varchar({ length: 2 }),
difficulty: varchar({ length: 20 }),
source: varchar({ length: 50 }).notNull().default("kaikki"),
created_at: timestamp({ withTimezone: true }).defaultNow().notNull(),
},
(table) => [
check(
"source_language_check",
sql`${table.source_language} IN (${sql.raw(SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", "))})`,
unique("unique_translation").on(
table.entry_id,
table.target_language_code,
table.translation,
),
check(
"validated_languages_check",
sql`validated_languages <@ ARRAY[${sql.raw(SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", "))}]::varchar[]`,
"target_language_code_check",
sql`${table.target_language_code} IN (${sql.raw(SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", "))})`,
),
check(
"validated_languages_excludes_source",
sql`NOT (${table.source_language} = ANY(${table.validated_languages}))`,
"cefr_check",
sql`${table.cefr_level} IS NULL OR ${table.cefr_level} IN (${sql.raw(CEFR_LEVELS.map((l) => `'${l}'`).join(", "))})`,
),
check(
"deck_type_check",
sql`${table.type} IN (${sql.raw(SUPPORTED_DECK_TYPES.map((t) => `'${t}'`).join(", "))})`,
"difficulty_check",
sql`${table.difficulty} IS NULL OR ${table.difficulty} IN (${sql.raw(DIFFICULTY_LEVELS.map((d) => `'${d}'`).join(", "))})`,
),
index("idx_translations_target_lang").on(
table.target_language_code,
table.difficulty,
table.entry_id,
),
unique("unique_deck_name").on(table.name, table.source_language),
index("idx_decks_type").on(table.type, table.source_language),
],
);
export const deck_terms = pgTable(
"deck_terms",
{
deck_id: uuid()
.notNull()
.references(() => decks.id, { onDelete: "cascade" }),
term_id: uuid()
.notNull()
.references(() => terms.id, { onDelete: "cascade" }),
},
(table) => [primaryKey({ columns: [table.deck_id, table.term_id] })],
);
export const topics = pgTable("topics", {
id: uuid().primaryKey().defaultRandom(),
slug: varchar({ length: 50 }).notNull().unique(),
label: text().notNull(),
description: text(),
created_at: timestamp({ withTimezone: true }).defaultNow().notNull(),
});
export const term_topics = pgTable(
"term_topics",
{
term_id: uuid()
.notNull()
.references(() => terms.id, { onDelete: "cascade" }),
topic_id: uuid()
.notNull()
.references(() => topics.id, { onDelete: "cascade" }),
},
(table) => [primaryKey({ columns: [table.term_id, table.topic_id] })],
);
// ── Auth (managed by Better Auth) ─────────────────────────────────────────────
export const user = pgTable("user", {
id: text("id").primaryKey(),
@ -204,7 +123,7 @@ export const user = pgTable("user", {
createdAt: timestamp("created_at").defaultNow().notNull(),
updatedAt: timestamp("updated_at")
.defaultNow()
.$onUpdate(() => /* @__PURE__ */ new Date())
.$onUpdate(() => new Date())
.notNull(),
});
@ -216,7 +135,7 @@ export const session = pgTable(
token: text("token").notNull().unique(),
createdAt: timestamp("created_at").defaultNow().notNull(),
updatedAt: timestamp("updated_at")
.$onUpdate(() => /* @__PURE__ */ new Date())
.$onUpdate(() => new Date())
.notNull(),
ipAddress: text("ip_address"),
userAgent: text("user_agent"),
@ -245,7 +164,7 @@ export const account = pgTable(
password: text("password"),
createdAt: timestamp("created_at").defaultNow().notNull(),
updatedAt: timestamp("updated_at")
.$onUpdate(() => /* @__PURE__ */ new Date())
.$onUpdate(() => new Date())
.notNull(),
},
(table) => [index("account_userId_idx").on(table.userId)],
@ -261,24 +180,13 @@ export const verification = pgTable(
createdAt: timestamp("created_at").defaultNow().notNull(),
updatedAt: timestamp("updated_at")
.defaultNow()
.$onUpdate(() => /* @__PURE__ */ new Date())
.$onUpdate(() => new Date())
.notNull(),
},
(table) => [index("verification_identifier_idx").on(table.identifier)],
);
export const userRelations = relations(user, ({ many }) => ({
sessions: many(session),
accounts: many(account),
}));
export const sessionRelations = relations(session, ({ one }) => ({
user: one(user, { fields: [session.userId], references: [user.id] }),
}));
export const accountRelations = relations(account, ({ one }) => ({
user: one(user, { fields: [account.userId], references: [user.id] }),
}));
// ── Lobbies ───────────────────────────────────────────────────────────────────
export const lobbies = pgTable(
"lobbies",
@ -318,6 +226,36 @@ export const lobby_players = pgTable(
(table) => [primaryKey({ columns: [table.lobbyId, table.userId] })],
);
// ── Relations ─────────────────────────────────────────────────────────────────
/**
 * Drizzle relation map for `vocabulary_entries`: exposes the related
 * `entry_translations` rows under the `translations` key (one-to-many).
 */
export const vocabularyEntryRelations = relations(
  vocabulary_entries,
  (helpers) => {
    return { translations: helpers.many(entry_translations) };
  },
);
/**
 * Drizzle relation map for `entry_translations`: each translation row links
 * to a single vocabulary entry via `entry_id` -> `vocabulary_entries.id`,
 * exposed under the `entry` key.
 */
export const entryTranslationRelations = relations(
  entry_translations,
  (helpers) => {
    const entry = helpers.one(vocabulary_entries, {
      fields: [entry_translations.entry_id],
      references: [vocabulary_entries.id],
    });
    return { entry };
  },
);
/** Drizzle relation map for `user`: a user has many sessions and many accounts. */
export const userRelations = relations(user, (helpers) => {
  const sessions = helpers.many(session);
  const accounts = helpers.many(account);
  return { sessions, accounts };
});
/** Drizzle relation map for `session`: resolves `session.userId` to one `user` row. */
export const sessionRelations = relations(session, (helpers) => {
  return {
    user: helpers.one(user, {
      fields: [session.userId],
      references: [user.id],
    }),
  };
});
/** Drizzle relation map for `account`: resolves `account.userId` to one `user` row. */
export const accountRelations = relations(account, (helpers) => {
  return {
    user: helpers.one(user, {
      fields: [account.userId],
      references: [user.id],
    }),
  };
});
export const lobbyRelations = relations(lobbies, ({ one, many }) => ({
host: one(user, { fields: [lobbies.hostUserId], references: [user.id] }),
players: many(lobby_players),

View file

@ -1,25 +1,27 @@
import { db } from "@lila/db";
import { eq, and, isNotNull, sql, ne } from "drizzle-orm";
import { terms, translations, term_glosses } from "@lila/db/schema";
import { eq, and, ne, sql, isNotNull } from "drizzle-orm";
import { vocabulary_entries, entry_translations } from "@lila/db/schema";
import { alias } from "drizzle-orm/pg-core";
import type {
SupportedLanguageCode,
SupportedPos,
DifficultyLevel,
} from "@lila/shared";
// ── Types ─────────────────────────────────────────────────────────────────────
export type TranslationPairRow = {
termId: string;
entryId: string;
sourceText: string;
targetText: string;
sourceGloss: string | null;
};
// Note: difficulty filter is intentionally asymmetric. We filter on the target
// (answer) side only — a word can be A2 in Italian but B1 in English, and what
// matters for the learner is the difficulty of the word they're being taught.
// ── Queries ───────────────────────────────────────────────────────────────────
// Note: difficulty filter is intentionally on the target (translation) side.
// A word can be A2 in one language but B1 in another — what matters for the
// learner is the difficulty of the word they are being tested on.
export const getGameTerms = async (
sourceLanguage: SupportedLanguageCode,
targetLanguage: SupportedLanguageCode,
@ -27,53 +29,36 @@ export const getGameTerms = async (
difficulty: DifficultyLevel,
rounds: number,
): Promise<TranslationPairRow[]> => {
const sourceTranslations = alias(translations, "source_translations");
const targetTranslations = alias(translations, "target_translations");
const sourceEntries = alias(vocabulary_entries, "source_entries");
const targetTranslations = alias(entry_translations, "target_translations");
const rows = await db
.select({
termId: terms.id,
sourceText: sourceTranslations.text,
targetText: targetTranslations.text,
sourceGloss: term_glosses.text,
entryId: sourceEntries.id,
sourceText: sourceEntries.headword,
targetText: targetTranslations.translation,
sourceGloss: sourceEntries.gloss,
})
.from(terms)
.innerJoin(
sourceTranslations,
and(
eq(sourceTranslations.term_id, terms.id),
eq(sourceTranslations.language_code, sourceLanguage), // Filter here!
),
)
.from(sourceEntries)
.innerJoin(
targetTranslations,
and(
eq(targetTranslations.term_id, terms.id),
eq(targetTranslations.language_code, targetLanguage), // Filter here!
),
)
.leftJoin(
term_glosses,
and(
eq(term_glosses.term_id, terms.id),
eq(term_glosses.language_code, sourceLanguage),
eq(targetTranslations.entry_id, sourceEntries.id),
eq(targetTranslations.target_language_code, targetLanguage),
eq(targetTranslations.difficulty, difficulty),
isNotNull(targetTranslations.translation),
),
)
.where(
and(
eq(terms.pos, pos),
eq(targetTranslations.difficulty, difficulty),
isNotNull(sourceTranslations.difficulty), // Good data quality check!
eq(sourceEntries.language_code, sourceLanguage),
eq(sourceEntries.pos, pos),
isNotNull(sourceEntries.difficulty),
),
)
// TODO(post-mvp): ORDER BY RANDOM() sorts the entire filtered result set before
// applying LIMIT, which is fine at current data volumes (low thousands of rows
// after POS + difficulty filters) but degrades as the terms table grows. Once
// the database is fully populated and tagged, replace with one of:
// - TABLESAMPLE BERNOULLI(n) for approximate sampling on large tables
// - Random offset: SELECT ... OFFSET floor(random() * (SELECT count(*) ...))
// - Pre-computed random column with a btree index, reshuffled periodically
// Benchmark first — don't optimise until it actually hurts.
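// TODO(post-mvp): ORDER BY RANDOM() sorts the entire filtered result set
// before applying LIMIT, which is fine at current data volumes but degrades
// as the table grows. Candidate replacements (benchmark before switching):
// - TABLESAMPLE BERNOULLI(n) for approximate sampling on large tables
// - Random offset: SELECT ... OFFSET floor(random() * (SELECT count(*) ...))
// - Pre-computed random column with a btree index, reshuffled periodically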
.orderBy(sql`RANDOM()`)
.limit(rounds);
@ -81,32 +66,33 @@ export const getGameTerms = async (
};
export const getDistractors = async (
excludeTermId: string,
excludeEntryId: string,
excludeText: string,
sourceLanguage: SupportedLanguageCode,
targetLanguage: SupportedLanguageCode,
pos: SupportedPos,
difficulty: DifficultyLevel,
count: number,
): Promise<string[]> => {
const rows = await db
.select({ text: translations.text })
.from(terms)
.select({ text: entry_translations.translation })
.from(vocabulary_entries)
.innerJoin(
translations,
entry_translations,
and(
eq(translations.term_id, terms.id),
eq(translations.language_code, targetLanguage),
eq(entry_translations.entry_id, vocabulary_entries.id),
eq(entry_translations.target_language_code, targetLanguage),
eq(entry_translations.difficulty, difficulty),
),
)
.where(
and(
eq(terms.pos, pos),
eq(translations.difficulty, difficulty),
ne(terms.id, excludeTermId),
ne(translations.text, excludeText),
eq(vocabulary_entries.language_code, sourceLanguage),
eq(vocabulary_entries.pos, pos),
ne(vocabulary_entries.id, excludeEntryId),
ne(entry_translations.translation, excludeText),
),
)
// TODO(post-mvp): same ORDER BY RANDOM() concern as getGameTerms — see comment there.
.orderBy(sql`RANDOM()`)
.limit(count);