feat(api): implement game terms query with double join

- Add double join on translations for source/target languages
- Left join term_glosses for optional source-language glosses
- Filter difficulty on target side only (intentionally asymmetric:
  a word's difficulty can differ between languages, and what matters
  is the difficulty of the word being learned)
- Return neutral field names (sourceText, targetText, sourceGloss)
  instead of quiz semantics; service layer maps to prompt/answer
- Tighten term_glosses unique constraint to (term_id, language_code)
  to prevent the left join from multiplying question rows
- Add TODO for ORDER BY RANDOM() scaling post-MVP
This commit is contained in:
lila 2026-04-10 18:02:03 +02:00
parent 9fc3ba375a
commit b59fac493d
4 changed files with 356 additions and 28 deletions

View file

@ -51,11 +51,7 @@ export const term_glosses = pgTable(
created_at: timestamp({ withTimezone: true }).defaultNow().notNull(),
},
(table) => [
unique("unique_term_gloss").on(
table.term_id,
table.language_code,
table.text,
),
unique("unique_term_gloss").on(table.term_id, table.language_code),
check(
"language_code_check",
sql`${table.language_code} IN (${sql.raw(SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", "))})`,

View file

@ -1,6 +1,7 @@
import { db } from "@glossa/db";
import { eq, and, isNotNull, sql } from "drizzle-orm";
import { alias } from "drizzle-orm/pg-core";
import { terms, translations, term_glosses } from "@glossa/db/schema";
import type {
  SupportedLanguageCode,
  SupportedPos,
  DifficultyLevel,
} from "@glossa/shared";
/**
 * One game-ready term pair: the same term rendered in the source and target
 * languages, plus an optional source-language gloss.
 *
 * Field names are deliberately neutral (sourceText/targetText rather than
 * prompt/answer): the service layer decides which side is shown to the
 * learner and which is asked for.
 */
export type TranslationPairRow = {
  termId: string;
  sourceText: string;
  targetText: string;
  // null when no gloss exists for the source language (left join below).
  sourceGloss: string | null;
};
// Note: difficulty filter is intentionally asymmetric. We filter on the target
// (answer) side only — a word can be A2 in Italian but B1 in English, and what
// matters for the learner is the difficulty of the word they're being taught.
export const getGameTerms = async (
sourceLanguage: SupportedLanguageCode,
targetLanguage: SupportedLanguageCode,
pos: SupportedPos,
difficulty: DifficultyLevel,
count: number,
) => {
rounds: number,
): Promise<TranslationPairRow[]> => {
const sourceTranslations = alias(translations, "source_translations");
const targetTranslations = alias(translations, "target_translations");
const rows = await db
.select()
.select({
termId: terms.id,
prompt: sourceTranslations.text,
answer: targetTranslations.text,
gloss: term_glosses.text,
})
.from(terms)
.innerJoin(translations, eq(translations.term_id, terms.id))
.innerJoin(
sourceTranslations,
and(
eq(sourceTranslations.term_id, terms.id),
eq(sourceTranslations.language_code, sourceLanguage), // Filter here!
),
)
.innerJoin(
targetTranslations,
and(
eq(targetTranslations.term_id, terms.id),
eq(targetTranslations.language_code, targetLanguage), // Filter here!
),
)
.leftJoin(
term_glosses,
and(
eq(term_glosses.term_id, terms.id),
eq(term_glosses.language_code, sourceLanguage),
),
)
.where(
and(
eq(terms.pos, pos),
eq(translations.language_code, targetLanguage),
eq(translations.difficulty, difficulty),
eq(targetTranslations.difficulty, difficulty),
isNotNull(sourceTranslations.difficulty), // Good data quality check!
),
)
.limit(count);
// TODO(post-mvp): ORDER BY RANDOM() sorts the entire filtered result set before
// applying LIMIT, which is fine at current data volumes (low thousands of rows
// after POS + difficulty filters) but degrades as the terms table grows. Once
// the database is fully populated and tagged, replace with one of:
// - TABLESAMPLE BERNOULLI(n) for approximate sampling on large tables
// - Random offset: SELECT ... OFFSET floor(random() * (SELECT count(*) ...))
// - Pre-computed random column with a btree index, reshuffled periodically
// Benchmark first — don't optimise until it actually hurts.
.orderBy(sql`RANDOM()`)
.limit(rounds);
return rows;
};