From e80f291c41c96b40446711cbc91395ab3c5cc7af Mon Sep 17 00:00:00 2001 From: lila Date: Sun, 5 Apr 2026 18:57:09 +0200 Subject: [PATCH] refactoring data model --- documentation/decisions.md | 55 ++++++++++++++++- packages/db/src/db/schema.ts | 102 +++++++++++++------------------ packages/shared/src/constants.ts | 2 + 3 files changed, 95 insertions(+), 64 deletions(-) diff --git a/documentation/decisions.md b/documentation/decisions.md index fd02d4c..75e2d52 100644 --- a/documentation/decisions.md +++ b/documentation/decisions.md @@ -228,6 +228,37 @@ This is why `decks.source_language` is not just a technical detail — it is the Same translation data underneath, correctly frequency-grounded per direction. Two wordlist files, two generation script runs. +### Decks: media metadata structure (post-MVP, options documented) + +When the Media hierarchy is implemented, each media type (TV show, movie, book, song) +has different attributes. Three options considered: + +**Option A: One table with nullable columns** +All media types in one table, type-specific columns nullable. Simple but becomes a sparse +matrix as media types grow. + +**Option B: Separate table per media type** +```ts +tv_metadata: deck_id, title, season, episode +movie_metadata: deck_id, title, year +book_metadata: deck_id, title, author, year +song_metadata: deck_id, title, artist, album, year +``` +Each table has exactly the right columns. Clean and queryable, more tables to maintain. + +**Option C: JSONB for flexible attributes** +```ts +media_metadata: deck_id, media_type, title, attributes jsonb +``` +Type-specific fields in a JSON blob. No migration needed for new media types but +attributes are not schema-validated and harder to query. + +**Current recommendation:** Option A to start (few media types initially, sparse +columns manageable), migrate to Option B if the number of media types grows. +Option C only if media types become numerous and unpredictable. 
+ +Decision deferred until Media is actually built. + ### Terms: `synset_id` nullable (not NOT NULL) **Problem:** non-WordNet terms (custom words, Wiktionary-sourced entries added later) won't have a synset ID. `NOT NULL` is too strict. @@ -254,9 +285,27 @@ Postgres allows multiple `NULL` pairs under a unique constraint, so manual entri No CHECK constraint on `source` — it is only written by controlled import scripts, not user input. A free varchar is sufficient. -### Terms: `cefr_level` column (deferred population) +### Translations: `cefr_level` column (deferred population, not on `terms`) -Added as nullable `varchar(2)` with CHECK constraint against `CEFR_LEVELS` (`A1`–`C2`). Belongs on `terms`, not `decks` — difficulty is a property of the term, not the curated list. Left null for MVP; populated later via SUBTLEX or an external CEFR wordlist. Added now while the table is small to avoid a costly backfill migration later. +CEFR difficulty is a property of the individual word, not of the underlying concept. In English, "house" is A1 while "domicile" is B2 — same concept, different words, different difficulty — and the equivalent word in another language may sit at a different level again. Moving `cefr_level` to `translations` lets every word in every language carry its own level independently. + +Added as nullable `varchar(2)` with CHECK constraint against `CEFR_LEVELS` (`A1`–`C2`) on the `translations` table. Left null for MVP; populated later via SUBTLEX or an external CEFR wordlist. Also included in the `translations` index since the quiz query filters on it: + +```ts +index("idx_translations_lang").on(table.language_code, table.cefr_level, table.term_id) +``` + +### `language_pairs` table: dropped + +Valid language pairs are already implicitly defined by `decks.source_language` + `decks.validated_languages`. 
The table was redundant — the same information can be derived directly from decks: + +```sql +SELECT DISTINCT source_language, unnest(validated_languages) AS target_language +FROM decks +WHERE validated_languages != '{}' +``` + +The only thing `language_pairs` added was an `active` flag to manually disable a direction. This is an edge case not needed for MVP. Dropped to remove a maintenance surface that required staying in sync with deck data. ### Schema: `categories` + `term_categories` (empty for MVP) @@ -379,7 +428,7 @@ Phase 0 complete. Phase 1 data pipeline complete. Roadmap to API implementation: -1. **Finalize data model** — apply decisions above: `synset_id` nullable, add `source` + `source_id` + `cefr_level` to `terms`, add `categories` + `term_categories` tables, add `language_code` CHECK to `translations` and `term_glosses` +1. **Finalize data model** — apply decisions above: `synset_id` nullable, add `source` + `source_id` to `terms`, add `cefr_level` to `translations`, add `categories` + `term_categories` tables, add `language_code` CHECK to `translations` and `term_glosses`, drop `language_pairs` 2. **Write and run migrations** — schema changes before any data expansion 3. **Expand data pipeline** — import all OMW languages and POS, not just English nouns with Italian translations 4. 
**Decide SUBTLEX → `cefr_level` mapping strategy** — raw frequency ranks need a mapping to A1–C2 bands before tiered decks are meaningful diff --git a/packages/db/src/db/schema.ts b/packages/db/src/db/schema.ts index cbc2e34..0932b62 100644 --- a/packages/db/src/db/schema.ts +++ b/packages/db/src/db/schema.ts @@ -6,7 +6,6 @@ import { varchar, unique, check, - boolean, primaryKey, index, } from "drizzle-orm/pg-core"; @@ -17,6 +16,7 @@ import { SUPPORTED_POS, SUPPORTED_LANGUAGE_CODES, CEFR_LEVELS, + SUPPORTED_DECK_TYPES, } from "@glossa/shared"; export const terms = pgTable( @@ -26,7 +26,6 @@ export const terms = pgTable( source: varchar({ length: 50 }), // 'omw', 'wiktionary', null for manual source_id: text(), // synset_id value for omw, wiktionary QID, etc. pos: varchar({ length: 20 }).notNull(), - cefr_level: varchar({ length: 2 }), created_at: timestamp({ withTimezone: true }).defaultNow().notNull(), }, (table) => [ @@ -34,10 +33,6 @@ export const terms = pgTable( "pos_check", sql`${table.pos} IN (${sql.raw(SUPPORTED_POS.map((p) => `'${p}'`).join(", "))})`, ), - check( - "cefr_check", - sql`${table.cefr_level} IN (${sql.raw(CEFR_LEVELS.map((p) => `'${p}'`).join(", "))})`, - ), unique("unique_source_id").on(table.source, table.source_id), index("idx_terms_source_pos").on(table.source, table.pos), ], @@ -76,6 +71,7 @@ export const translations = pgTable( .references(() => terms.id, { onDelete: "cascade" }), language_code: varchar({ length: 10 }).notNull(), text: text().notNull(), + cefr_level: varchar({ length: 2 }), created_at: timestamp({ withTimezone: true }).defaultNow().notNull(), }, (table) => [ @@ -88,41 +84,14 @@ export const translations = pgTable( "language_code_check", sql`${table.language_code} IN (${sql.raw(SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", "))})`, ), - index("idx_translations_lang").on(table.language_code, table.term_id), - ], -); - -export const language_pairs = pgTable( - "language_pairs", - { - id: 
uuid().primaryKey().defaultRandom(), - source_language: varchar({ length: 10 }).notNull(), - target_language: varchar({ length: 10 }).notNull(), - label: text(), - active: boolean().default(true).notNull(), - created_at: timestamp({ withTimezone: true }).defaultNow().notNull(), - }, - (table) => [ - unique("unique_source_target").on( - table.source_language, - table.target_language, - ), check( - "source_language_check", - sql`${table.source_language} IN (${sql.raw(SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", "))})`, + "cefr_check", + sql`${table.cefr_level} IN (${sql.raw(CEFR_LEVELS.map((l) => `'${l}'`).join(", "))})`, ), - check( - "target_language_check", - sql`${table.target_language} IN (${sql.raw(SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", "))})`, - ), - check( - "no_self_pair", - sql`${table.source_language} != ${table.target_language}`, - ), - index("idx_pairs_active").on( - table.active, - table.source_language, - table.target_language, + index("idx_translations_lang").on( + table.language_code, + table.cefr_level, + table.term_id, ), ], ); @@ -149,7 +118,7 @@ export const decks = pgTable( description: text(), source_language: varchar({ length: 10 }).notNull(), validated_languages: varchar({ length: 10 }).array().notNull().default([]), - is_public: boolean().default(false).notNull(), + type: varchar({ length: 20 }).notNull(), created_at: timestamp({ withTimezone: true }).defaultNow().notNull(), }, (table) => [ @@ -165,7 +134,12 @@ export const decks = pgTable( "validated_languages_excludes_source", sql`NOT (${table.source_language} = ANY(${table.validated_languages}))`, ), + check( + "deck_type_check", + sql`${table.type} IN (${sql.raw(SUPPORTED_DECK_TYPES.map((t) => `'${t}'`).join(", "))})`, + ), unique("unique_deck_name").on(table.name, table.source_language), + index("idx_decks_type").on(table.type, table.source_language), ], ); @@ -178,31 +152,37 @@ export const deck_terms = pgTable( term_id: uuid() .notNull() .references(() => 
terms.id, { onDelete: "cascade" }), - added_at: timestamp({ withTimezone: true }).defaultNow().notNull(), }, (table) => [primaryKey({ columns: [table.deck_id, table.term_id] })], ); +export const topics = pgTable("topics", { + id: uuid().primaryKey().defaultRandom(), + slug: varchar({ length: 50 }).notNull().unique(), + label: text().notNull(), + description: text(), + created_at: timestamp({ withTimezone: true }).defaultNow().notNull(), +}); + +export const term_topics = pgTable( + "term_topics", + { + term_id: uuid() + .notNull() + .references(() => terms.id, { onDelete: "cascade" }), + topic_id: uuid() + .notNull() + .references(() => topics.id, { onDelete: "cascade" }), + }, + (table) => [primaryKey({ columns: [table.term_id, table.topic_id] })], +); + /* - * INTENTIONAL DESIGN DECISIONS + * INTENTIONAL DESIGN DECISIONS — see decisions.md for full reasoning * - * surrogate id + synset_id (terms): - * Both exist on purpose. synset_id is the natural WordNet key used for lookups - * and re-imports. id is the stable internal FK target — if synset IDs change in - * a future WordNet version, FK references don't need to cascade. - * - * display_name UNIQUE (users): - * Unique usernames are a feature, not an oversight. One "Alex" per app. - * - * UNIQUE(term_id, language_code, text) (translations): - * This does allow synonyms. "banco" and "orilla" are different text values and - * both insert cleanly. The constraint only prevents exact duplicate rows. - * - * updated_at omitted: - * A column with DEFAULT now() that is never written on updates is misleading. - * Omitted until a trigger or ORM hook is in place to actually maintain it. - * - * FK indexes: - * All FK columns are covered — either by explicit indexes, composite unique - * indexes, or the composite PK on deck_terms. No sequential scans on joins. 
+ * source + source_id (terms): idempotency key per import pipeline + * display_name UNIQUE (users): multiplayer requires distinguishable names + * UNIQUE(term_id, language_code, text): allows synonyms, prevents exact duplicates + * updated_at omitted: misleading without a trigger to maintain it + * FK indexes: all FK columns covered, no sequential scans on joins */ diff --git a/packages/shared/src/constants.ts b/packages/shared/src/constants.ts index 416aff4..39cfa61 100644 --- a/packages/shared/src/constants.ts +++ b/packages/shared/src/constants.ts @@ -5,3 +5,5 @@ export const SUPPORTED_POS = ["noun", "verb"] as const; export const GAME_ROUNDS = ["3", "10"] as const; export const CEFR_LEVELS = ["A1", "A2", "B1", "B2", "C1", "C2"] as const; + +export const SUPPORTED_DECK_TYPES = ["grammar", "media"] as const;