From e80f291c41c96b40446711cbc91395ab3c5cc7af Mon Sep 17 00:00:00 2001
From: lila <beiweitemderbeste@protonmail.com>
Date: Sun, 5 Apr 2026 18:57:09 +0200
Subject: [PATCH] refactoring data model

---
 documentation/decisions.md       |  55 ++++++++++++++++-
 packages/db/src/db/schema.ts     | 102 +++++++++++++------------------
 packages/shared/src/constants.ts |   2 +
 3 files changed, 95 insertions(+), 64 deletions(-)

diff --git a/documentation/decisions.md b/documentation/decisions.md
index fd02d4c..75e2d52 100644
--- a/documentation/decisions.md
+++ b/documentation/decisions.md
@@ -228,6 +228,37 @@ This is why `decks.source_language` is not just a technical detail — it is the
 
 Same translation data underneath, correctly frequency-grounded per direction. Two wordlist files, two generation script runs.
 
+### Decks: media metadata structure (post-MVP, options documented)
+
+When the Media hierarchy is implemented, each media type (TV show, movie, book, song)
+has different attributes. Three options considered:
+
+**Option A: One table with nullable columns**
+All media types in one table, type-specific columns nullable. Simple but becomes a sparse
+matrix as media types grow.
+
+**Option B: Separate table per media type**
+```text
+tv_metadata:    deck_id, title, season, episode
+movie_metadata: deck_id, title, year
+book_metadata:  deck_id, title, author, year
+song_metadata:  deck_id, title, artist, album, year
+```
+Each table has exactly the right columns. Clean and queryable, but more tables to maintain.
+
+**Option C: JSONB for flexible attributes**
+```text
+media_metadata: deck_id, media_type, title, attributes jsonb
+```
+Type-specific fields in a JSON blob. No migration needed for new media types but
+attributes are not schema-validated and harder to query.
+
+**Current recommendation:** Option A to start (few media types initially, sparse
+columns manageable), migrate to Option B if the number of media types grows.
+Option C only if media types become numerous and unpredictable.
+
+Decision deferred until Media is actually built.
+
 ### Terms: `synset_id` nullable (not NOT NULL)
 
 **Problem:** non-WordNet terms (custom words, Wiktionary-sourced entries added later) won't have a synset ID. `NOT NULL` is too strict.
@@ -254,9 +285,27 @@ Postgres allows multiple `NULL` pairs under a unique constraint, so manual entri
 
 No CHECK constraint on `source` — it is only written by controlled import scripts, not user input. A free varchar is sufficient.
 
-### Terms: `cefr_level` column (deferred population)
+### Translations: `cefr_level` column (deferred population, not on `terms`)
 
-Added as nullable `varchar(2)` with CHECK constraint against `CEFR_LEVELS` (`A1`–`C2`). Belongs on `terms`, not `decks` — difficulty is a property of the term, not the curated list. Left null for MVP; populated later via SUBTLEX or an external CEFR wordlist. Added now while the table is small to avoid a costly backfill migration later.
+CEFR difficulty is a property of the individual word, not of the concept it expresses. English "house" is A1 while English "domicile" is B2 — same concept, different words, different difficulty — and the word for that concept in another language may sit at yet another level. Moving `cefr_level` to `translations` lets each word in each language carry its own level independently.
+
+Added as nullable `varchar(2)` with CHECK constraint against `CEFR_LEVELS` (`A1`–`C2`) on the `translations` table. Left null for MVP; populated later via SUBTLEX or an external CEFR wordlist. Also included in the `translations` index since the quiz query filters on it:
+
+```ts
+index("idx_translations_lang").on(table.language_code, table.cefr_level, table.term_id)
+```
+
+### `language_pairs` table: dropped
+
+Valid language pairs are already implicitly defined by `decks.source_language` + `decks.validated_languages`. The table was redundant — the same information can be derived directly from decks:
+
+```sql
+SELECT DISTINCT source_language, unnest(validated_languages) AS target_language
+FROM decks
+WHERE validated_languages != '{}'
+```
+
+The only thing `language_pairs` added was an `active` flag to manually disable a direction. This is an edge case not needed for MVP. Dropped to remove a maintenance surface that required staying in sync with deck data.
 
 ### Schema: `categories` + `term_categories` (empty for MVP)
 
@@ -379,7 +428,7 @@ Phase 0 complete. Phase 1 data pipeline complete.
 
 Roadmap to API implementation:
 
-1. **Finalize data model** — apply decisions above: `synset_id` nullable, add `source` + `source_id` + `cefr_level` to `terms`, add `categories` + `term_categories` tables, add `language_code` CHECK to `translations` and `term_glosses`
+1. **Finalize data model** — apply decisions above: `synset_id` nullable, add `source` + `source_id` to `terms`, add `cefr_level` to `translations`, add `categories` + `term_categories` tables, add `language_code` CHECK to `translations` and `term_glosses`, drop `language_pairs`
 2. **Write and run migrations** — schema changes before any data expansion
 3. **Expand data pipeline** — import all OMW languages and POS, not just English nouns with Italian translations
 4. **Decide SUBTLEX → `cefr_level` mapping strategy** — raw frequency ranks need a mapping to A1–C2 bands before tiered decks are meaningful
diff --git a/packages/db/src/db/schema.ts b/packages/db/src/db/schema.ts
index cbc2e34..0932b62 100644
--- a/packages/db/src/db/schema.ts
+++ b/packages/db/src/db/schema.ts
@@ -6,7 +6,6 @@ import {
   varchar,
   unique,
   check,
-  boolean,
   primaryKey,
   index,
 } from "drizzle-orm/pg-core";
@@ -17,6 +16,7 @@ import {
   SUPPORTED_POS,
   SUPPORTED_LANGUAGE_CODES,
   CEFR_LEVELS,
+  SUPPORTED_DECK_TYPES,
 } from "@glossa/shared";
 
 export const terms = pgTable(
@@ -26,7 +26,6 @@ export const terms = pgTable(
     source: varchar({ length: 50 }), // 'omw', 'wiktionary', null for manual
     source_id: text(), // synset_id value for omw, wiktionary QID, etc.
     pos: varchar({ length: 20 }).notNull(),
-    cefr_level: varchar({ length: 2 }),
     created_at: timestamp({ withTimezone: true }).defaultNow().notNull(),
   },
   (table) => [
@@ -34,10 +33,6 @@ export const terms = pgTable(
       "pos_check",
       sql`${table.pos} IN (${sql.raw(SUPPORTED_POS.map((p) => `'${p}'`).join(", "))})`,
     ),
-    check(
-      "cefr_check",
-      sql`${table.cefr_level} IN (${sql.raw(CEFR_LEVELS.map((p) => `'${p}'`).join(", "))})`,
-    ),
     unique("unique_source_id").on(table.source, table.source_id),
     index("idx_terms_source_pos").on(table.source, table.pos),
   ],
@@ -76,6 +71,7 @@ export const translations = pgTable(
       .references(() => terms.id, { onDelete: "cascade" }),
     language_code: varchar({ length: 10 }).notNull(),
     text: text().notNull(),
+    cefr_level: varchar({ length: 2 }),
     created_at: timestamp({ withTimezone: true }).defaultNow().notNull(),
   },
   (table) => [
@@ -88,41 +84,14 @@ export const translations = pgTable(
       "language_code_check",
       sql`${table.language_code} IN (${sql.raw(SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", "))})`,
     ),
-    index("idx_translations_lang").on(table.language_code, table.term_id),
-  ],
-);
-
-export const language_pairs = pgTable(
-  "language_pairs",
-  {
-    id: uuid().primaryKey().defaultRandom(),
-    source_language: varchar({ length: 10 }).notNull(),
-    target_language: varchar({ length: 10 }).notNull(),
-    label: text(),
-    active: boolean().default(true).notNull(),
-    created_at: timestamp({ withTimezone: true }).defaultNow().notNull(),
-  },
-  (table) => [
-    unique("unique_source_target").on(
-      table.source_language,
-      table.target_language,
-    ),
     check(
-      "source_language_check",
-      sql`${table.source_language} IN (${sql.raw(SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", "))})`,
+      "cefr_check",
+      sql`${table.cefr_level} IN (${sql.raw(CEFR_LEVELS.map((l) => `'${l}'`).join(", "))})`,
     ),
-    check(
-      "target_language_check",
-      sql`${table.target_language} IN (${sql.raw(SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", "))})`,
-    ),
-    check(
-      "no_self_pair",
-      sql`${table.source_language} != ${table.target_language}`,
-    ),
-    index("idx_pairs_active").on(
-      table.active,
-      table.source_language,
-      table.target_language,
+    index("idx_translations_lang").on(
+      table.language_code,
+      table.cefr_level,
+      table.term_id,
     ),
   ],
 );
@@ -149,7 +118,7 @@ export const decks = pgTable(
     description: text(),
     source_language: varchar({ length: 10 }).notNull(),
     validated_languages: varchar({ length: 10 }).array().notNull().default([]),
-    is_public: boolean().default(false).notNull(),
+    type: varchar({ length: 20 }).notNull(),
     created_at: timestamp({ withTimezone: true }).defaultNow().notNull(),
   },
   (table) => [
@@ -165,7 +134,12 @@ export const decks = pgTable(
       "validated_languages_excludes_source",
       sql`NOT (${table.source_language} = ANY(${table.validated_languages}))`,
     ),
+    check(
+      "deck_type_check",
+      sql`${table.type} IN (${sql.raw(SUPPORTED_DECK_TYPES.map((t) => `'${t}'`).join(", "))})`,
+    ),
     unique("unique_deck_name").on(table.name, table.source_language),
+    index("idx_decks_type").on(table.type, table.source_language),
   ],
 );
 
@@ -178,31 +152,37 @@ export const deck_terms = pgTable(
     term_id: uuid()
       .notNull()
       .references(() => terms.id, { onDelete: "cascade" }),
-    added_at: timestamp({ withTimezone: true }).defaultNow().notNull(),
   },
   (table) => [primaryKey({ columns: [table.deck_id, table.term_id] })],
 );
 
+export const topics = pgTable("topics", {
+  id: uuid().primaryKey().defaultRandom(),
+  slug: varchar({ length: 50 }).notNull().unique(),
+  label: text().notNull(),
+  description: text(),
+  created_at: timestamp({ withTimezone: true }).defaultNow().notNull(),
+});
+
+export const term_topics = pgTable(
+  "term_topics",
+  {
+    term_id: uuid()
+      .notNull()
+      .references(() => terms.id, { onDelete: "cascade" }),
+    topic_id: uuid()
+      .notNull()
+      .references(() => topics.id, { onDelete: "cascade" }),
+  },
+  (table) => [primaryKey({ columns: [table.term_id, table.topic_id] })],
+);
+
 /*
- * INTENTIONAL DESIGN DECISIONS
+ * INTENTIONAL DESIGN DECISIONS — see decisions.md for full reasoning
  *
- * surrogate id + synset_id (terms):
- *   Both exist on purpose. synset_id is the natural WordNet key used for lookups
- *   and re-imports. id is the stable internal FK target — if synset IDs change in
- *   a future WordNet version, FK references don't need to cascade.
- *
- * display_name UNIQUE (users):
- *   Unique usernames are a feature, not an oversight. One "Alex" per app.
- *
- * UNIQUE(term_id, language_code, text) (translations):
- *   This does allow synonyms. "banco" and "orilla" are different text values and
- *   both insert cleanly. The constraint only prevents exact duplicate rows.
- *
- * updated_at omitted:
- *   A column with DEFAULT now() that is never written on updates is misleading.
- *   Omitted until a trigger or ORM hook is in place to actually maintain it.
- *
- * FK indexes:
- *   All FK columns are covered — either by explicit indexes, composite unique
- *   indexes, or the composite PK on deck_terms. No sequential scans on joins.
+ * source + source_id (terms): idempotency key per import pipeline
+ * display_name UNIQUE (users): multiplayer requires distinguishable names
+ * UNIQUE(term_id, language_code, text): allows synonyms, prevents exact duplicates
+ * updated_at omitted: misleading without a trigger to maintain it
+ * FK indexes: FK columns are indexed, except the trailing column of each composite PK (deck_terms.term_id, term_topics.topic_id), which has no index of its own
  */
diff --git a/packages/shared/src/constants.ts b/packages/shared/src/constants.ts
index 416aff4..39cfa61 100644
--- a/packages/shared/src/constants.ts
+++ b/packages/shared/src/constants.ts
@@ -5,3 +5,5 @@ export const SUPPORTED_POS = ["noun", "verb"] as const;
 export const GAME_ROUNDS = ["3", "10"] as const;
 
 export const CEFR_LEVELS = ["A1", "A2", "B1", "B2", "C1", "C2"] as const;
+
+export const SUPPORTED_DECK_TYPES = ["grammar", "media"] as const;