formatting

This commit is contained in:
lila 2026-03-31 10:06:06 +02:00
parent 20fa6a9331
commit e3a2136720
11 changed files with 72803 additions and 408878 deletions

View file

@ -14,14 +14,12 @@ Each synset extracted from WordNet is represented as:
{
"synset_id": "ili:i35545",
"pos": "noun",
"translations": {
"en": ["entity"],
"it": ["cosa", "entità"]
}
"translations": { "en": ["entity"], "it": ["cosa", "entità"] }
}
```
**Fields:**
- `synset_id` — OMW Interlingual Index ID, maps to `terms.synset_id` in the DB
- `pos` — part of speech, matches the CHECK constraint on `terms.pos`
- `translations` — object of language code → array of lemmas (synonyms within a synset)
@ -53,6 +51,7 @@ translations
## 3. Seeding Script — v1 (batch, truncate-based)
### Approach
- Read a single JSON file
- Batch inserts into `terms` and `translations` in groups of 500
- Truncate tables before each run for a clean slate
@ -60,7 +59,7 @@ translations
### Key decisions made during development
| Issue | Resolution |
|-------|-----------|
| -------------------------------- | --------------------------------------------------- |
| `JSON.parse` returns `any` | Added `Array.isArray` check before casting |
| `forEach` doesn't await | Switched to `for...of` |
| Empty array types | Used Drizzle's `$inferInsert` types |
@ -134,7 +133,9 @@ const main = async () => {
if (termsArray.length >= 500) {
batchCount++;
console.log(`Uploading batch ${batchCount} (${batchCount * 500}/${allSynsets.length} synsets)...`);
console.log(
`Uploading batch ${batchCount} (${batchCount * 500}/${allSynsets.length} synsets)...`,
);
await uploadToDB(termsArray, translationsArray);
termsArray.length = 0;
translationsArray.length = 0;
@ -143,7 +144,9 @@ const main = async () => {
if (termsArray.length > 0) {
batchCount++;
console.log(`Uploading final batch (${allSynsets.length}/${allSynsets.length} synsets)...`);
console.log(
`Uploading final batch (${allSynsets.length}/${allSynsets.length} synsets)...`,
);
await uploadToDB(termsArray, translationsArray);
}
@ -161,6 +164,7 @@ main().catch((error) => {
## 4. Pitfalls Encountered
### Duplicate key on re-run
Running the script twice causes `duplicate key value violates unique constraint "terms_synset_id_unique"`. Fix: truncate before seeding.
```bash
@ -168,15 +172,19 @@ docker exec -it glossa-database psql -U glossa -d glossa -c "TRUNCATE translatio
```
### `onConflictDoNothing` breaks FK references
When `onConflictDoNothing` skips a `terms` insert, the in-memory UUID is never written to the DB. Subsequent `translations` inserts reference that non-existent UUID, causing a FK violation. This is why the truncate approach is correct for batch seeding.
### DATABASE_URL misconfigured
Correct format:
```
DATABASE_URL=postgresql://glossa:glossa@localhost:5432/glossa
```
### Tables not found after `docker compose up`
Migrations must be applied first: `npx drizzle-kit migrate`
---
@ -205,10 +213,13 @@ docker exec -it glossa-database psql -U glossa -d glossa -c "SELECT COUNT(*) FRO
## 6. Seeding Script — v2 (incremental upsert, multi-file)
### Motivation
The truncate approach is fine for dev but unsuitable for production — it wipes all data. The v2 approach extends the database incrementally without ever truncating.
### File naming convention
One JSON file per language pair per POS:
```
scripts/datafiles/
en-it-nouns.json
@ -219,7 +230,9 @@ scripts/datafiles/
```
### How incremental upsert works
For a concept like "dog" already in the DB with English and Italian:
1. Import `en-fr-nouns.json`
2. Upsert `terms` by `synset_id` — finds existing row, returns its real ID
3. `dog (en)` already exists → skipped by `onConflictDoNothing`
@ -228,6 +241,7 @@ For a concept like "dog" already in the DB with English and Italian:
The concept is **extended**, not replaced.
### Tradeoff vs batch approach
Batching is no longer possible since you need the real `term.id` from the DB before inserting translations. Each synset is processed individually. For 25k rows this is still fast enough.
### Key types added
@ -252,7 +266,9 @@ type FileName = {
const parseFilename = (filename: string): FileName => {
const parts = filename.replace(".json", "").split("-");
if (parts.length !== 3)
throw new Error(`Invalid filename format: ${filename}. Expected: sourcelang-targetlang-pos.json`);
throw new Error(
`Invalid filename format: ${filename}. Expected: sourcelang-targetlang-pos.json`,
);
const [sourceLang, targetLang, pos] = parts;
if (!SUPPORTED_LANGUAGE_CODES.includes(sourceLang as LANGUAGE_CODE))
throw new Error(`Unsupported language code: ${sourceLang}`);
@ -278,10 +294,7 @@ const upsertSynset = async (
const [upsertedTerm] = await db
.insert(terms)
.values({ synset_id: synset.synset_id, pos: synset.pos })
.onConflictDoUpdate({
target: terms.synset_id,
set: { pos: synset.pos },
})
.onConflictDoUpdate({ target: terms.synset_id, set: { pos: synset.pos } })
.returning({ id: terms.id, created_at: terms.created_at });
const termInserted = upsertedTerm.created_at > new Date(Date.now() - 1000);
@ -311,7 +324,7 @@ const upsertSynset = async (
## 7. Strategy Comparison
| Strategy | Use case | Pros | Cons |
|----------|----------|------|------|
| ------------------ | ----------------------------- | --------------------- | -------------------- |
| Truncate + batch | Dev / first-time setup | Fast, simple | Wipes all data |
| Incremental upsert | Production / adding languages | Safe, non-destructive | No batching, slower |
| Migrations-as-data | Production audit trail | Clean history | Files accumulate |
@ -331,6 +344,7 @@ The `exports` field must be an object, not an array:
```
Imports then resolve as:
```ts
import { db } from "@glossa/db";
import { terms, translations } from "@glossa/db/schema";

View file

@ -61,6 +61,7 @@ Production will use Nginx to serve static Vite build output.
**Original approach:** Store `frequency_rank` on `terms` table and filter by rank range for difficulty.
**Problem discovered:** WordNet/OMW frequency data is unreliable for language learning. Extraction produced results like:
- Rank 1: "In" → "indio" (chemical symbol: Indium)
- Rank 2: "Be" → "berillio" (chemical symbol: Beryllium)
- Rank 7: "He" → "elio" (chemical symbol: Helium)
@ -68,12 +69,14 @@ Production will use Nginx to serve static Vite build output.
These are technically "common" in WordNet (every element is a noun) but useless for vocabulary learning.
**Decision:**
- `terms` table stores ALL available OMW synsets (raw data, no frequency filtering)
- `decks` table stores curated learning lists (A1, A2, B1, "Most Common 1000", etc.)
- `deck_terms` junction table links terms to decks with position ordering
- `rooms.deck_id` specifies which vocabulary deck a game uses
**Benefits:**
- Curricula can come from external sources (CEFR lists, Oxford 3000, SUBTLEX)
- Bad data (chemical symbols, obscure words) excluded at deck level, not schema level
- Users can create custom decks later
@ -162,6 +165,7 @@ Then `sudo sysctl -p` or restart Docker.
**Problem:** Embeds auth provider in the primary key (e.g. `"google|12345"`). If OpenAuth changes format or a second provider is added, the PK cascades through all FKs (`rooms.host_id`, `room_players.user_id`).
**Decision:**
- `users.id` = internal UUID (stable FK target)
- `users.openauth_sub` = text UNIQUE (auth provider claim)
- Allows adding multiple auth providers per user later without FK changes
@ -177,6 +181,7 @@ Allows multiple synonyms per language per term (e.g. "dog", "hound" for same syn
### Decks: `pair_id` is nullable
`decks.pair_id` references `language_pairs` but is nullable. Reasons:
- Single-language decks (e.g. "English Grammar")
- Multi-pair decks (e.g. "Cognates" spanning EN-IT and EN-FR)
- System decks (created by app, not tied to specific user)
@ -186,6 +191,7 @@ Allows multiple synonyms per language per term (e.g. "dog", "hound" for same syn
**Original approach:** Store `frequency_rank` on `terms` table and filter by rank range for difficulty.
**Problem discovered:** WordNet/OMW frequency data is unreliable for language learning. Extraction produced results like:
- Rank 1: "In" → "indio" (chemical symbol: Indium)
- Rank 2: "Be" → "berillio" (chemical symbol: Beryllium)
- Rank 7: "He" → "elio" (chemical symbol: Helium)
@ -193,12 +199,14 @@ Allows multiple synonyms per language per term (e.g. "dog", "hound" for same syn
These are technically "common" in WordNet (every element is a noun) but useless for vocabulary learning.
**Decision:**
- `terms` table stores ALL available OMW synsets (raw data, no frequency filtering)
- `decks` table stores curated learning lists (A1, A2, B1, "Most Common 1000", etc.)
- `deck_terms` junction table links terms to decks with position ordering
- `rooms.deck_id` specifies which vocabulary deck a game uses
**Benefits:**
- Curricula can come from external sources (CEFR lists, Oxford 3000, SUBTLEX)
- Bad data (chemical symbols, obscure words) excluded at deck level, not schema level
- Users can create custom decks later

View file

@ -501,8 +501,6 @@ Tests are co-located with source files (`*.test.ts` / `*.test.tsx`).
- [ ] 1020 passing tests covering critical paths
- [ ] pnpm workspace build pipeline green
---
## 15. Out of Scope (MVP)

View file

@ -56,12 +56,8 @@
"name": "deck_terms_deck_id_decks_id_fk",
"tableFrom": "deck_terms",
"tableTo": "decks",
"columnsFrom": [
"deck_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["deck_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
},
@ -69,12 +65,8 @@
"name": "deck_terms_term_id_terms_id_fk",
"tableFrom": "deck_terms",
"tableTo": "terms",
"columnsFrom": [
"term_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["term_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
}
@ -82,10 +74,7 @@
"compositePrimaryKeys": {
"deck_terms_deck_id_term_id_pk": {
"name": "deck_terms_deck_id_term_id_pk",
"columns": [
"deck_id",
"term_id"
]
"columns": ["deck_id", "term_id"]
}
},
"uniqueConstraints": {},
@ -180,12 +169,8 @@
"name": "decks_language_pair_id_language_pairs_id_fk",
"tableFrom": "decks",
"tableTo": "language_pairs",
"columnsFrom": [
"language_pair_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["language_pair_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
},
@ -193,12 +178,8 @@
"name": "decks_created_by_users_id_fk",
"tableFrom": "decks",
"tableTo": "users",
"columnsFrom": [
"created_by"
],
"columnsTo": [
"id"
],
"columnsFrom": ["created_by"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
}
@ -208,10 +189,7 @@
"unique_deck_name": {
"name": "unique_deck_name",
"nullsNotDistinct": false,
"columns": [
"name",
"created_by"
]
"columns": ["name", "created_by"]
}
},
"policies": {},
@ -297,10 +275,7 @@
"unique_source_target": {
"name": "unique_source_target",
"nullsNotDistinct": false,
"columns": [
"source_language",
"target_language"
]
"columns": ["source_language", "target_language"]
}
},
"policies": {},
@ -379,12 +354,8 @@
"name": "term_glosses_term_id_terms_id_fk",
"tableFrom": "term_glosses",
"tableTo": "terms",
"columnsFrom": [
"term_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["term_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
}
@ -394,11 +365,7 @@
"unique_term_gloss": {
"name": "unique_term_gloss",
"nullsNotDistinct": false,
"columns": [
"term_id",
"language_code",
"text"
]
"columns": ["term_id", "language_code", "text"]
}
},
"policies": {},
@ -459,9 +426,7 @@
"terms_synset_id_unique": {
"name": "terms_synset_id_unique",
"nullsNotDistinct": false,
"columns": [
"synset_id"
]
"columns": ["synset_id"]
}
},
"policies": {},
@ -538,12 +503,8 @@
"name": "translations_term_id_terms_id_fk",
"tableFrom": "translations",
"tableTo": "terms",
"columnsFrom": [
"term_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["term_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
}
@ -553,11 +514,7 @@
"unique_translations": {
"name": "unique_translations",
"nullsNotDistinct": false,
"columns": [
"term_id",
"language_code",
"text"
]
"columns": ["term_id", "language_code", "text"]
}
},
"policies": {},
@ -614,23 +571,17 @@
"users_openauth_sub_unique": {
"name": "users_openauth_sub_unique",
"nullsNotDistinct": false,
"columns": [
"openauth_sub"
]
"columns": ["openauth_sub"]
},
"users_email_unique": {
"name": "users_email_unique",
"nullsNotDistinct": false,
"columns": [
"email"
]
"columns": ["email"]
},
"users_display_name_unique": {
"name": "users_display_name_unique",
"nullsNotDistinct": false,
"columns": [
"display_name"
]
"columns": ["display_name"]
}
},
"policies": {},
@ -644,9 +595,5 @@
"roles": {},
"policies": {},
"views": {},
"_meta": {
"columns": {},
"schemas": {},
"tables": {}
}
"_meta": { "columns": {}, "schemas": {}, "tables": {} }
}

File diff suppressed because it is too large Load diff

View file

@ -5,7 +5,7 @@
"moduleResolution": "NodeNext",
"outDir": "./dist",
"resolveJsonModule": true,
"types": ["vitest/globals"],
"types": ["vitest/globals"]
},
"include": ["src", "vitest.config.ts"],
"include": ["src", "vitest.config.ts"]
}

View file

@ -5,7 +5,7 @@
"moduleResolution": "NodeNext",
"outDir": "./dist",
"resolveJsonModule": true,
"types": ["vitest/globals"],
"types": ["vitest/globals"]
},
"include": ["src", "vitest.config.ts"],
"include": ["src", "vitest.config.ts"]
}

File diff suppressed because it is too large Load diff