From 2b177aad5b1b0a8fa14007605366bdee499860f6 Mon Sep 17 00:00:00 2001
From: lila <beiweitemderbeste@protonmail.com>
Date: Mon, 30 Mar 2026 15:58:01 +0200
Subject: [PATCH] feat(db): add incremental upsert seed script for WordNet
 vocabulary
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implements packages/db/src/seeding-datafiles.ts — reads all JSON files
from scripts/datafiles/, validates filenames against supported language
codes and POS, and upserts synsets into the terms and translations
tables via onConflictDoNothing. Safe to re-run; produces 0 writes on
a duplicate run.
---
 documentation/data-seeding-notes.md           | 337 +++++++++
 documentation/notes.md                        |  16 +-
 documentation/roadmap.md                      |  10 +-
 documentation/spec.md                         |   1 -
 packages/db/drizzle/0000_bitter_turbo.sql     |  84 +++
 packages/db/drizzle/meta/0000_snapshot.json   | 652 ++++++++++++++++++
 packages/db/drizzle/meta/_journal.json        |  14 +-
 packages/db/package.json                      |   5 +-
 packages/db/src/db/schema.ts                  |   1 -
 packages/db/src/seeding-datafiles.ts          | 203 ++++++
 .../{en-it-nouns.json => en-it-noun.json}     |   0
 scripts/datafiles/test.json                   |  36 +
 12 files changed, 1349 insertions(+), 10 deletions(-)
 create mode 100644 documentation/data-seeding-notes.md
 create mode 100644 packages/db/drizzle/0000_bitter_turbo.sql
 create mode 100644 packages/db/drizzle/meta/0000_snapshot.json
 create mode 100644 packages/db/src/seeding-datafiles.ts
 rename scripts/datafiles/{en-it-nouns.json => en-it-noun.json} (100%)
 create mode 100644 scripts/datafiles/test.json

diff --git a/documentation/data-seeding-notes.md b/documentation/data-seeding-notes.md
new file mode 100644
index 0000000..20b359a
--- /dev/null
+++ b/documentation/data-seeding-notes.md
@@ -0,0 +1,337 @@
+# WordNet Seeding Script — Session Summary
+
+## Project Context
+
+A multiplayer English–Italian vocabulary trainer (Glossa) built with a pnpm monorepo. Vocabulary data comes from Open Multilingual Wordnet (OMW) and is extracted into JSON files, then seeded into a PostgreSQL database via Drizzle ORM.
+
+---
+
+## 1. JSON Extraction Format
+
+Each synset extracted from WordNet is represented as:
+
+```json
+{
+  "synset_id": "ili:i35545",
+  "pos": "noun",
+  "translations": {
+    "en": ["entity"],
+    "it": ["cosa", "entità"]
+  }
+}
+```
+
+**Fields:**
+- `synset_id` — OMW Interlingual Index ID, maps to `terms.synset_id` in the DB
+- `pos` — part of speech, matches the CHECK constraint on `terms.pos`
+- `translations` — object of language code → array of lemmas (synonyms within a synset)
+
+**Glosses** are not extracted — the `term_glosses` table exists in the schema for future use but is not needed for the MVP quiz mechanic.
+
+---
+
+## 2. Database Schema (relevant tables)
+
+```
+terms
+  id          uuid PK
+  synset_id   text UNIQUE
+  pos         varchar(20)
+  created_at  timestamptz
+
+translations
+  id            uuid PK
+  term_id       uuid FK → terms.id (CASCADE)
+  language_code varchar(10)
+  text          text
+  created_at    timestamptz
+  UNIQUE (term_id, language_code, text)
+```
+
+---
+
+## 3. Seeding Script — v1 (batch, truncate-based)
+
+### Approach
+- Read a single JSON file
+- Batch inserts into `terms` and `translations` in groups of 500
+- Truncate tables before each run for a clean slate
+
+### Key decisions made during development
+
+| Issue | Resolution |
+|-------|-----------|
+| `JSON.parse` returns `any` | Added `Array.isArray` check before casting |
+| `forEach` doesn't await | Switched to `for...of` |
+| Empty array types | Used Drizzle's `$inferInsert` types |
+| `translations` naming conflict | Renamed local variable to `translationRows` |
+| Final batch not flushed | Added `if (termsArray.length > 0)` guard after loop |
+| Exact batch size check `=== 500` | Changed to `>= 500` |
+
+### Final script structure
+
+```ts
+import fs from "node:fs/promises";
+import { SUPPORTED_LANGUAGE_CODES, SUPPORTED_POS } from "@glossa/shared";
+import { db } from "@glossa/db";
+import { terms, translations } from "@glossa/db/schema";
+
+type POS = (typeof SUPPORTED_POS)[number];
+type LANGUAGE_CODE = (typeof SUPPORTED_LANGUAGE_CODES)[number];
+type TermInsert = typeof terms.$inferInsert;
+type TranslationInsert = typeof translations.$inferInsert;
+type Synset = {
+  synset_id: string;
+  pos: POS;
+  translations: Record<LANGUAGE_CODE, string[]>;
+};
+
+const dataDir = "../../scripts/datafiles/";
+
+const readFromJsonFile = async (filepath: string): Promise<Synset[]> => {
+  const data = await fs.readFile(filepath, "utf8");
+  const parsed = JSON.parse(data);
+  if (!Array.isArray(parsed)) throw new Error("Expected a JSON array");
+  return parsed as Synset[];
+};
+
+const uploadToDB = async (
+  termsData: TermInsert[],
+  translationsData: TranslationInsert[],
+) => {
+  await db.insert(terms).values(termsData);
+  await db.insert(translations).values(translationsData);
+};
+
+const main = async () => {
+  console.log("Reading JSON file...");
+  const allSynsets = await readFromJsonFile(dataDir + "en-it-nouns.json");
+  console.log(`Loaded ${allSynsets.length} synsets`);
+
+  const termsArray: TermInsert[] = [];
+  const translationsArray: TranslationInsert[] = [];
+  let batchCount = 0;
+
+  for (const synset of allSynsets) {
+    const term = {
+      id: crypto.randomUUID(),
+      synset_id: synset.synset_id,
+      pos: synset.pos,
+    };
+
+    const translationRows = Object.entries(synset.translations).flatMap(
+      ([lang, lemmas]) =>
+        lemmas.map((lemma) => ({
+          id: crypto.randomUUID(),
+          term_id: term.id,
+          language_code: lang as LANGUAGE_CODE,
+          text: lemma,
+        })),
+    );
+
+    translationsArray.push(...translationRows);
+    termsArray.push(term);
+
+    if (termsArray.length >= 500) {
+      batchCount++;
+      console.log(`Uploading batch ${batchCount} (${batchCount * 500}/${allSynsets.length} synsets)...`);
+      await uploadToDB(termsArray, translationsArray);
+      termsArray.length = 0;
+      translationsArray.length = 0;
+    }
+  }
+
+  if (termsArray.length > 0) {
+    batchCount++;
+    console.log(`Uploading final batch (${allSynsets.length}/${allSynsets.length} synsets)...`);
+    await uploadToDB(termsArray, translationsArray);
+  }
+
+  console.log(`Seeding complete — ${allSynsets.length} synsets inserted`);
+};
+
+main().catch((error) => {
+  console.error(error);
+  process.exit(1);
+});
+```
+
+---
+
+## 4. Pitfalls Encountered
+
+### Duplicate key on re-run
+Running the script twice causes `duplicate key value violates unique constraint "terms_synset_id_unique"`. Fix: truncate before seeding.
+
+```bash
+docker exec -it glossa-database psql -U glossa -d glossa -c "TRUNCATE translations, terms CASCADE;"
+```
+
+### `onConflictDoNothing` breaks FK references
+When `onConflictDoNothing` skips a `terms` insert, the in-memory UUID is never written to the DB. Subsequent `translations` inserts reference that non-existent UUID, causing a FK violation. This is why the truncate approach is correct for batch seeding.
+
+### DATABASE_URL misconfigured
+Correct format:
+```
+DATABASE_URL=postgresql://glossa:glossa@localhost:5432/glossa
+```
+
+### Tables not found after `docker compose up`
+Migrations must be applied first: `npx drizzle-kit migrate`
+
+---
+
+## 5. Running the Script
+
+```bash
+# Start the DB container
+docker compose up -d postgres
+
+# Apply migrations
+npx drizzle-kit migrate
+
+# Truncate existing data (if re-seeding)
+docker exec -it glossa-database psql -U glossa -d glossa -c "TRUNCATE translations, terms CASCADE;"
+
+# Run the seed script
+npx tsx src/seed-en-it-nouns.ts
+
+# Verify
+docker exec -it glossa-database psql -U glossa -d glossa -c "SELECT COUNT(*) FROM terms; SELECT COUNT(*) FROM translations;"
+```
+
+---
+
+## 6. Seeding Script — v2 (incremental upsert, multi-file)
+
+### Motivation
+The truncate approach is fine for dev but unsuitable for production — it wipes all data. The v2 approach extends the database incrementally without ever truncating.
+
+### File naming convention
+One JSON file per language pair per POS. The POS segment must be an exact `SUPPORTED_POS` value (singular, e.g. `noun`), otherwise filename validation rejects the file:
+```
+scripts/datafiles/
+  en-it-noun.json
+  en-fr-noun.json
+  en-it-verb.json
+  de-it-noun.json
+  ...
+```
+
+### How incremental upsert works
+For a concept like "dog" already in the DB with English and Italian:
+1. Import `en-fr-noun.json`
+2. Upsert `terms` by `synset_id` — finds existing row, returns its real ID
+3. `dog (en)` already exists → skipped by `onConflictDoNothing`
+4. `chien (fr)` is new → inserted
+
+The concept is **extended**, not replaced.
+
+### Tradeoff vs batch approach
+Batching is no longer possible since you need the real `term.id` from the DB before inserting translations. Each synset is processed individually. For 25k rows this is still fast enough.
+
+### Key types added
+
+```ts
+type Synset = {
+  synset_id: string;
+  pos: POS;
+  translations: Partial<Record<LANGUAGE_CODE, string[]>>; // Partial — file only contains subset of languages
+};
+
+type FileName = {
+  sourceLang: LANGUAGE_CODE;
+  targetLang: LANGUAGE_CODE;
+  pos: POS;
+};
+```
+
+### Filename validation
+
+```ts
+const parseFilename = (filename: string): FileName => {
+  const parts = filename.replace(".json", "").split("-");
+  if (parts.length !== 3)
+    throw new Error(`Invalid filename format: ${filename}. Expected: sourcelang-targetlang-pos.json`);
+  const [sourceLang, targetLang, pos] = parts;
+  if (!SUPPORTED_LANGUAGE_CODES.includes(sourceLang as LANGUAGE_CODE))
+    throw new Error(`Unsupported language code: ${sourceLang}`);
+  if (!SUPPORTED_LANGUAGE_CODES.includes(targetLang as LANGUAGE_CODE))
+    throw new Error(`Unsupported language code: ${targetLang}`);
+  if (!SUPPORTED_POS.includes(pos as POS))
+    throw new Error(`Unsupported POS: ${pos}`);
+  return {
+    sourceLang: sourceLang as LANGUAGE_CODE,
+    targetLang: targetLang as LANGUAGE_CODE,
+    pos: pos as POS,
+  };
+};
+```
+
+### Upsert function (WIP)
+
+```ts
+const upsertSynset = async (
+  synset: Synset,
+  fileInfo: FileName,
+): Promise<{ termInserted: boolean; translationsInserted: number }> => {
+  const [upsertedTerm] = await db
+    .insert(terms)
+    .values({ synset_id: synset.synset_id, pos: synset.pos })
+    .onConflictDoUpdate({
+      target: terms.synset_id,
+      set: { pos: synset.pos },
+    })
+    .returning({ id: terms.id, created_at: terms.created_at });
+
+  const termInserted = upsertedTerm.created_at > new Date(Date.now() - 1000);
+
+  const translationRows = Object.entries(synset.translations).flatMap(
+    ([lang, lemmas]) =>
+      lemmas!.map((lemma) => ({
+        id: crypto.randomUUID(),
+        term_id: upsertedTerm.id,
+        language_code: lang as LANGUAGE_CODE,
+        text: lemma,
+      })),
+  );
+
+  const result = await db
+    .insert(translations)
+    .values(translationRows)
+    .onConflictDoNothing()
+    .returning({ id: translations.id });
+
+  return { termInserted, translationsInserted: result.length };
+};
+```
+
+---
+
+## 7. Strategy Comparison
+
+| Strategy | Use case | Pros | Cons |
+|----------|----------|------|------|
+| Truncate + batch | Dev / first-time setup | Fast, simple | Wipes all data |
+| Incremental upsert | Production / adding languages | Safe, non-destructive | No batching, slower |
+| Migrations-as-data | Production audit trail | Clean history | Files accumulate |
+| Diff-based sync | Large production datasets | Minimal writes | Complex to implement |
+
+---
+
+## 8. packages/db — package.json exports fix
+
+The `exports` field must be an object, not an array:
+
+```json
+"exports": {
+  ".": "./src/index.ts",
+  "./schema": "./src/db/schema.ts"
+}
+```
+
+Imports then resolve as:
+```ts
+import { db } from "@glossa/db";
+import { terms, translations } from "@glossa/db/schema";
+```
diff --git a/documentation/notes.md b/documentation/notes.md
index 4025e7a..0015152 100644
--- a/documentation/notes.md
+++ b/documentation/notes.md
@@ -6,7 +6,7 @@
 - add this to drizzle migrartions file:
 ✅ ALTER TABLE terms ADD CHECK (pos IN ('noun', 'verb', 'adjective', etc));
 
-## open word net
+## openwordnet
 
 download libraries via
 
@@ -45,3 +45,17 @@ list all libraries:
 ```bash
 python -c "import wn; print(wn.lexicons())"
 ```
+
+## drizzle
+
+generate migration file, go to packages/db, then:
+
+```bash
+pnpm drizzle-kit generate
+```
+
+execute migration, go to packages/db (docker containers need to be running):
+
+```bash
+DATABASE_URL=postgresql://username:password@localhost:5432/database pnpm drizzle-kit migrate
+```
diff --git a/documentation/roadmap.md b/documentation/roadmap.md
index 4171ac5..c17773c 100644
--- a/documentation/roadmap.md
+++ b/documentation/roadmap.md
@@ -26,17 +26,17 @@ Done when: `GET /api/decks/1/terms?limit=10` returns 10 terms from a specific de
 
 [x] Run `extract-en-it-nouns.py` locally → generates `datafiles/en-it-nouns.json`
     -- Import ALL available OMW noun synsets (no frequency filtering)
-[ ] Write Drizzle schema: `terms`, `translations`, `language_pairs`, `term_glosses`, `decks`, `deck_terms`
-[ ] Write and run migration (includes CHECK constraints for `pos`, `gloss_type`)
-[ ] Write `packages/db/src/seed.ts` (imports ALL terms + translations, NO decks)
-[ ] Write `scripts/build_decks.ts` (reads external CEFR lists, matches to DB, creates decks)
+[x] Write Drizzle schema: `terms`, `translations`, `language_pairs`, `term_glosses`, `decks`, `deck_terms`
+[x] Write and run migration (includes CHECK constraints for `pos`, `gloss_type`)
+[x] Write `packages/db/src/seed.ts` (imports ALL terms + translations, NO decks)
 [ ] Download CEFR A1/A2 noun lists (from GitHub repos)
+[ ] Write `scripts/build_decks.ts` (reads external CEFR lists, matches to DB, creates decks)
 [ ] Run `pnpm db:seed` → populates terms
 [ ] Run `pnpm db:build-decks` → creates curated decks
+[ ] Define Zod response schemas in `packages/shared`
 [ ] Implement `DeckRepository.getTerms(deckId, limit, offset)`
 [ ] Implement `QuizService.attachDistractors(terms)` — same POS, server-side, no duplicates
 [ ] Implement `GET /language-pairs`, `GET /decks`, `GET /decks/:id/terms` endpoints
-[ ] Define Zod response schemas in `packages/shared`
 [ ] Unit tests for `QuizService` (correct POS filtering, never includes the answer)
 [ ] update decisions.md
 
diff --git a/documentation/spec.md b/documentation/spec.md
index fe428d6..b7890c4 100644
--- a/documentation/spec.md
+++ b/documentation/spec.md
@@ -205,7 +205,6 @@ term_glosses
   term_id       uuid FK → terms.id
   language_code varchar(10)     -- NOT NULL
   text          text            -- NOT NULL
-  type          varchar(20)     -- CHECK (type IN ('definition', 'example')), NULLABLE
   created_at    timestamptz DEFAULT now()
 
 language_pairs
diff --git a/packages/db/drizzle/0000_bitter_turbo.sql b/packages/db/drizzle/0000_bitter_turbo.sql
new file mode 100644
index 0000000..ed93e47
--- /dev/null
+++ b/packages/db/drizzle/0000_bitter_turbo.sql
@@ -0,0 +1,84 @@
+CREATE TABLE "deck_terms" (
+	"deck_id" uuid NOT NULL,
+	"term_id" uuid NOT NULL,
+	"position" integer NOT NULL,
+	"added_at" timestamp with time zone DEFAULT now() NOT NULL,
+	CONSTRAINT "deck_terms_deck_id_term_id_pk" PRIMARY KEY("deck_id","term_id")
+);
+--> statement-breakpoint
+CREATE TABLE "decks" (
+	"id" uuid PRIMARY KEY DEFAULT gen_random_uuid() NOT NULL,
+	"name" text NOT NULL,
+	"description" text,
+	"language_pair_id" uuid NOT NULL,
+	"created_by" uuid NOT NULL,
+	"is_public" boolean DEFAULT false NOT NULL,
+	"created_at" timestamp with time zone DEFAULT now() NOT NULL,
+	CONSTRAINT "unique_deck_name" UNIQUE("name","created_by")
+);
+--> statement-breakpoint
+CREATE TABLE "language_pairs" (
+	"id" uuid PRIMARY KEY DEFAULT gen_random_uuid() NOT NULL,
+	"source_language" varchar(10) NOT NULL,
+	"target_language" varchar(10) NOT NULL,
+	"label" text,
+	"active" boolean DEFAULT true NOT NULL,
+	"created_at" timestamp with time zone DEFAULT now() NOT NULL,
+	CONSTRAINT "unique_source_target" UNIQUE("source_language","target_language"),
+	CONSTRAINT "source_language_check" CHECK ("language_pairs"."source_language" IN ('en', 'it')),
+	CONSTRAINT "target_language_check" CHECK ("language_pairs"."target_language" IN ('en', 'it')),
+	CONSTRAINT "no_self_pair" CHECK ("language_pairs"."source_language" != "language_pairs"."target_language")
+);
+--> statement-breakpoint
+CREATE TABLE "term_glosses" (
+	"id" uuid PRIMARY KEY DEFAULT gen_random_uuid() NOT NULL,
+	"term_id" uuid NOT NULL,
+	"language_code" varchar(10) NOT NULL,
+	"text" text NOT NULL,
+	"created_at" timestamp with time zone DEFAULT now() NOT NULL,
+	CONSTRAINT "unique_term_gloss" UNIQUE("term_id","language_code","text")
+);
+--> statement-breakpoint
+CREATE TABLE "terms" (
+	"id" uuid PRIMARY KEY DEFAULT gen_random_uuid() NOT NULL,
+	"synset_id" text NOT NULL,
+	"pos" varchar(20) NOT NULL,
+	"created_at" timestamp with time zone DEFAULT now() NOT NULL,
+	CONSTRAINT "terms_synset_id_unique" UNIQUE("synset_id"),
+	CONSTRAINT "pos_check" CHECK ("terms"."pos" IN ('noun'))
+);
+--> statement-breakpoint
+CREATE TABLE "translations" (
+	"id" uuid PRIMARY KEY DEFAULT gen_random_uuid() NOT NULL,
+	"term_id" uuid NOT NULL,
+	"language_code" varchar(10) NOT NULL,
+	"text" text NOT NULL,
+	"created_at" timestamp with time zone DEFAULT now() NOT NULL,
+	CONSTRAINT "unique_translations" UNIQUE("term_id","language_code","text")
+);
+--> statement-breakpoint
+CREATE TABLE "users" (
+	"id" uuid PRIMARY KEY DEFAULT gen_random_uuid() NOT NULL,
+	"openauth_sub" text NOT NULL,
+	"email" varchar(255),
+	"display_name" varchar(100),
+	"created_at" timestamp with time zone DEFAULT now() NOT NULL,
+	"last_login_at" timestamp with time zone,
+	CONSTRAINT "users_openauth_sub_unique" UNIQUE("openauth_sub"),
+	CONSTRAINT "users_email_unique" UNIQUE("email"),
+	CONSTRAINT "users_display_name_unique" UNIQUE("display_name")
+);
+--> statement-breakpoint
+ALTER TABLE "deck_terms" ADD CONSTRAINT "deck_terms_deck_id_decks_id_fk" FOREIGN KEY ("deck_id") REFERENCES "public"."decks"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
+ALTER TABLE "deck_terms" ADD CONSTRAINT "deck_terms_term_id_terms_id_fk" FOREIGN KEY ("term_id") REFERENCES "public"."terms"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
+ALTER TABLE "decks" ADD CONSTRAINT "decks_language_pair_id_language_pairs_id_fk" FOREIGN KEY ("language_pair_id") REFERENCES "public"."language_pairs"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
+ALTER TABLE "decks" ADD CONSTRAINT "decks_created_by_users_id_fk" FOREIGN KEY ("created_by") REFERENCES "public"."users"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
+ALTER TABLE "term_glosses" ADD CONSTRAINT "term_glosses_term_id_terms_id_fk" FOREIGN KEY ("term_id") REFERENCES "public"."terms"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
+ALTER TABLE "translations" ADD CONSTRAINT "translations_term_id_terms_id_fk" FOREIGN KEY ("term_id") REFERENCES "public"."terms"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
+CREATE INDEX "idx_deck_terms_term" ON "deck_terms" USING btree ("term_id");--> statement-breakpoint
+CREATE INDEX "idx_decks_created_by" ON "decks" USING btree ("created_by");--> statement-breakpoint
+CREATE INDEX "idx_decks_language_pair" ON "decks" USING btree ("language_pair_id");--> statement-breakpoint
+CREATE INDEX "idx_pairs_active" ON "language_pairs" USING btree ("active","source_language","target_language");--> statement-breakpoint
+CREATE INDEX "idx_term_glosses_term" ON "term_glosses" USING btree ("term_id");--> statement-breakpoint
+CREATE INDEX "idx_terms_pos" ON "terms" USING btree ("pos");--> statement-breakpoint
+CREATE INDEX "idx_translations_lang" ON "translations" USING btree ("language_code","term_id");
diff --git a/packages/db/drizzle/meta/0000_snapshot.json b/packages/db/drizzle/meta/0000_snapshot.json
new file mode 100644
index 0000000..3b2b99e
--- /dev/null
+++ b/packages/db/drizzle/meta/0000_snapshot.json
@@ -0,0 +1,652 @@
+{
+  "id": "5830ce3b-dc0e-44a7-83d6-bc74016ca4fd",
+  "prevId": "00000000-0000-0000-0000-000000000000",
+  "version": "7",
+  "dialect": "postgresql",
+  "tables": {
+    "public.deck_terms": {
+      "name": "deck_terms",
+      "schema": "",
+      "columns": {
+        "deck_id": {
+          "name": "deck_id",
+          "type": "uuid",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "term_id": {
+          "name": "term_id",
+          "type": "uuid",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "position": {
+          "name": "position",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "added_at": {
+          "name": "added_at",
+          "type": "timestamp with time zone",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        }
+      },
+      "indexes": {
+        "idx_deck_terms_term": {
+          "name": "idx_deck_terms_term",
+          "columns": [
+            {
+              "expression": "term_id",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last"
+            }
+          ],
+          "isUnique": false,
+          "concurrently": false,
+          "method": "btree",
+          "with": {}
+        }
+      },
+      "foreignKeys": {
+        "deck_terms_deck_id_decks_id_fk": {
+          "name": "deck_terms_deck_id_decks_id_fk",
+          "tableFrom": "deck_terms",
+          "tableTo": "decks",
+          "columnsFrom": [
+            "deck_id"
+          ],
+          "columnsTo": [
+            "id"
+          ],
+          "onDelete": "cascade",
+          "onUpdate": "no action"
+        },
+        "deck_terms_term_id_terms_id_fk": {
+          "name": "deck_terms_term_id_terms_id_fk",
+          "tableFrom": "deck_terms",
+          "tableTo": "terms",
+          "columnsFrom": [
+            "term_id"
+          ],
+          "columnsTo": [
+            "id"
+          ],
+          "onDelete": "cascade",
+          "onUpdate": "no action"
+        }
+      },
+      "compositePrimaryKeys": {
+        "deck_terms_deck_id_term_id_pk": {
+          "name": "deck_terms_deck_id_term_id_pk",
+          "columns": [
+            "deck_id",
+            "term_id"
+          ]
+        }
+      },
+      "uniqueConstraints": {},
+      "policies": {},
+      "checkConstraints": {},
+      "isRLSEnabled": false
+    },
+    "public.decks": {
+      "name": "decks",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "uuid",
+          "primaryKey": true,
+          "notNull": true,
+          "default": "gen_random_uuid()"
+        },
+        "name": {
+          "name": "name",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "description": {
+          "name": "description",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "language_pair_id": {
+          "name": "language_pair_id",
+          "type": "uuid",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "created_by": {
+          "name": "created_by",
+          "type": "uuid",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "is_public": {
+          "name": "is_public",
+          "type": "boolean",
+          "primaryKey": false,
+          "notNull": true,
+          "default": false
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp with time zone",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        }
+      },
+      "indexes": {
+        "idx_decks_created_by": {
+          "name": "idx_decks_created_by",
+          "columns": [
+            {
+              "expression": "created_by",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last"
+            }
+          ],
+          "isUnique": false,
+          "concurrently": false,
+          "method": "btree",
+          "with": {}
+        },
+        "idx_decks_language_pair": {
+          "name": "idx_decks_language_pair",
+          "columns": [
+            {
+              "expression": "language_pair_id",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last"
+            }
+          ],
+          "isUnique": false,
+          "concurrently": false,
+          "method": "btree",
+          "with": {}
+        }
+      },
+      "foreignKeys": {
+        "decks_language_pair_id_language_pairs_id_fk": {
+          "name": "decks_language_pair_id_language_pairs_id_fk",
+          "tableFrom": "decks",
+          "tableTo": "language_pairs",
+          "columnsFrom": [
+            "language_pair_id"
+          ],
+          "columnsTo": [
+            "id"
+          ],
+          "onDelete": "cascade",
+          "onUpdate": "no action"
+        },
+        "decks_created_by_users_id_fk": {
+          "name": "decks_created_by_users_id_fk",
+          "tableFrom": "decks",
+          "tableTo": "users",
+          "columnsFrom": [
+            "created_by"
+          ],
+          "columnsTo": [
+            "id"
+          ],
+          "onDelete": "cascade",
+          "onUpdate": "no action"
+        }
+      },
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {
+        "unique_deck_name": {
+          "name": "unique_deck_name",
+          "nullsNotDistinct": false,
+          "columns": [
+            "name",
+            "created_by"
+          ]
+        }
+      },
+      "policies": {},
+      "checkConstraints": {},
+      "isRLSEnabled": false
+    },
+    "public.language_pairs": {
+      "name": "language_pairs",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "uuid",
+          "primaryKey": true,
+          "notNull": true,
+          "default": "gen_random_uuid()"
+        },
+        "source_language": {
+          "name": "source_language",
+          "type": "varchar(10)",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "target_language": {
+          "name": "target_language",
+          "type": "varchar(10)",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "label": {
+          "name": "label",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "active": {
+          "name": "active",
+          "type": "boolean",
+          "primaryKey": false,
+          "notNull": true,
+          "default": true
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp with time zone",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        }
+      },
+      "indexes": {
+        "idx_pairs_active": {
+          "name": "idx_pairs_active",
+          "columns": [
+            {
+              "expression": "active",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last"
+            },
+            {
+              "expression": "source_language",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last"
+            },
+            {
+              "expression": "target_language",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last"
+            }
+          ],
+          "isUnique": false,
+          "concurrently": false,
+          "method": "btree",
+          "with": {}
+        }
+      },
+      "foreignKeys": {},
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {
+        "unique_source_target": {
+          "name": "unique_source_target",
+          "nullsNotDistinct": false,
+          "columns": [
+            "source_language",
+            "target_language"
+          ]
+        }
+      },
+      "policies": {},
+      "checkConstraints": {
+        "source_language_check": {
+          "name": "source_language_check",
+          "value": "\"language_pairs\".\"source_language\" IN ('en', 'it')"
+        },
+        "target_language_check": {
+          "name": "target_language_check",
+          "value": "\"language_pairs\".\"target_language\" IN ('en', 'it')"
+        },
+        "no_self_pair": {
+          "name": "no_self_pair",
+          "value": "\"language_pairs\".\"source_language\" != \"language_pairs\".\"target_language\""
+        }
+      },
+      "isRLSEnabled": false
+    },
+    "public.term_glosses": {
+      "name": "term_glosses",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "uuid",
+          "primaryKey": true,
+          "notNull": true,
+          "default": "gen_random_uuid()"
+        },
+        "term_id": {
+          "name": "term_id",
+          "type": "uuid",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "language_code": {
+          "name": "language_code",
+          "type": "varchar(10)",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "text": {
+          "name": "text",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp with time zone",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        }
+      },
+      "indexes": {
+        "idx_term_glosses_term": {
+          "name": "idx_term_glosses_term",
+          "columns": [
+            {
+              "expression": "term_id",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last"
+            }
+          ],
+          "isUnique": false,
+          "concurrently": false,
+          "method": "btree",
+          "with": {}
+        }
+      },
+      "foreignKeys": {
+        "term_glosses_term_id_terms_id_fk": {
+          "name": "term_glosses_term_id_terms_id_fk",
+          "tableFrom": "term_glosses",
+          "tableTo": "terms",
+          "columnsFrom": [
+            "term_id"
+          ],
+          "columnsTo": [
+            "id"
+          ],
+          "onDelete": "cascade",
+          "onUpdate": "no action"
+        }
+      },
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {
+        "unique_term_gloss": {
+          "name": "unique_term_gloss",
+          "nullsNotDistinct": false,
+          "columns": [
+            "term_id",
+            "language_code",
+            "text"
+          ]
+        }
+      },
+      "policies": {},
+      "checkConstraints": {},
+      "isRLSEnabled": false
+    },
+    "public.terms": {
+      "name": "terms",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "uuid",
+          "primaryKey": true,
+          "notNull": true,
+          "default": "gen_random_uuid()"
+        },
+        "synset_id": {
+          "name": "synset_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "pos": {
+          "name": "pos",
+          "type": "varchar(20)",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp with time zone",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        }
+      },
+      "indexes": {
+        "idx_terms_pos": {
+          "name": "idx_terms_pos",
+          "columns": [
+            {
+              "expression": "pos",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last"
+            }
+          ],
+          "isUnique": false,
+          "concurrently": false,
+          "method": "btree",
+          "with": {}
+        }
+      },
+      "foreignKeys": {},
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {
+        "terms_synset_id_unique": {
+          "name": "terms_synset_id_unique",
+          "nullsNotDistinct": false,
+          "columns": [
+            "synset_id"
+          ]
+        }
+      },
+      "policies": {},
+      "checkConstraints": {
+        "pos_check": {
+          "name": "pos_check",
+          "value": "\"terms\".\"pos\" IN ('noun')"
+        }
+      },
+      "isRLSEnabled": false
+    },
+    "public.translations": {
+      "name": "translations",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "uuid",
+          "primaryKey": true,
+          "notNull": true,
+          "default": "gen_random_uuid()"
+        },
+        "term_id": {
+          "name": "term_id",
+          "type": "uuid",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "language_code": {
+          "name": "language_code",
+          "type": "varchar(10)",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "text": {
+          "name": "text",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp with time zone",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        }
+      },
+      "indexes": {
+        "idx_translations_lang": {
+          "name": "idx_translations_lang",
+          "columns": [
+            {
+              "expression": "language_code",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last"
+            },
+            {
+              "expression": "term_id",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last"
+            }
+          ],
+          "isUnique": false,
+          "concurrently": false,
+          "method": "btree",
+          "with": {}
+        }
+      },
+      "foreignKeys": {
+        "translations_term_id_terms_id_fk": {
+          "name": "translations_term_id_terms_id_fk",
+          "tableFrom": "translations",
+          "tableTo": "terms",
+          "columnsFrom": [
+            "term_id"
+          ],
+          "columnsTo": [
+            "id"
+          ],
+          "onDelete": "cascade",
+          "onUpdate": "no action"
+        }
+      },
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {
+        "unique_translations": {
+          "name": "unique_translations",
+          "nullsNotDistinct": false,
+          "columns": [
+            "term_id",
+            "language_code",
+            "text"
+          ]
+        }
+      },
+      "policies": {},
+      "checkConstraints": {},
+      "isRLSEnabled": false
+    },
+    "public.users": {
+      "name": "users",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "uuid",
+          "primaryKey": true,
+          "notNull": true,
+          "default": "gen_random_uuid()"
+        },
+        "openauth_sub": {
+          "name": "openauth_sub",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "email": {
+          "name": "email",
+          "type": "varchar(255)",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "display_name": {
+          "name": "display_name",
+          "type": "varchar(100)",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp with time zone",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        },
+        "last_login_at": {
+          "name": "last_login_at",
+          "type": "timestamp with time zone",
+          "primaryKey": false,
+          "notNull": false
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {},
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {
+        "users_openauth_sub_unique": {
+          "name": "users_openauth_sub_unique",
+          "nullsNotDistinct": false,
+          "columns": [
+            "openauth_sub"
+          ]
+        },
+        "users_email_unique": {
+          "name": "users_email_unique",
+          "nullsNotDistinct": false,
+          "columns": [
+            "email"
+          ]
+        },
+        "users_display_name_unique": {
+          "name": "users_display_name_unique",
+          "nullsNotDistinct": false,
+          "columns": [
+            "display_name"
+          ]
+        }
+      },
+      "policies": {},
+      "checkConstraints": {},
+      "isRLSEnabled": false
+    }
+  },
+  "enums": {},
+  "schemas": {},
+  "sequences": {},
+  "roles": {},
+  "policies": {},
+  "views": {},
+  "_meta": {
+    "columns": {},
+    "schemas": {},
+    "tables": {}
+  }
+}
\ No newline at end of file
diff --git a/packages/db/drizzle/meta/_journal.json b/packages/db/drizzle/meta/_journal.json
index f04877e..b1ee666 100644
--- a/packages/db/drizzle/meta/_journal.json
+++ b/packages/db/drizzle/meta/_journal.json
@@ -1 +1,13 @@
-{"version":"7","dialect":"postgresql","entries":[]}
\ No newline at end of file
+{
+  "version": "7",
+  "dialect": "postgresql",
+  "entries": [
+    {
+      "idx": 0,
+      "version": "7",
+      "when": 1774721919883,
+      "tag": "0000_bitter_turbo",
+      "breakpoints": true
+    }
+  ]
+}
\ No newline at end of file
diff --git a/packages/db/package.json b/packages/db/package.json
index 45cad40..b300763 100644
--- a/packages/db/package.json
+++ b/packages/db/package.json
@@ -16,5 +16,8 @@
     "@types/pg": "^8.20.0",
     "drizzle-kit": "^0.31.10"
   },
-  "exports": "./src/index.ts"
+  "exports": {
+    ".": "./src/index.ts",
+    "./schema": "./src/db/schema.ts"
+  }
 }
diff --git a/packages/db/src/db/schema.ts b/packages/db/src/db/schema.ts
index 6ad36d6..05848e5 100644
--- a/packages/db/src/db/schema.ts
+++ b/packages/db/src/db/schema.ts
@@ -74,7 +74,6 @@ export const term_glosses = pgTable(
       table.language_code,
       table.text,
     ),
-    ,
     index("idx_term_glosses_term").on(table.term_id),
   ],
 );
diff --git a/packages/db/src/seeding-datafiles.ts b/packages/db/src/seeding-datafiles.ts
new file mode 100644
index 0000000..d44d992
--- /dev/null
+++ b/packages/db/src/seeding-datafiles.ts
@@ -0,0 +1,203 @@
+import fs from "node:fs/promises";
+import { eq } from "drizzle-orm";
+
+import { SUPPORTED_LANGUAGE_CODES, SUPPORTED_POS } from "@glossa/shared";
+import { db } from "@glossa/db";
+import { terms, translations } from "@glossa/db/schema";
+
+// Unions of the element types of the imported const arrays.
+type POS = (typeof SUPPORTED_POS)[number];
+type LANGUAGE_CODE = (typeof SUPPORTED_LANGUAGE_CODES)[number];
+
+type Synset = { // shape each data-file entry is cast to (entries are not validated at runtime)
+  synset_id: string; // written to terms.synset_id
+  pos: POS;
+  translations: Partial<Record<LANGUAGE_CODE, string[]>>; // lemma lists keyed by language code
+};
+
+type FileName = { // result of parsing a "sourcelang-targetlang-pos.json" filename
+  sourceLang: LANGUAGE_CODE;
+  targetLang: LANGUAGE_CODE;
+  pos: POS;
+};
+
+const dataDir = "../../scripts/datafiles/"; // relative path: resolved against the process's working directory
+
+/** Parses "sourcelang-targetlang-pos.json"; throws if the shape or any part is unsupported. */
+const parseFilename = (filename: string): FileName => {
+  const parts = filename.replace(/\.json$/, "").split("-"); // strip only a trailing extension
+  if (parts.length !== 3)
+    throw new Error(
+      `Invalid filename format: ${filename}. Expected: sourcelang-targetlang-pos.json`,
+    );
+  const [sourceLang, targetLang, pos] = parts;
+  for (const lang of [sourceLang, targetLang])
+    if (!SUPPORTED_LANGUAGE_CODES.includes(lang as LANGUAGE_CODE))
+      throw new Error(`Unsupported language code: ${lang}`);
+  if (!SUPPORTED_POS.includes(pos as POS))
+    throw new Error(`Unsupported POS: ${pos}`);
+  return {
+    sourceLang: sourceLang as LANGUAGE_CODE,
+    targetLang: targetLang as LANGUAGE_CODE,
+    pos: pos as POS,
+  };
+};
+
+/** Reads a UTF-8 JSON file; throws unless it parses to an array (entries are cast, not validated). */
+const readFromJsonFile = async (filepath: string): Promise<Synset[]> => {
+  const payload: unknown = JSON.parse(await fs.readFile(filepath, "utf8"));
+  if (Array.isArray(payload)) return payload as Synset[];
+  throw new Error("Expected a JSON array");
+};
+
+/**
+ * Inserts one synset's term and translations; rows that conflict with existing ones are skipped.
+ * @param synset - supplies the term's synset_id/pos and the per-language lemma lists
+ * @param _fileInfo - parsed filename info; currently unused
+ * @returns whether a term row was written and how many translation rows were written
+ * @throws Error when the term is neither inserted nor found by synset_id afterwards
+ */
+const uploadSynsetToDB = async (
+  synset: Synset,
+  _fileInfo: FileName,
+): Promise<{ termInserted: boolean; translationsInserted: number }> => {
+  // 1. Try to insert the term — nothing is returned when a conflict skipped it
+  const [created] = await db
+    .insert(terms)
+    .values({ synset_id: synset.synset_id, pos: synset.pos })
+    .onConflictDoNothing()
+    .returning({ id: terms.id });
+  const termInserted = created !== undefined;
+  let termId: string;
+  if (created) {
+    termId = created.id;
+  } else {
+    // Term already exists — fetch its real DB id for the FK
+    const [existing] = await db
+      .select({ id: terms.id })
+      .from(terms)
+      .where(eq(terms.synset_id, synset.synset_id))
+      .limit(1);
+    if (!existing)
+      throw new Error(`Term not found after conflict: ${synset.synset_id}`);
+    termId = existing.id;
+  }
+
+  // 2. Build rows; id is left to the column default, and a missing/null lemma list adds no rows
+  const translationRows = Object.entries(synset.translations).flatMap(
+    ([lang, lemmas]) =>
+      (lemmas ?? []).map((lemma) => ({
+        term_id: termId,
+        language_code: lang as LANGUAGE_CODE,
+        text: lemma,
+      })),
+  );
+  if (translationRows.length === 0) {
+    return { termInserted, translationsInserted: 0 };
+  }
+
+  const result = await db
+    .insert(translations)
+    .values(translationRows)
+    .onConflictDoNothing()
+    .returning({ id: translations.id });
+  return { termInserted, translationsInserted: result.length };
+};
+
+/**
+ * Seeds the database from the data files found in `dataDir`.
+ *
+ * Runs four logged steps: list the `.json` files, validate their names (files
+ * with invalid names are skipped with a warning), upload every synset of each
+ * valid file one at a time, then print totals. Returns early, without
+ * throwing, when no JSON file or no validly named file is found.
+ */
+const main = async () => {
+  // Prints the two spacer lines and the framed title that open each step.
+  const announceStep = (title: string): void => {
+    console.log("\n");
+    console.log("\n");
+    console.log("##########################################");
+    console.log(title);
+    console.log("##########################################");
+  };
+
+  // step 1: discovering files
+  announceStep("step 1: discovering files");
+  console.log("🔍 Scanning datafiles directory...");
+  const jsonFiles = (await fs.readdir(dataDir)).filter((entry) =>
+    entry.endsWith(".json"),
+  );
+
+  if (jsonFiles.length === 0) {
+    console.warn("⚠️  No JSON files found in", dataDir);
+    return;
+  }
+  console.log(`📁 Found ${jsonFiles.length} file(s)\n`);
+
+  // step 2: validating filenames
+  announceStep("step 2: validating filenames");
+  const validFiles: { filename: string; fileInfo: FileName }[] = [];
+  for (const filename of jsonFiles) {
+    try {
+      const fileInfo = parseFilename(filename);
+      validFiles.push({ filename, fileInfo });
+      const { sourceLang, targetLang, pos } = fileInfo;
+      console.log(`  ✅ ${filename} — ${sourceLang} → ${targetLang} (${pos})`);
+    } catch (e) {
+      // A bad name only disqualifies this file; the remaining files are still checked.
+      console.warn(`  ⚠️  Skipping ${filename}: ${(e as Error).message}`);
+    }
+  }
+
+  if (validFiles.length === 0) {
+    console.error("❌ No valid files to process. Exiting.");
+    return;
+  }
+
+  // step 3: processing each file
+  announceStep("step 3: processing each file");
+  // Running totals across all files, reported in step 4.
+  const totals = { terms: 0, translations: 0 };
+  let position = 0;
+  for (const { filename, fileInfo } of validFiles) {
+    position += 1;
+    const prefix = `[${position}/${validFiles.length}]`;
+    console.log(`\n${prefix} 📄 ${filename}`);
+
+    // No try/catch here: a failure while reading or parsing rejects main itself.
+    const synsets = await readFromJsonFile(dataDir + filename);
+    console.log(`${prefix} Loaded ${synsets.length} synsets`);
+
+    // Counters for this file only.
+    const perFile = { terms: 0, translations: 0 };
+    for (const [done, synset] of synsets.entries()) {
+      // Progress line after every 500 synsets, printed before the next upload.
+      const atCheckpoint = done > 0 && done % 500 === 0;
+      if (atCheckpoint) {
+        console.log(`${prefix} ⏳ ${done}/${synsets.length} synsets processed...`);
+      }
+
+      const outcome = await uploadSynsetToDB(synset, fileInfo);
+      perFile.terms += outcome.termInserted ? 1 : 0;
+      perFile.translations += outcome.translationsInserted;
+    }
+
+    console.log(
+      `${prefix} ✅ Done — ${perFile.terms} new terms, ${perFile.translations} new translations`,
+    );
+    totals.terms += perFile.terms;
+    totals.translations += perFile.translations;
+  }
+
+  // step 4: final summary
+  announceStep("step 4: final summary");
+  console.log(`\n🎉 Seeding complete!`);
+  console.log(`   Terms inserted:        ${totals.terms}`);
+  console.log(`   Translations inserted: ${totals.translations}`);
+};
+
+main().catch((error) => { // runs when the promise returned by main() rejects
+  console.error(error);
+  process.exit(1); // non-zero exit code marks the run as failed
+});
diff --git a/scripts/datafiles/en-it-nouns.json b/scripts/datafiles/en-it-noun.json
similarity index 100%
rename from scripts/datafiles/en-it-nouns.json
rename to scripts/datafiles/en-it-noun.json
diff --git a/scripts/datafiles/test.json b/scripts/datafiles/test.json
new file mode 100644
index 0000000..bbb4e1e
--- /dev/null
+++ b/scripts/datafiles/test.json
@@ -0,0 +1,36 @@
+[
+  {
+    "synset_id": "ili:i35545",
+    "pos": "noun",
+    "translations": { "en": ["entity"], "it": ["cosa", "entità"] }
+  },
+  {
+    "synset_id": "ili:i35547",
+    "pos": "noun",
+    "translations": {
+      "en": ["abstraction", "abstract entity"],
+      "it": ["astrazione"]
+    }
+  },
+  {
+    "synset_id": "ili:i35549",
+    "pos": "noun",
+    "translations": {
+      "en": ["object", "physical object"],
+      "it": ["oggetto", "cosa"]
+    }
+  },
+  {
+    "synset_id": "ili:i35550",
+    "pos": "noun",
+    "translations": { "en": ["whole", "unit"], "it": ["insieme", "tutto"] }
+  },
+  {
+    "synset_id": "ili:i35553",
+    "pos": "noun",
+    "translations": {
+      "en": ["organism", "being"],
+      "it": ["essere vivente", "organismo", "organismo vivente"]
+    }
+  }
+]