From 2b177aad5b1b0a8fa14007605366bdee499860f6 Mon Sep 17 00:00:00 2001 From: lila Date: Mon, 30 Mar 2026 15:58:01 +0200 Subject: [PATCH] feat(db): add incremental upsert seed script for WordNet vocabulary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements packages/db/src/seeding-datafiles.ts — reads all JSON files from scripts/datafiles/, validates filenames against supported language codes and POS, and upserts synsets into `terms` and `translations` via onConflictDoNothing. Safe to re-run; produces 0 writes on a duplicate run. --- documentation/data-seeding-notes.md | 337 +++++++++ documentation/notes.md | 16 +- documentation/roadmap.md | 10 +- documentation/spec.md | 1 - packages/db/drizzle/0000_bitter_turbo.sql | 84 +++ packages/db/drizzle/meta/0000_snapshot.json | 652 ++++++++++++++++++ packages/db/drizzle/meta/_journal.json | 14 +- packages/db/package.json | 5 +- packages/db/src/db/schema.ts | 1 - packages/db/src/seeding-datafiles.ts | 203 ++++++ .../{en-it-nouns.json => en-it-noun.json} | 0 scripts/datafiles/test.json | 36 + 12 files changed, 1349 insertions(+), 10 deletions(-) create mode 100644 documentation/data-seeding-notes.md create mode 100644 packages/db/drizzle/0000_bitter_turbo.sql create mode 100644 packages/db/drizzle/meta/0000_snapshot.json create mode 100644 packages/db/src/seeding-datafiles.ts rename scripts/datafiles/{en-it-nouns.json => en-it-noun.json} (100%) create mode 100644 scripts/datafiles/test.json diff --git a/documentation/data-seeding-notes.md b/documentation/data-seeding-notes.md new file mode 100644 index 0000000..20b359a --- /dev/null +++ b/documentation/data-seeding-notes.md @@ -0,0 +1,337 @@ +# WordNet Seeding Script — Session Summary + +## Project Context + +A multiplayer English–Italian vocabulary trainer (Glossa) built with a pnpm monorepo. Vocabulary data comes from Open Multilingual Wordnet (OMW) and is extracted into JSON files, then seeded into a PostgreSQL database via Drizzle ORM. + +--- + +## 1. 
JSON Extraction Format + +Each synset extracted from WordNet is represented as: + +```json +{ + "synset_id": "ili:i35545", + "pos": "noun", + "translations": { + "en": ["entity"], + "it": ["cosa", "entità"] + } +} +``` + +**Fields:** +- `synset_id` — OMW Interlingual Index ID, maps to `terms.synset_id` in the DB +- `pos` — part of speech, matches the CHECK constraint on `terms.pos` +- `translations` — object of language code → array of lemmas (synonyms within a synset) + +**Glosses** are not extracted — the `term_glosses` table exists in the schema for future use but is not needed for the MVP quiz mechanic. + +--- + +## 2. Database Schema (relevant tables) + +``` +terms + id uuid PK + synset_id text UNIQUE + pos varchar(20) + created_at timestamptz + +translations + id uuid PK + term_id uuid FK → terms.id (CASCADE) + language_code varchar(10) + text text + created_at timestamptz + UNIQUE (term_id, language_code, text) +``` + +--- + +## 3. Seeding Script — v1 (batch, truncate-based) + +### Approach +- Read a single JSON file +- Batch inserts into `terms` and `translations` in groups of 500 +- Truncate tables before each run for a clean slate + +### Key decisions made during development + +| Issue | Resolution | +|-------|-----------| +| `JSON.parse` returns `any` | Added `Array.isArray` check before casting | +| `forEach` doesn't await | Switched to `for...of` | +| Empty array types | Used Drizzle's `$inferInsert` types | +| `translations` naming conflict | Renamed local variable to `translationRows` | +| Final batch not flushed | Added `if (termsArray.length > 0)` guard after loop | +| Exact batch size check `=== 500` | Changed to `>= 500` | + +### Final script structure + +```ts +import fs from "node:fs/promises"; +import { SUPPORTED_LANGUAGE_CODES, SUPPORTED_POS } from "@glossa/shared"; +import { db } from "@glossa/db"; +import { terms, translations } from "@glossa/db/schema"; + +type POS = (typeof SUPPORTED_POS)[number]; +type LANGUAGE_CODE = (typeof 
SUPPORTED_LANGUAGE_CODES)[number]; +type TermInsert = typeof terms.$inferInsert; +type TranslationInsert = typeof translations.$inferInsert; +type Synset = { + synset_id: string; + pos: POS; + translations: Record<LANGUAGE_CODE, string[]>; +}; + +const dataDir = "../../scripts/datafiles/"; + +const readFromJsonFile = async (filepath: string): Promise<Synset[]> => { + const data = await fs.readFile(filepath, "utf8"); + const parsed = JSON.parse(data); + if (!Array.isArray(parsed)) throw new Error("Expected a JSON array"); + return parsed as Synset[]; +}; + +const uploadToDB = async ( + termsData: TermInsert[], + translationsData: TranslationInsert[], +) => { + await db.insert(terms).values(termsData); + await db.insert(translations).values(translationsData); +}; + +const main = async () => { + console.log("Reading JSON file..."); + const allSynsets = await readFromJsonFile(dataDir + "en-it-nouns.json"); + console.log(`Loaded ${allSynsets.length} synsets`); + + const termsArray: TermInsert[] = []; + const translationsArray: TranslationInsert[] = []; + let batchCount = 0; + + for (const synset of allSynsets) { + const term = { + id: crypto.randomUUID(), + synset_id: synset.synset_id, + pos: synset.pos, + }; + + const translationRows = Object.entries(synset.translations).flatMap( + ([lang, lemmas]) => + lemmas.map((lemma) => ({ + id: crypto.randomUUID(), + term_id: term.id, + language_code: lang as LANGUAGE_CODE, + text: lemma, + })), + ); + + translationsArray.push(...translationRows); + termsArray.push(term); + + if (termsArray.length >= 500) { + batchCount++; + console.log(`Uploading batch ${batchCount} (${batchCount * 500}/${allSynsets.length} synsets)...`); + await uploadToDB(termsArray, translationsArray); + termsArray.length = 0; + translationsArray.length = 0; + } + } + + if (termsArray.length > 0) { + batchCount++; + console.log(`Uploading final batch (${allSynsets.length}/${allSynsets.length} synsets)...`); + await uploadToDB(termsArray, translationsArray); + } + + console.log(`Seeding 
complete — ${allSynsets.length} synsets inserted`); +}; + +main().catch((error) => { + console.error(error); + process.exit(1); +}); +``` + +--- + +## 4. Pitfalls Encountered + +### Duplicate key on re-run +Running the script twice causes `duplicate key value violates unique constraint "terms_synset_id_unique"`. Fix: truncate before seeding. + +```bash +docker exec -it glossa-database psql -U glossa -d glossa -c "TRUNCATE translations, terms CASCADE;" +``` + +### `onConflictDoNothing` breaks FK references +When `onConflictDoNothing` skips a `terms` insert, the in-memory UUID is never written to the DB. Subsequent `translations` inserts reference that non-existent UUID, causing a FK violation. This is why the truncate approach is correct for batch seeding. + +### DATABASE_URL misconfigured +Correct format: +``` +DATABASE_URL=postgresql://glossa:glossa@localhost:5432/glossa +``` + +### Tables not found after `docker compose up` +Migrations must be applied first: `npx drizzle-kit migrate` + +--- + +## 5. Running the Script + +```bash +# Start the DB container +docker compose up -d postgres + +# Apply migrations +npx drizzle-kit migrate + +# Truncate existing data (if re-seeding) +docker exec -it glossa-database psql -U glossa -d glossa -c "TRUNCATE translations, terms CASCADE;" + +# Run the seed script +npx tsx src/seed-en-it-nouns.ts + +# Verify +docker exec -it glossa-database psql -U glossa -d glossa -c "SELECT COUNT(*) FROM terms; SELECT COUNT(*) FROM translations;" +``` + +--- + +## 6. Seeding Script — v2 (incremental upsert, multi-file) + +### Motivation +The truncate approach is fine for dev but unsuitable for production — it wipes all data. The v2 approach extends the database incrementally without ever truncating. + +### File naming convention +One JSON file per language pair per POS, named `sourcelang-targetlang-pos.json`, where `pos` is the singular value stored in `terms.pos` (e.g. `noun`, not `nouns`) because the filename is validated against `SUPPORTED_POS`: +``` +scripts/datafiles/ + en-it-noun.json + en-fr-noun.json + en-it-verb.json + de-it-noun.json + ... 
+``` + +### How incremental upsert works +For a concept like "dog" already in the DB with English and Italian: +1. Import `en-fr-noun.json` +2. Upsert `terms` by `synset_id` — finds existing row, returns its real ID +3. `dog (en)` already exists → skipped by `onConflictDoNothing` +4. `chien (fr)` is new → inserted + +The concept is **extended**, not replaced. + +### Tradeoff vs batch approach +Batching is no longer possible since you need the real `term.id` from the DB before inserting translations. Each synset is processed individually. For 25k rows this is still fast enough. + +### Key types added + +```ts +type Synset = { + synset_id: string; + pos: POS; + translations: Partial<Record<LANGUAGE_CODE, string[]>>; // Partial — file only contains subset of languages +}; + +type FileName = { + sourceLang: LANGUAGE_CODE; + targetLang: LANGUAGE_CODE; + pos: POS; +}; +``` + +### Filename validation + +```ts +const parseFilename = (filename: string): FileName => { + const parts = filename.replace(".json", "").split("-"); + if (parts.length !== 3) + throw new Error(`Invalid filename format: ${filename}. 
Expected: sourcelang-targetlang-pos.json`); + const [sourceLang, targetLang, pos] = parts; + if (!SUPPORTED_LANGUAGE_CODES.includes(sourceLang as LANGUAGE_CODE)) + throw new Error(`Unsupported language code: ${sourceLang}`); + if (!SUPPORTED_LANGUAGE_CODES.includes(targetLang as LANGUAGE_CODE)) + throw new Error(`Unsupported language code: ${targetLang}`); + if (!SUPPORTED_POS.includes(pos as POS)) + throw new Error(`Unsupported POS: ${pos}`); + return { + sourceLang: sourceLang as LANGUAGE_CODE, + targetLang: targetLang as LANGUAGE_CODE, + pos: pos as POS, + }; +}; +``` + +### Upsert function (WIP) + +```ts +const upsertSynset = async ( + synset: Synset, + fileInfo: FileName, +): Promise<{ termInserted: boolean; translationsInserted: number }> => { + const [upsertedTerm] = await db + .insert(terms) + .values({ synset_id: synset.synset_id, pos: synset.pos }) + .onConflictDoUpdate({ + target: terms.synset_id, + set: { pos: synset.pos }, + }) + .returning({ id: terms.id, created_at: terms.created_at }); + + const termInserted = upsertedTerm.created_at > new Date(Date.now() - 1000); + + const translationRows = Object.entries(synset.translations).flatMap( + ([lang, lemmas]) => + lemmas!.map((lemma) => ({ + id: crypto.randomUUID(), + term_id: upsertedTerm.id, + language_code: lang as LANGUAGE_CODE, + text: lemma, + })), + ); + + const result = await db + .insert(translations) + .values(translationRows) + .onConflictDoNothing() + .returning({ id: translations.id }); + + return { termInserted, translationsInserted: result.length }; +}; +``` + +--- + +## 7. 
Strategy Comparison + +| Strategy | Use case | Pros | Cons | +|----------|----------|------|------| +| Truncate + batch | Dev / first-time setup | Fast, simple | Wipes all data | +| Incremental upsert | Production / adding languages | Safe, non-destructive | No batching, slower | +| Migrations-as-data | Production audit trail | Clean history | Files accumulate | +| Diff-based sync | Large production datasets | Minimal writes | Complex to implement | + +--- + +## 8. packages/db — package.json exports fix + +The `exports` field must be an object, not an array: + +```json +"exports": { + ".": "./src/index.ts", + "./schema": "./src/db/schema.ts" +} +``` + +Imports then resolve as: +```ts +import { db } from "@glossa/db"; +import { terms, translations } from "@glossa/db/schema"; +``` diff --git a/documentation/notes.md b/documentation/notes.md index 4025e7a..0015152 100644 --- a/documentation/notes.md +++ b/documentation/notes.md @@ -6,7 +6,7 @@ - add this to drizzle migrartions file: ✅ ALTER TABLE terms ADD CHECK (pos IN ('noun', 'verb', 'adjective', etc)); -## open word net +## openwordnet download libraries via @@ -45,3 +45,17 @@ list all libraries: ```bash python -c "import wn; print(wn.lexicons())" ``` + +## drizzle + +generate migration file, go to packages/db, then: + +```bash +pnpm drizzle-kit generate +``` + +execute migration, go to packages/db (docker containers need to be running): + +```bash +DATABASE_URL=postgresql://username:password@localhost:5432/database pnpm drizzle-kit migrate +``` diff --git a/documentation/roadmap.md b/documentation/roadmap.md index 4171ac5..c17773c 100644 --- a/documentation/roadmap.md +++ b/documentation/roadmap.md @@ -26,17 +26,17 @@ Done when: `GET /api/decks/1/terms?limit=10` returns 10 terms from a specific de [x] Run `extract-en-it-nouns.py` locally → generates `datafiles/en-it-nouns.json` -- Import ALL available OMW noun synsets (no frequency filtering) -[ ] Write Drizzle schema: `terms`, `translations`, `language_pairs`, 
`term_glosses`, `decks`, `deck_terms` -[ ] Write and run migration (includes CHECK constraints for `pos`, `gloss_type`) -[ ] Write `packages/db/src/seed.ts` (imports ALL terms + translations, NO decks) -[ ] Write `scripts/build_decks.ts` (reads external CEFR lists, matches to DB, creates decks) +[x] Write Drizzle schema: `terms`, `translations`, `language_pairs`, `term_glosses`, `decks`, `deck_terms` +[x] Write and run migration (includes CHECK constraints for `pos`, `gloss_type`) +[x] Write `packages/db/src/seed.ts` (imports ALL terms + translations, NO decks) [ ] Download CEFR A1/A2 noun lists (from GitHub repos) +[ ] Write `scripts/build_decks.ts` (reads external CEFR lists, matches to DB, creates decks) [ ] Run `pnpm db:seed` → populates terms [ ] Run `pnpm db:build-decks` → creates curated decks +[ ] Define Zod response schemas in `packages/shared` [ ] Implement `DeckRepository.getTerms(deckId, limit, offset)` [ ] Implement `QuizService.attachDistractors(terms)` — same POS, server-side, no duplicates [ ] Implement `GET /language-pairs`, `GET /decks`, `GET /decks/:id/terms` endpoints -[ ] Define Zod response schemas in `packages/shared` [ ] Unit tests for `QuizService` (correct POS filtering, never includes the answer) [ ] update decisions.md diff --git a/documentation/spec.md b/documentation/spec.md index fe428d6..b7890c4 100644 --- a/documentation/spec.md +++ b/documentation/spec.md @@ -205,7 +205,6 @@ term_glosses term_id uuid FK → terms.id language_code varchar(10) -- NOT NULL text text -- NOT NULL - type varchar(20) -- CHECK (type IN ('definition', 'example')), NULLABLE created_at timestamptz DEFAULT now() language_pairs diff --git a/packages/db/drizzle/0000_bitter_turbo.sql b/packages/db/drizzle/0000_bitter_turbo.sql new file mode 100644 index 0000000..ed93e47 --- /dev/null +++ b/packages/db/drizzle/0000_bitter_turbo.sql @@ -0,0 +1,84 @@ +CREATE TABLE "deck_terms" ( + "deck_id" uuid NOT NULL, + "term_id" uuid NOT NULL, + "position" integer NOT NULL, + 
"added_at" timestamp with time zone DEFAULT now() NOT NULL, + CONSTRAINT "deck_terms_deck_id_term_id_pk" PRIMARY KEY("deck_id","term_id") +); +--> statement-breakpoint +CREATE TABLE "decks" ( + "id" uuid PRIMARY KEY DEFAULT gen_random_uuid() NOT NULL, + "name" text NOT NULL, + "description" text, + "language_pair_id" uuid NOT NULL, + "created_by" uuid NOT NULL, + "is_public" boolean DEFAULT false NOT NULL, + "created_at" timestamp with time zone DEFAULT now() NOT NULL, + CONSTRAINT "unique_deck_name" UNIQUE("name","created_by") +); +--> statement-breakpoint +CREATE TABLE "language_pairs" ( + "id" uuid PRIMARY KEY DEFAULT gen_random_uuid() NOT NULL, + "source_language" varchar(10) NOT NULL, + "target_language" varchar(10) NOT NULL, + "label" text, + "active" boolean DEFAULT true NOT NULL, + "created_at" timestamp with time zone DEFAULT now() NOT NULL, + CONSTRAINT "unique_source_target" UNIQUE("source_language","target_language"), + CONSTRAINT "source_language_check" CHECK ("language_pairs"."source_language" IN ('en', 'it')), + CONSTRAINT "target_language_check" CHECK ("language_pairs"."target_language" IN ('en', 'it')), + CONSTRAINT "no_self_pair" CHECK ("language_pairs"."source_language" != "language_pairs"."target_language") +); +--> statement-breakpoint +CREATE TABLE "term_glosses" ( + "id" uuid PRIMARY KEY DEFAULT gen_random_uuid() NOT NULL, + "term_id" uuid NOT NULL, + "language_code" varchar(10) NOT NULL, + "text" text NOT NULL, + "created_at" timestamp with time zone DEFAULT now() NOT NULL, + CONSTRAINT "unique_term_gloss" UNIQUE("term_id","language_code","text") +); +--> statement-breakpoint +CREATE TABLE "terms" ( + "id" uuid PRIMARY KEY DEFAULT gen_random_uuid() NOT NULL, + "synset_id" text NOT NULL, + "pos" varchar(20) NOT NULL, + "created_at" timestamp with time zone DEFAULT now() NOT NULL, + CONSTRAINT "terms_synset_id_unique" UNIQUE("synset_id"), + CONSTRAINT "pos_check" CHECK ("terms"."pos" IN ('noun')) +); +--> statement-breakpoint +CREATE TABLE 
"translations" ( + "id" uuid PRIMARY KEY DEFAULT gen_random_uuid() NOT NULL, + "term_id" uuid NOT NULL, + "language_code" varchar(10) NOT NULL, + "text" text NOT NULL, + "created_at" timestamp with time zone DEFAULT now() NOT NULL, + CONSTRAINT "unique_translations" UNIQUE("term_id","language_code","text") +); +--> statement-breakpoint +CREATE TABLE "users" ( + "id" uuid PRIMARY KEY DEFAULT gen_random_uuid() NOT NULL, + "openauth_sub" text NOT NULL, + "email" varchar(255), + "display_name" varchar(100), + "created_at" timestamp with time zone DEFAULT now() NOT NULL, + "last_login_at" timestamp with time zone, + CONSTRAINT "users_openauth_sub_unique" UNIQUE("openauth_sub"), + CONSTRAINT "users_email_unique" UNIQUE("email"), + CONSTRAINT "users_display_name_unique" UNIQUE("display_name") +); +--> statement-breakpoint +ALTER TABLE "deck_terms" ADD CONSTRAINT "deck_terms_deck_id_decks_id_fk" FOREIGN KEY ("deck_id") REFERENCES "public"."decks"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint +ALTER TABLE "deck_terms" ADD CONSTRAINT "deck_terms_term_id_terms_id_fk" FOREIGN KEY ("term_id") REFERENCES "public"."terms"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint +ALTER TABLE "decks" ADD CONSTRAINT "decks_language_pair_id_language_pairs_id_fk" FOREIGN KEY ("language_pair_id") REFERENCES "public"."language_pairs"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint +ALTER TABLE "decks" ADD CONSTRAINT "decks_created_by_users_id_fk" FOREIGN KEY ("created_by") REFERENCES "public"."users"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint +ALTER TABLE "term_glosses" ADD CONSTRAINT "term_glosses_term_id_terms_id_fk" FOREIGN KEY ("term_id") REFERENCES "public"."terms"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint +ALTER TABLE "translations" ADD CONSTRAINT "translations_term_id_terms_id_fk" FOREIGN KEY ("term_id") REFERENCES "public"."terms"("id") ON DELETE cascade ON UPDATE no 
action;--> statement-breakpoint +CREATE INDEX "idx_deck_terms_term" ON "deck_terms" USING btree ("term_id");--> statement-breakpoint +CREATE INDEX "idx_decks_created_by" ON "decks" USING btree ("created_by");--> statement-breakpoint +CREATE INDEX "idx_decks_language_pair" ON "decks" USING btree ("language_pair_id");--> statement-breakpoint +CREATE INDEX "idx_pairs_active" ON "language_pairs" USING btree ("active","source_language","target_language");--> statement-breakpoint +CREATE INDEX "idx_term_glosses_term" ON "term_glosses" USING btree ("term_id");--> statement-breakpoint +CREATE INDEX "idx_terms_pos" ON "terms" USING btree ("pos");--> statement-breakpoint +CREATE INDEX "idx_translations_lang" ON "translations" USING btree ("language_code","term_id"); diff --git a/packages/db/drizzle/meta/0000_snapshot.json b/packages/db/drizzle/meta/0000_snapshot.json new file mode 100644 index 0000000..3b2b99e --- /dev/null +++ b/packages/db/drizzle/meta/0000_snapshot.json @@ -0,0 +1,652 @@ +{ + "id": "5830ce3b-dc0e-44a7-83d6-bc74016ca4fd", + "prevId": "00000000-0000-0000-0000-000000000000", + "version": "7", + "dialect": "postgresql", + "tables": { + "public.deck_terms": { + "name": "deck_terms", + "schema": "", + "columns": { + "deck_id": { + "name": "deck_id", + "type": "uuid", + "primaryKey": false, + "notNull": true + }, + "term_id": { + "name": "term_id", + "type": "uuid", + "primaryKey": false, + "notNull": true + }, + "position": { + "name": "position", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "added_at": { + "name": "added_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_deck_terms_term": { + "name": "idx_deck_terms_term", + "columns": [ + { + "expression": "term_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + 
"deck_terms_deck_id_decks_id_fk": { + "name": "deck_terms_deck_id_decks_id_fk", + "tableFrom": "deck_terms", + "tableTo": "decks", + "columnsFrom": [ + "deck_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "deck_terms_term_id_terms_id_fk": { + "name": "deck_terms_term_id_terms_id_fk", + "tableFrom": "deck_terms", + "tableTo": "terms", + "columnsFrom": [ + "term_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": { + "deck_terms_deck_id_term_id_pk": { + "name": "deck_terms_deck_id_term_id_pk", + "columns": [ + "deck_id", + "term_id" + ] + } + }, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.decks": { + "name": "decks", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "uuid", + "primaryKey": true, + "notNull": true, + "default": "gen_random_uuid()" + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "description": { + "name": "description", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "language_pair_id": { + "name": "language_pair_id", + "type": "uuid", + "primaryKey": false, + "notNull": true + }, + "created_by": { + "name": "created_by", + "type": "uuid", + "primaryKey": false, + "notNull": true + }, + "is_public": { + "name": "is_public", + "type": "boolean", + "primaryKey": false, + "notNull": true, + "default": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_decks_created_by": { + "name": "idx_decks_created_by", + "columns": [ + { + "expression": "created_by", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_decks_language_pair": { + "name": 
"idx_decks_language_pair", + "columns": [ + { + "expression": "language_pair_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "decks_language_pair_id_language_pairs_id_fk": { + "name": "decks_language_pair_id_language_pairs_id_fk", + "tableFrom": "decks", + "tableTo": "language_pairs", + "columnsFrom": [ + "language_pair_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "decks_created_by_users_id_fk": { + "name": "decks_created_by_users_id_fk", + "tableFrom": "decks", + "tableTo": "users", + "columnsFrom": [ + "created_by" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "unique_deck_name": { + "name": "unique_deck_name", + "nullsNotDistinct": false, + "columns": [ + "name", + "created_by" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.language_pairs": { + "name": "language_pairs", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "uuid", + "primaryKey": true, + "notNull": true, + "default": "gen_random_uuid()" + }, + "source_language": { + "name": "source_language", + "type": "varchar(10)", + "primaryKey": false, + "notNull": true + }, + "target_language": { + "name": "target_language", + "type": "varchar(10)", + "primaryKey": false, + "notNull": true + }, + "label": { + "name": "label", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "active": { + "name": "active", + "type": "boolean", + "primaryKey": false, + "notNull": true, + "default": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_pairs_active": { + "name": "idx_pairs_active", + "columns": [ + { + "expression": 
"active", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "source_language", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "target_language", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "unique_source_target": { + "name": "unique_source_target", + "nullsNotDistinct": false, + "columns": [ + "source_language", + "target_language" + ] + } + }, + "policies": {}, + "checkConstraints": { + "source_language_check": { + "name": "source_language_check", + "value": "\"language_pairs\".\"source_language\" IN ('en', 'it')" + }, + "target_language_check": { + "name": "target_language_check", + "value": "\"language_pairs\".\"target_language\" IN ('en', 'it')" + }, + "no_self_pair": { + "name": "no_self_pair", + "value": "\"language_pairs\".\"source_language\" != \"language_pairs\".\"target_language\"" + } + }, + "isRLSEnabled": false + }, + "public.term_glosses": { + "name": "term_glosses", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "uuid", + "primaryKey": true, + "notNull": true, + "default": "gen_random_uuid()" + }, + "term_id": { + "name": "term_id", + "type": "uuid", + "primaryKey": false, + "notNull": true + }, + "language_code": { + "name": "language_code", + "type": "varchar(10)", + "primaryKey": false, + "notNull": true + }, + "text": { + "name": "text", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_term_glosses_term": { + "name": "idx_term_glosses_term", + "columns": [ + { + "expression": "term_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + 
"concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "term_glosses_term_id_terms_id_fk": { + "name": "term_glosses_term_id_terms_id_fk", + "tableFrom": "term_glosses", + "tableTo": "terms", + "columnsFrom": [ + "term_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "unique_term_gloss": { + "name": "unique_term_gloss", + "nullsNotDistinct": false, + "columns": [ + "term_id", + "language_code", + "text" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.terms": { + "name": "terms", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "uuid", + "primaryKey": true, + "notNull": true, + "default": "gen_random_uuid()" + }, + "synset_id": { + "name": "synset_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "pos": { + "name": "pos", + "type": "varchar(20)", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_terms_pos": { + "name": "idx_terms_pos", + "columns": [ + { + "expression": "pos", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "terms_synset_id_unique": { + "name": "terms_synset_id_unique", + "nullsNotDistinct": false, + "columns": [ + "synset_id" + ] + } + }, + "policies": {}, + "checkConstraints": { + "pos_check": { + "name": "pos_check", + "value": "\"terms\".\"pos\" IN ('noun')" + } + }, + "isRLSEnabled": false + }, + "public.translations": { + "name": "translations", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "uuid", + "primaryKey": true, + "notNull": true, + "default": 
"gen_random_uuid()" + }, + "term_id": { + "name": "term_id", + "type": "uuid", + "primaryKey": false, + "notNull": true + }, + "language_code": { + "name": "language_code", + "type": "varchar(10)", + "primaryKey": false, + "notNull": true + }, + "text": { + "name": "text", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_translations_lang": { + "name": "idx_translations_lang", + "columns": [ + { + "expression": "language_code", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "term_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "translations_term_id_terms_id_fk": { + "name": "translations_term_id_terms_id_fk", + "tableFrom": "translations", + "tableTo": "terms", + "columnsFrom": [ + "term_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "unique_translations": { + "name": "unique_translations", + "nullsNotDistinct": false, + "columns": [ + "term_id", + "language_code", + "text" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.users": { + "name": "users", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "uuid", + "primaryKey": true, + "notNull": true, + "default": "gen_random_uuid()" + }, + "openauth_sub": { + "name": "openauth_sub", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "email": { + "name": "email", + "type": "varchar(255)", + "primaryKey": false, + "notNull": false + }, + "display_name": { + "name": "display_name", + "type": "varchar(100)", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": 
"created_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "last_login_at": { + "name": "last_login_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": false + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "users_openauth_sub_unique": { + "name": "users_openauth_sub_unique", + "nullsNotDistinct": false, + "columns": [ + "openauth_sub" + ] + }, + "users_email_unique": { + "name": "users_email_unique", + "nullsNotDistinct": false, + "columns": [ + "email" + ] + }, + "users_display_name_unique": { + "name": "users_display_name_unique", + "nullsNotDistinct": false, + "columns": [ + "display_name" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + } + }, + "enums": {}, + "schemas": {}, + "sequences": {}, + "roles": {}, + "policies": {}, + "views": {}, + "_meta": { + "columns": {}, + "schemas": {}, + "tables": {} + } +} \ No newline at end of file diff --git a/packages/db/drizzle/meta/_journal.json b/packages/db/drizzle/meta/_journal.json index f04877e..b1ee666 100644 --- a/packages/db/drizzle/meta/_journal.json +++ b/packages/db/drizzle/meta/_journal.json @@ -1 +1,13 @@ -{"version":"7","dialect":"postgresql","entries":[]} \ No newline at end of file +{ + "version": "7", + "dialect": "postgresql", + "entries": [ + { + "idx": 0, + "version": "7", + "when": 1774721919883, + "tag": "0000_bitter_turbo", + "breakpoints": true + } + ] +} \ No newline at end of file diff --git a/packages/db/package.json b/packages/db/package.json index 45cad40..b300763 100644 --- a/packages/db/package.json +++ b/packages/db/package.json @@ -16,5 +16,8 @@ "@types/pg": "^8.20.0", "drizzle-kit": "^0.31.10" }, - "exports": "./src/index.ts" + "exports": { + ".": "./src/index.ts", + "./schema": "./src/db/schema.ts" + } } diff --git a/packages/db/src/db/schema.ts b/packages/db/src/db/schema.ts index 6ad36d6..05848e5 100644 --- 
a/packages/db/src/db/schema.ts
+++ b/packages/db/src/db/schema.ts
@@ -74,7 +74,6 @@ export const term_glosses = pgTable(
       table.language_code,
       table.text,
     ),
-    ,
     index("idx_term_glosses_term").on(table.term_id),
   ],
 );
diff --git a/packages/db/src/seeding-datafiles.ts b/packages/db/src/seeding-datafiles.ts
new file mode 100644
index 0000000..d44d992
--- /dev/null
+++ b/packages/db/src/seeding-datafiles.ts
@@ -0,0 +1,272 @@
+import { randomUUID } from "node:crypto";
+import fs from "node:fs/promises";
+import { eq } from "drizzle-orm";
+
+import { SUPPORTED_LANGUAGE_CODES, SUPPORTED_POS } from "@glossa/shared";
+import { db } from "@glossa/db";
+import { terms, translations } from "@glossa/db/schema";
+
+// the following generate unions of the imported const arrays
+type POS = (typeof SUPPORTED_POS)[number];
+type LANGUAGE_CODE = (typeof SUPPORTED_LANGUAGE_CODES)[number];
+
+// One datafile entry: a synset id, its part of speech and its lemmas per language
+type Synset = {
+  synset_id: string;
+  pos: POS;
+  translations: Partial<Record<LANGUAGE_CODE, string[]>>;
+};
+
+// The three parts encoded in a datafile name (sourcelang-targetlang-pos.json)
+type FileName = {
+  sourceLang: LANGUAGE_CODE;
+  targetLang: LANGUAGE_CODE;
+  pos: POS;
+};
+
+// NOTE(review): relative path, so fs resolves it against the current working
+// directory, not against this file — confirm where the script is launched from
+const dataDir = "../../scripts/datafiles/";
+
+/**
+ * Splits a datafile name of the form `sourcelang-targetlang-pos.json` into its
+ * validated parts.
+ *
+ * @param filename - bare file name, without a directory
+ * @returns the source language, target language and part of speech
+ * @throws Error if the name does not have exactly three dash-separated parts,
+ *   or if a part is not in SUPPORTED_LANGUAGE_CODES / SUPPORTED_POS
+ */
+const parseFilename = (filename: string): FileName => {
+  // Strip the extension only at the end, not wherever ".json" first occurs
+  const parts = filename.replace(/\.json$/, "").split("-");
+  if (parts.length !== 3)
+    throw new Error(
+      `Invalid filename format: ${filename}. Expected: sourcelang-targetlang-pos.json`,
+    );
+  const [sourceLang, targetLang, pos] = parts;
+  if (!SUPPORTED_LANGUAGE_CODES.includes(sourceLang as LANGUAGE_CODE))
+    throw new Error(`Unsupported language code: ${sourceLang}`);
+  if (!SUPPORTED_LANGUAGE_CODES.includes(targetLang as LANGUAGE_CODE))
+    throw new Error(`Unsupported language code: ${targetLang}`);
+  if (!SUPPORTED_POS.includes(pos as POS))
+    throw new Error(`Unsupported POS: ${pos}`);
+  return {
+    sourceLang: sourceLang as LANGUAGE_CODE,
+    targetLang: targetLang as LANGUAGE_CODE,
+    pos: pos as POS,
+  };
+};
+
+/**
+ * Checks that a parsed JSON value has the fields the seeder reads: a string
+ * `synset_id`, a supported `pos`, and a `translations` object whose values are
+ * arrays of strings. Language keys are not checked here.
+ */
+const hasSynsetShape = (value: unknown): boolean => {
+  if (typeof value !== "object" || value === null) return false;
+  const entry = value as Record<string, unknown>;
+  const byLanguage = entry.translations;
+  return (
+    typeof entry.synset_id === "string" &&
+    SUPPORTED_POS.includes(entry.pos as POS) &&
+    typeof byLanguage === "object" &&
+    byLanguage !== null &&
+    Object.values(byLanguage).every(
+      (lemmas) =>
+        Array.isArray(lemmas) &&
+        lemmas.every((lemma) => typeof lemma === "string"),
+    )
+  );
+};
+
+/**
+ * Reads a datafile and returns its synsets.
+ *
+ * @param filepath - path of the JSON file to read
+ * @returns the parsed array of synsets
+ * @throws Error if the file content is not a JSON array, or if an element
+ *   does not have the expected synset shape
+ */
+const readFromJsonFile = async (filepath: string): Promise<Synset[]> => {
+  const data = await fs.readFile(filepath, "utf8");
+  const parsed: unknown = JSON.parse(data);
+  if (!Array.isArray(parsed)) throw new Error("Expected a JSON array");
+  // Reject malformed entries up front instead of failing halfway through seeding
+  const badIndex = parsed.findIndex((entry) => !hasSynsetShape(entry));
+  if (badIndex !== -1)
+    throw new Error(`Invalid synset at index ${badIndex} in ${filepath}`);
+  return parsed as Synset[];
+};
+
+/**
+ * Inserts one synset as a term plus its translations. Both inserts use
+ * onConflictDoNothing, so rows that already exist are skipped.
+ *
+ * @param synset - the synset to store
+ * @param _fileInfo - parsed datafile name; currently unused
+ * @returns whether a new term row was created, and how many translation rows
+ *   were inserted
+ * @throws Error if the term insert was skipped but no term with that
+ *   synset_id can be read back
+ */
+const uploadSynsetToDB = async (
+  synset: Synset,
+  _fileInfo: FileName,
+): Promise<{ termInserted: boolean; translationsInserted: number }> => {
+  // 1. Try to insert the term — skip if synset_id already exists
+  const [created] = await db
+    .insert(terms)
+    .values({ synset_id: synset.synset_id, pos: synset.pos })
+    .onConflictDoNothing()
+    .returning({ id: terms.id });
+
+  let termId: string;
+  let termInserted: boolean;
+
+  if (created) {
+    termId = created.id;
+    termInserted = true;
+  } else {
+    // Term already exists — fetch its real DB id for the FK
+    const [existing] = await db
+      .select({ id: terms.id })
+      .from(terms)
+      .where(eq(terms.synset_id, synset.synset_id))
+      .limit(1);
+    if (!existing)
+      throw new Error(`Term not found after conflict: ${synset.synset_id}`);
+    termId = existing.id;
+    termInserted = false;
+  }
+
+  // 2. Build translation rows and upsert — skip duplicates silently
+  const translationRows = Object.entries(synset.translations).flatMap(
+    ([lang, lemmas]) =>
+      // A language key may be present without lemmas; treat it as empty
+      (lemmas ?? []).map((lemma) => ({
+        id: randomUUID(),
+        term_id: termId,
+        language_code: lang as LANGUAGE_CODE,
+        text: lemma,
+      })),
+  );
+
+  if (translationRows.length === 0) {
+    return { termInserted, translationsInserted: 0 };
+  }
+
+  const result = await db
+    .insert(translations)
+    .values(translationRows)
+    .onConflictDoNothing()
+    .returning({ id: translations.id });
+
+  return { termInserted, translationsInserted: result.length };
+};
+
+/**
+ * Seeds every usable datafile in dataDir: lists the .json files, keeps those
+ * whose name parses, uploads their synsets one at a time, and logs per-file
+ * and overall insert counts.
+ */
+const main = async () => {
+  // step 1: discovering files
+  console.log("\n");
+  console.log("\n");
+  console.log("##########################################");
+  console.log("step 1: discovering files");
+  console.log("##########################################");
+
+  console.log("🔍 Scanning datafiles directory...");
+  const allFiles = await fs.readdir(dataDir);
+  const jsonFiles = allFiles.filter((f) => f.endsWith(".json"));
+
+  if (jsonFiles.length === 0) {
+    console.warn("⚠️ No JSON files found in", dataDir);
+    return;
+  }
+  console.log(`📁 Found ${jsonFiles.length} file(s)\n`);
+
+  // step 2: validating filenames
+  console.log("\n");
+  console.log("\n");
+  console.log("##########################################");
+  console.log("step 2: validating filenames");
+  console.log("##########################################");
+  const validFiles: { filename: string; fileInfo: FileName }[] = [];
+  for (const filename of jsonFiles) {
+    try {
+      const fileInfo = parseFilename(filename);
+      validFiles.push({ filename, fileInfo });
+      console.log(
+        ` ✅ ${filename} — ${fileInfo.sourceLang} → ${fileInfo.targetLang} (${fileInfo.pos})`,
+      );
+    } catch (e) {
+      // A badly named file is skipped with a warning rather than aborting the run
+      const reason = e instanceof Error ? e.message : String(e);
+      console.warn(` ⚠️ Skipping ${filename}: ${reason}`);
+    }
+  }
+
+  if (validFiles.length === 0) {
+    console.error("❌ No valid files to process. Exiting.");
+    return;
+  }
+
+  // step 3: processing each file
+  console.log("\n");
+  console.log("\n");
+  console.log("##########################################");
+  console.log("step 3: processing each file");
+  console.log("##########################################");
+  let totalTermsInserted = 0;
+  let totalTranslationsInserted = 0;
+
+  for (const [i, { filename, fileInfo }] of validFiles.entries()) {
+    const prefix = `[${i + 1}/${validFiles.length}]`;
+
+    console.log(`\n${prefix} 📄 ${filename}`);
+
+    const synsets = await readFromJsonFile(dataDir + filename);
+    console.log(`${prefix} Loaded ${synsets.length} synsets`);
+
+    let fileTermsInserted = 0;
+    let fileTranslationsInserted = 0;
+
+    for (const [j, synset] of synsets.entries()) {
+      // Progress line every 500 synsets
+      if (j > 0 && j % 500 === 0) {
+        console.log(`${prefix} ⏳ ${j}/${synsets.length} synsets processed...`);
+      }
+
+      const { termInserted, translationsInserted } = await uploadSynsetToDB(
+        synset,
+        fileInfo,
+      );
+      if (termInserted) fileTermsInserted++;
+      fileTranslationsInserted += translationsInserted;
+    }
+
+    console.log(
+      `${prefix} ✅ Done — ${fileTermsInserted} new terms, ${fileTranslationsInserted} new translations`,
+    );
+    totalTermsInserted += fileTermsInserted;
+    totalTranslationsInserted += fileTranslationsInserted;
+  }
+
+  // step 4: Final summary
+  console.log("\n");
+  console.log("\n");
+  console.log("##########################################");
+  console.log("step 4: final summary");
+  console.log("##########################################");
+  console.log(`\n🎉 Seeding complete!`);
+  console.log(` Terms inserted: ${totalTermsInserted}`);
+  console.log(` Translations inserted: ${totalTranslationsInserted}`);
+};
+
+main().catch((error) => {
+  console.error(error);
+  process.exit(1);
+});
diff --git a/scripts/datafiles/en-it-nouns.json b/scripts/datafiles/en-it-noun.json
similarity index 100%
rename from scripts/datafiles/en-it-nouns.json
rename to scripts/datafiles/en-it-noun.json
diff --git 
a/scripts/datafiles/test.json b/scripts/datafiles/test.json new file mode 100644 index 0000000..bbb4e1e --- /dev/null +++ b/scripts/datafiles/test.json @@ -0,0 +1,36 @@ +[ + { + "synset_id": "ili:i35545", + "pos": "noun", + "translations": { "en": ["entity"], "it": ["cosa", "entità"] } + }, + { + "synset_id": "ili:i35547", + "pos": "noun", + "translations": { + "en": ["abstraction", "abstract entity"], + "it": ["astrazione"] + } + }, + { + "synset_id": "ili:i35549", + "pos": "noun", + "translations": { + "en": ["object", "physical object"], + "it": ["oggetto", "cosa"] + } + }, + { + "synset_id": "ili:i35550", + "pos": "noun", + "translations": { "en": ["whole", "unit"], "it": ["insieme", "tutto"] } + }, + { + "synset_id": "ili:i35553", + "pos": "noun", + "translations": { + "en": ["organism", "being"], + "it": ["essere vivente", "organismo", "organismo vivente"] + } + } +]