diff --git a/documentation/notes.md b/documentation/notes.md index 364d75c..4ff408e 100644 --- a/documentation/notes.md +++ b/documentation/notes.md @@ -4,8 +4,6 @@ - pinning dependencies in package.json files - rethink organisation of datafiles and wordlists -- add this to drizzle migrations file: - ✅ ALTER TABLE terms ADD CHECK (pos IN ('noun', 'verb', 'adjective', etc)); ## openwordnet diff --git a/documentation/roadmap.md b/documentation/roadmap.md index 5fba9ee..db130f7 100644 --- a/documentation/roadmap.md +++ b/documentation/roadmap.md @@ -29,10 +29,9 @@ Done when: `GET /api/decks/1/terms?limit=10` returns 10 terms from a specific de [x] Write and run migration (includes CHECK constraints for `pos`, `gloss_type`) [x] Write `packages/db/src/seed.ts` (imports ALL terms + translations, NO decks) [x] Download CEFR A1/A2 noun lists (from GitHub repos) -[ ] Write `scripts/build_decks.ts` (reads external CEFR lists, matches to DB, creates decks) -[ ] check notes.md -[ ] Run `pnpm db:seed` → populates terms -[ ] Run `pnpm db:build-decks` → creates curated decks +[x] Write `scripts/build_decks.ts` (reads external CEFR lists, matches to DB, creates decks) +[x] Run `pnpm db:seed` → populates terms +[x] Run `pnpm db:build-deck` → creates curated decks [ ] Define Zod response schemas in `packages/shared` [ ] Implement `DeckRepository.getTerms(deckId, limit, offset)` [ ] Implement `QuizService.attachDistractors(terms)` — same POS, server-side, no duplicates diff --git a/packages/db/drizzle/0000_bitter_turbo.sql b/packages/db/drizzle/0000_faithful_oracle.sql similarity index 83% rename from packages/db/drizzle/0000_bitter_turbo.sql rename to packages/db/drizzle/0000_faithful_oracle.sql index ed93e47..d55e3b3 100644 --- a/packages/db/drizzle/0000_bitter_turbo.sql +++ b/packages/db/drizzle/0000_faithful_oracle.sql @@ -1,7 +1,6 @@ CREATE TABLE "deck_terms" ( "deck_id" uuid NOT NULL, "term_id" uuid NOT NULL, - "position" integer NOT NULL, "added_at" timestamp with time zone 
DEFAULT now() NOT NULL, CONSTRAINT "deck_terms_deck_id_term_id_pk" PRIMARY KEY("deck_id","term_id") ); @@ -10,11 +9,14 @@ CREATE TABLE "decks" ( "id" uuid PRIMARY KEY DEFAULT gen_random_uuid() NOT NULL, "name" text NOT NULL, "description" text, - "language_pair_id" uuid NOT NULL, - "created_by" uuid NOT NULL, + "source_language" varchar(10) NOT NULL, + "validated_languages" varchar(10)[] DEFAULT '{}' NOT NULL, "is_public" boolean DEFAULT false NOT NULL, "created_at" timestamp with time zone DEFAULT now() NOT NULL, - CONSTRAINT "unique_deck_name" UNIQUE("name","created_by") + CONSTRAINT "unique_deck_name" UNIQUE("name","source_language"), + CONSTRAINT "source_language_check" CHECK ("decks"."source_language" IN ('en', 'it')), + CONSTRAINT "validated_languages_check" CHECK (validated_languages <@ ARRAY['en', 'it']::varchar[]), + CONSTRAINT "validated_languages_excludes_source" CHECK (NOT ("decks"."source_language" = ANY("decks"."validated_languages"))) ); --> statement-breakpoint CREATE TABLE "language_pairs" ( @@ -71,14 +73,10 @@ CREATE TABLE "users" ( --> statement-breakpoint ALTER TABLE "deck_terms" ADD CONSTRAINT "deck_terms_deck_id_decks_id_fk" FOREIGN KEY ("deck_id") REFERENCES "public"."decks"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint ALTER TABLE "deck_terms" ADD CONSTRAINT "deck_terms_term_id_terms_id_fk" FOREIGN KEY ("term_id") REFERENCES "public"."terms"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint -ALTER TABLE "decks" ADD CONSTRAINT "decks_language_pair_id_language_pairs_id_fk" FOREIGN KEY ("language_pair_id") REFERENCES "public"."language_pairs"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint -ALTER TABLE "decks" ADD CONSTRAINT "decks_created_by_users_id_fk" FOREIGN KEY ("created_by") REFERENCES "public"."users"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint ALTER TABLE "term_glosses" ADD CONSTRAINT "term_glosses_term_id_terms_id_fk" FOREIGN KEY ("term_id") 
REFERENCES "public"."terms"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint ALTER TABLE "translations" ADD CONSTRAINT "translations_term_id_terms_id_fk" FOREIGN KEY ("term_id") REFERENCES "public"."terms"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint CREATE INDEX "idx_deck_terms_term" ON "deck_terms" USING btree ("term_id");--> statement-breakpoint -CREATE INDEX "idx_decks_created_by" ON "decks" USING btree ("created_by");--> statement-breakpoint -CREATE INDEX "idx_decks_language_pair" ON "decks" USING btree ("language_pair_id");--> statement-breakpoint CREATE INDEX "idx_pairs_active" ON "language_pairs" USING btree ("active","source_language","target_language");--> statement-breakpoint CREATE INDEX "idx_term_glosses_term" ON "term_glosses" USING btree ("term_id");--> statement-breakpoint CREATE INDEX "idx_terms_pos" ON "terms" USING btree ("pos");--> statement-breakpoint -CREATE INDEX "idx_translations_lang" ON "translations" USING btree ("language_code","term_id"); +CREATE INDEX "idx_translations_lang" ON "translations" USING btree ("language_code","term_id"); \ No newline at end of file diff --git a/packages/db/drizzle/0001_medical_fabian_cortez.sql b/packages/db/drizzle/0001_medical_fabian_cortez.sql deleted file mode 100644 index f202780..0000000 --- a/packages/db/drizzle/0001_medical_fabian_cortez.sql +++ /dev/null @@ -1,13 +0,0 @@ -ALTER TABLE "decks" DROP CONSTRAINT "unique_deck_name";--> statement-breakpoint -ALTER TABLE "decks" DROP CONSTRAINT "decks_language_pair_id_language_pairs_id_fk"; ---> statement-breakpoint -ALTER TABLE "decks" DROP CONSTRAINT "decks_created_by_users_id_fk"; ---> statement-breakpoint -DROP INDEX "idx_decks_created_by";--> statement-breakpoint -DROP INDEX "idx_decks_language_pair";--> statement-breakpoint -ALTER TABLE "decks" ADD COLUMN "validated_for_languages" varchar(10)[] DEFAULT '{}' NOT NULL;--> statement-breakpoint -ALTER TABLE "deck_terms" DROP COLUMN "position";--> 
statement-breakpoint -ALTER TABLE "decks" DROP COLUMN "language_pair_id";--> statement-breakpoint -ALTER TABLE "decks" DROP COLUMN "created_by";--> statement-breakpoint -ALTER TABLE "decks" ADD CONSTRAINT "unique_deck_name" UNIQUE("name");--> statement-breakpoint -ALTER TABLE "decks" ADD CONSTRAINT "validated_languages_check" CHECK (validated_for_languages <@ ARRAY['en', 'it']::varchar[]); \ No newline at end of file diff --git a/packages/db/drizzle/meta/0000_snapshot.json b/packages/db/drizzle/meta/0000_snapshot.json index 81d5b02..4ca9a86 100644 --- a/packages/db/drizzle/meta/0000_snapshot.json +++ b/packages/db/drizzle/meta/0000_snapshot.json @@ -1,5 +1,5 @@ { - "id": "5830ce3b-dc0e-44a7-83d6-bc74016ca4fd", + "id": "9ef7c86d-9e64-42d6-9731-2c1794ab063e", "prevId": "00000000-0000-0000-0000-000000000000", "version": "7", "dialect": "postgresql", @@ -20,12 +20,6 @@ "primaryKey": false, "notNull": true }, - "position": { - "name": "position", - "type": "integer", - "primaryKey": false, - "notNull": true - }, "added_at": { "name": "added_at", "type": "timestamp with time zone", @@ -56,8 +50,12 @@ "name": "deck_terms_deck_id_decks_id_fk", "tableFrom": "deck_terms", "tableTo": "decks", - "columnsFrom": ["deck_id"], - "columnsTo": ["id"], + "columnsFrom": [ + "deck_id" + ], + "columnsTo": [ + "id" + ], "onDelete": "cascade", "onUpdate": "no action" }, @@ -65,8 +63,12 @@ "name": "deck_terms_term_id_terms_id_fk", "tableFrom": "deck_terms", "tableTo": "terms", - "columnsFrom": ["term_id"], - "columnsTo": ["id"], + "columnsFrom": [ + "term_id" + ], + "columnsTo": [ + "id" + ], "onDelete": "cascade", "onUpdate": "no action" } @@ -74,7 +76,10 @@ "compositePrimaryKeys": { "deck_terms_deck_id_term_id_pk": { "name": "deck_terms_deck_id_term_id_pk", - "columns": ["deck_id", "term_id"] + "columns": [ + "deck_id", + "term_id" + ] } }, "uniqueConstraints": {}, @@ -105,17 +110,18 @@ "primaryKey": false, "notNull": false }, - "language_pair_id": { - "name": "language_pair_id", - 
"type": "uuid", + "source_language": { + "name": "source_language", + "type": "varchar(10)", "primaryKey": false, "notNull": true }, - "created_by": { - "name": "created_by", - "type": "uuid", + "validated_languages": { + "name": "validated_languages", + "type": "varchar(10)[]", "primaryKey": false, - "notNull": true + "notNull": true, + "default": "'{}'" }, "is_public": { "name": "is_public", @@ -132,68 +138,34 @@ "default": "now()" } }, - "indexes": { - "idx_decks_created_by": { - "name": "idx_decks_created_by", - "columns": [ - { - "expression": "created_by", - "isExpression": false, - "asc": true, - "nulls": "last" - } - ], - "isUnique": false, - "concurrently": false, - "method": "btree", - "with": {} - }, - "idx_decks_language_pair": { - "name": "idx_decks_language_pair", - "columns": [ - { - "expression": "language_pair_id", - "isExpression": false, - "asc": true, - "nulls": "last" - } - ], - "isUnique": false, - "concurrently": false, - "method": "btree", - "with": {} - } - }, - "foreignKeys": { - "decks_language_pair_id_language_pairs_id_fk": { - "name": "decks_language_pair_id_language_pairs_id_fk", - "tableFrom": "decks", - "tableTo": "language_pairs", - "columnsFrom": ["language_pair_id"], - "columnsTo": ["id"], - "onDelete": "cascade", - "onUpdate": "no action" - }, - "decks_created_by_users_id_fk": { - "name": "decks_created_by_users_id_fk", - "tableFrom": "decks", - "tableTo": "users", - "columnsFrom": ["created_by"], - "columnsTo": ["id"], - "onDelete": "cascade", - "onUpdate": "no action" - } - }, + "indexes": {}, + "foreignKeys": {}, "compositePrimaryKeys": {}, "uniqueConstraints": { "unique_deck_name": { "name": "unique_deck_name", "nullsNotDistinct": false, - "columns": ["name", "created_by"] + "columns": [ + "name", + "source_language" + ] } }, "policies": {}, - "checkConstraints": {}, + "checkConstraints": { + "source_language_check": { + "name": "source_language_check", + "value": "\"decks\".\"source_language\" IN ('en', 'it')" + }, + 
"validated_languages_check": { + "name": "validated_languages_check", + "value": "validated_languages <@ ARRAY['en', 'it']::varchar[]" + }, + "validated_languages_excludes_source": { + "name": "validated_languages_excludes_source", + "value": "NOT (\"decks\".\"source_language\" = ANY(\"decks\".\"validated_languages\"))" + } + }, "isRLSEnabled": false }, "public.language_pairs": { @@ -275,7 +247,10 @@ "unique_source_target": { "name": "unique_source_target", "nullsNotDistinct": false, - "columns": ["source_language", "target_language"] + "columns": [ + "source_language", + "target_language" + ] } }, "policies": {}, @@ -354,8 +329,12 @@ "name": "term_glosses_term_id_terms_id_fk", "tableFrom": "term_glosses", "tableTo": "terms", - "columnsFrom": ["term_id"], - "columnsTo": ["id"], + "columnsFrom": [ + "term_id" + ], + "columnsTo": [ + "id" + ], "onDelete": "cascade", "onUpdate": "no action" } @@ -365,7 +344,11 @@ "unique_term_gloss": { "name": "unique_term_gloss", "nullsNotDistinct": false, - "columns": ["term_id", "language_code", "text"] + "columns": [ + "term_id", + "language_code", + "text" + ] } }, "policies": {}, @@ -426,7 +409,9 @@ "terms_synset_id_unique": { "name": "terms_synset_id_unique", "nullsNotDistinct": false, - "columns": ["synset_id"] + "columns": [ + "synset_id" + ] } }, "policies": {}, @@ -503,8 +488,12 @@ "name": "translations_term_id_terms_id_fk", "tableFrom": "translations", "tableTo": "terms", - "columnsFrom": ["term_id"], - "columnsTo": ["id"], + "columnsFrom": [ + "term_id" + ], + "columnsTo": [ + "id" + ], "onDelete": "cascade", "onUpdate": "no action" } @@ -514,7 +503,11 @@ "unique_translations": { "name": "unique_translations", "nullsNotDistinct": false, - "columns": ["term_id", "language_code", "text"] + "columns": [ + "term_id", + "language_code", + "text" + ] } }, "policies": {}, @@ -571,17 +564,23 @@ "users_openauth_sub_unique": { "name": "users_openauth_sub_unique", "nullsNotDistinct": false, - "columns": ["openauth_sub"] + "columns": 
[ + "openauth_sub" + ] }, "users_email_unique": { "name": "users_email_unique", "nullsNotDistinct": false, - "columns": ["email"] + "columns": [ + "email" + ] }, "users_display_name_unique": { "name": "users_display_name_unique", "nullsNotDistinct": false, - "columns": ["display_name"] + "columns": [ + "display_name" + ] } }, "policies": {}, @@ -595,5 +594,9 @@ "roles": {}, "policies": {}, "views": {}, - "_meta": { "columns": {}, "schemas": {}, "tables": {} } -} + "_meta": { + "columns": {}, + "schemas": {}, + "tables": {} + } +} \ No newline at end of file diff --git a/packages/db/drizzle/meta/0001_snapshot.json b/packages/db/drizzle/meta/0001_snapshot.json deleted file mode 100644 index 41a8dbd..0000000 --- a/packages/db/drizzle/meta/0001_snapshot.json +++ /dev/null @@ -1,587 +0,0 @@ -{ - "id": "d6bed73d-ee69-44b1-a3ce-3ae25898a6f0", - "prevId": "5830ce3b-dc0e-44a7-83d6-bc74016ca4fd", - "version": "7", - "dialect": "postgresql", - "tables": { - "public.deck_terms": { - "name": "deck_terms", - "schema": "", - "columns": { - "deck_id": { - "name": "deck_id", - "type": "uuid", - "primaryKey": false, - "notNull": true - }, - "term_id": { - "name": "term_id", - "type": "uuid", - "primaryKey": false, - "notNull": true - }, - "added_at": { - "name": "added_at", - "type": "timestamp with time zone", - "primaryKey": false, - "notNull": true, - "default": "now()" - } - }, - "indexes": { - "idx_deck_terms_term": { - "name": "idx_deck_terms_term", - "columns": [ - { - "expression": "term_id", - "isExpression": false, - "asc": true, - "nulls": "last" - } - ], - "isUnique": false, - "concurrently": false, - "method": "btree", - "with": {} - } - }, - "foreignKeys": { - "deck_terms_deck_id_decks_id_fk": { - "name": "deck_terms_deck_id_decks_id_fk", - "tableFrom": "deck_terms", - "tableTo": "decks", - "columnsFrom": [ - "deck_id" - ], - "columnsTo": [ - "id" - ], - "onDelete": "cascade", - "onUpdate": "no action" - }, - "deck_terms_term_id_terms_id_fk": { - "name": 
"deck_terms_term_id_terms_id_fk", - "tableFrom": "deck_terms", - "tableTo": "terms", - "columnsFrom": [ - "term_id" - ], - "columnsTo": [ - "id" - ], - "onDelete": "cascade", - "onUpdate": "no action" - } - }, - "compositePrimaryKeys": { - "deck_terms_deck_id_term_id_pk": { - "name": "deck_terms_deck_id_term_id_pk", - "columns": [ - "deck_id", - "term_id" - ] - } - }, - "uniqueConstraints": {}, - "policies": {}, - "checkConstraints": {}, - "isRLSEnabled": false - }, - "public.decks": { - "name": "decks", - "schema": "", - "columns": { - "id": { - "name": "id", - "type": "uuid", - "primaryKey": true, - "notNull": true, - "default": "gen_random_uuid()" - }, - "name": { - "name": "name", - "type": "text", - "primaryKey": false, - "notNull": true - }, - "description": { - "name": "description", - "type": "text", - "primaryKey": false, - "notNull": false - }, - "validated_for_languages": { - "name": "validated_for_languages", - "type": "varchar(10)[]", - "primaryKey": false, - "notNull": true, - "default": "'{}'" - }, - "is_public": { - "name": "is_public", - "type": "boolean", - "primaryKey": false, - "notNull": true, - "default": false - }, - "created_at": { - "name": "created_at", - "type": "timestamp with time zone", - "primaryKey": false, - "notNull": true, - "default": "now()" - } - }, - "indexes": {}, - "foreignKeys": {}, - "compositePrimaryKeys": {}, - "uniqueConstraints": { - "unique_deck_name": { - "name": "unique_deck_name", - "nullsNotDistinct": false, - "columns": [ - "name" - ] - } - }, - "policies": {}, - "checkConstraints": { - "validated_languages_check": { - "name": "validated_languages_check", - "value": "validated_for_languages <@ ARRAY['en', 'it']::varchar[]" - } - }, - "isRLSEnabled": false - }, - "public.language_pairs": { - "name": "language_pairs", - "schema": "", - "columns": { - "id": { - "name": "id", - "type": "uuid", - "primaryKey": true, - "notNull": true, - "default": "gen_random_uuid()" - }, - "source_language": { - "name": 
"source_language", - "type": "varchar(10)", - "primaryKey": false, - "notNull": true - }, - "target_language": { - "name": "target_language", - "type": "varchar(10)", - "primaryKey": false, - "notNull": true - }, - "label": { - "name": "label", - "type": "text", - "primaryKey": false, - "notNull": false - }, - "active": { - "name": "active", - "type": "boolean", - "primaryKey": false, - "notNull": true, - "default": true - }, - "created_at": { - "name": "created_at", - "type": "timestamp with time zone", - "primaryKey": false, - "notNull": true, - "default": "now()" - } - }, - "indexes": { - "idx_pairs_active": { - "name": "idx_pairs_active", - "columns": [ - { - "expression": "active", - "isExpression": false, - "asc": true, - "nulls": "last" - }, - { - "expression": "source_language", - "isExpression": false, - "asc": true, - "nulls": "last" - }, - { - "expression": "target_language", - "isExpression": false, - "asc": true, - "nulls": "last" - } - ], - "isUnique": false, - "concurrently": false, - "method": "btree", - "with": {} - } - }, - "foreignKeys": {}, - "compositePrimaryKeys": {}, - "uniqueConstraints": { - "unique_source_target": { - "name": "unique_source_target", - "nullsNotDistinct": false, - "columns": [ - "source_language", - "target_language" - ] - } - }, - "policies": {}, - "checkConstraints": { - "source_language_check": { - "name": "source_language_check", - "value": "\"language_pairs\".\"source_language\" IN ('en', 'it')" - }, - "target_language_check": { - "name": "target_language_check", - "value": "\"language_pairs\".\"target_language\" IN ('en', 'it')" - }, - "no_self_pair": { - "name": "no_self_pair", - "value": "\"language_pairs\".\"source_language\" != \"language_pairs\".\"target_language\"" - } - }, - "isRLSEnabled": false - }, - "public.term_glosses": { - "name": "term_glosses", - "schema": "", - "columns": { - "id": { - "name": "id", - "type": "uuid", - "primaryKey": true, - "notNull": true, - "default": "gen_random_uuid()" - }, - 
"term_id": { - "name": "term_id", - "type": "uuid", - "primaryKey": false, - "notNull": true - }, - "language_code": { - "name": "language_code", - "type": "varchar(10)", - "primaryKey": false, - "notNull": true - }, - "text": { - "name": "text", - "type": "text", - "primaryKey": false, - "notNull": true - }, - "created_at": { - "name": "created_at", - "type": "timestamp with time zone", - "primaryKey": false, - "notNull": true, - "default": "now()" - } - }, - "indexes": { - "idx_term_glosses_term": { - "name": "idx_term_glosses_term", - "columns": [ - { - "expression": "term_id", - "isExpression": false, - "asc": true, - "nulls": "last" - } - ], - "isUnique": false, - "concurrently": false, - "method": "btree", - "with": {} - } - }, - "foreignKeys": { - "term_glosses_term_id_terms_id_fk": { - "name": "term_glosses_term_id_terms_id_fk", - "tableFrom": "term_glosses", - "tableTo": "terms", - "columnsFrom": [ - "term_id" - ], - "columnsTo": [ - "id" - ], - "onDelete": "cascade", - "onUpdate": "no action" - } - }, - "compositePrimaryKeys": {}, - "uniqueConstraints": { - "unique_term_gloss": { - "name": "unique_term_gloss", - "nullsNotDistinct": false, - "columns": [ - "term_id", - "language_code", - "text" - ] - } - }, - "policies": {}, - "checkConstraints": {}, - "isRLSEnabled": false - }, - "public.terms": { - "name": "terms", - "schema": "", - "columns": { - "id": { - "name": "id", - "type": "uuid", - "primaryKey": true, - "notNull": true, - "default": "gen_random_uuid()" - }, - "synset_id": { - "name": "synset_id", - "type": "text", - "primaryKey": false, - "notNull": true - }, - "pos": { - "name": "pos", - "type": "varchar(20)", - "primaryKey": false, - "notNull": true - }, - "created_at": { - "name": "created_at", - "type": "timestamp with time zone", - "primaryKey": false, - "notNull": true, - "default": "now()" - } - }, - "indexes": { - "idx_terms_pos": { - "name": "idx_terms_pos", - "columns": [ - { - "expression": "pos", - "isExpression": false, - "asc": 
true, - "nulls": "last" - } - ], - "isUnique": false, - "concurrently": false, - "method": "btree", - "with": {} - } - }, - "foreignKeys": {}, - "compositePrimaryKeys": {}, - "uniqueConstraints": { - "terms_synset_id_unique": { - "name": "terms_synset_id_unique", - "nullsNotDistinct": false, - "columns": [ - "synset_id" - ] - } - }, - "policies": {}, - "checkConstraints": { - "pos_check": { - "name": "pos_check", - "value": "\"terms\".\"pos\" IN ('noun')" - } - }, - "isRLSEnabled": false - }, - "public.translations": { - "name": "translations", - "schema": "", - "columns": { - "id": { - "name": "id", - "type": "uuid", - "primaryKey": true, - "notNull": true, - "default": "gen_random_uuid()" - }, - "term_id": { - "name": "term_id", - "type": "uuid", - "primaryKey": false, - "notNull": true - }, - "language_code": { - "name": "language_code", - "type": "varchar(10)", - "primaryKey": false, - "notNull": true - }, - "text": { - "name": "text", - "type": "text", - "primaryKey": false, - "notNull": true - }, - "created_at": { - "name": "created_at", - "type": "timestamp with time zone", - "primaryKey": false, - "notNull": true, - "default": "now()" - } - }, - "indexes": { - "idx_translations_lang": { - "name": "idx_translations_lang", - "columns": [ - { - "expression": "language_code", - "isExpression": false, - "asc": true, - "nulls": "last" - }, - { - "expression": "term_id", - "isExpression": false, - "asc": true, - "nulls": "last" - } - ], - "isUnique": false, - "concurrently": false, - "method": "btree", - "with": {} - } - }, - "foreignKeys": { - "translations_term_id_terms_id_fk": { - "name": "translations_term_id_terms_id_fk", - "tableFrom": "translations", - "tableTo": "terms", - "columnsFrom": [ - "term_id" - ], - "columnsTo": [ - "id" - ], - "onDelete": "cascade", - "onUpdate": "no action" - } - }, - "compositePrimaryKeys": {}, - "uniqueConstraints": { - "unique_translations": { - "name": "unique_translations", - "nullsNotDistinct": false, - "columns": [ - 
"term_id", - "language_code", - "text" - ] - } - }, - "policies": {}, - "checkConstraints": {}, - "isRLSEnabled": false - }, - "public.users": { - "name": "users", - "schema": "", - "columns": { - "id": { - "name": "id", - "type": "uuid", - "primaryKey": true, - "notNull": true, - "default": "gen_random_uuid()" - }, - "openauth_sub": { - "name": "openauth_sub", - "type": "text", - "primaryKey": false, - "notNull": true - }, - "email": { - "name": "email", - "type": "varchar(255)", - "primaryKey": false, - "notNull": false - }, - "display_name": { - "name": "display_name", - "type": "varchar(100)", - "primaryKey": false, - "notNull": false - }, - "created_at": { - "name": "created_at", - "type": "timestamp with time zone", - "primaryKey": false, - "notNull": true, - "default": "now()" - }, - "last_login_at": { - "name": "last_login_at", - "type": "timestamp with time zone", - "primaryKey": false, - "notNull": false - } - }, - "indexes": {}, - "foreignKeys": {}, - "compositePrimaryKeys": {}, - "uniqueConstraints": { - "users_openauth_sub_unique": { - "name": "users_openauth_sub_unique", - "nullsNotDistinct": false, - "columns": [ - "openauth_sub" - ] - }, - "users_email_unique": { - "name": "users_email_unique", - "nullsNotDistinct": false, - "columns": [ - "email" - ] - }, - "users_display_name_unique": { - "name": "users_display_name_unique", - "nullsNotDistinct": false, - "columns": [ - "display_name" - ] - } - }, - "policies": {}, - "checkConstraints": {}, - "isRLSEnabled": false - } - }, - "enums": {}, - "schemas": {}, - "sequences": {}, - "roles": {}, - "policies": {}, - "views": {}, - "_meta": { - "columns": {}, - "schemas": {}, - "tables": {} - } -} \ No newline at end of file diff --git a/packages/db/drizzle/meta/_journal.json b/packages/db/drizzle/meta/_journal.json index ac05b81..32b70b7 100644 --- a/packages/db/drizzle/meta/_journal.json +++ b/packages/db/drizzle/meta/_journal.json @@ -5,15 +5,8 @@ { "idx": 0, "version": "7", - "when": 1774721919883, - 
"tag": "0000_bitter_turbo", - "breakpoints": true - }, - { - "idx": 1, - "version": "7", - "when": 1774970553186, - "tag": "0001_medical_fabian_cortez", + "when": 1775053965903, + "tag": "0000_faithful_oracle", "breakpoints": true } ] diff --git a/packages/db/package.json b/packages/db/package.json index 3fbdc05..7194bb3 100644 --- a/packages/db/package.json +++ b/packages/db/package.json @@ -6,7 +6,9 @@ "scripts": { "build": "tsc", "generate": "drizzle-kit generate", - "migrate": "drizzle-kit migrate" + "migrate": "drizzle-kit migrate", + "db:seed": "npx tsx src/seeding-datafiles.ts", + "db:build-deck": "npx tsx src/generating-deck.ts" }, "dependencies": { "@glossa/shared": "workspace:*", diff --git a/packages/db/src/data/wordlists/top1000englishnouns-missing b/packages/db/src/data/wordlists/top1000englishnouns-missing new file mode 100644 index 0000000..08c21b2 --- /dev/null +++ b/packages/db/src/data/wordlists/top1000englishnouns-missing @@ -0,0 +1,34 @@ +a +other +us +may +st +paul +new +software +oxford +english +mary +japan +while +pp +membership +manchester +tony +alan +jones +un +northern +simon +behalf +co +graham +joe +guy +lewis +jane +taylor +co-operation +travel +self +thatcher \ No newline at end of file diff --git a/packages/db/src/db/schema.ts b/packages/db/src/db/schema.ts index 90604af..0d15015 100644 --- a/packages/db/src/db/schema.ts +++ b/packages/db/src/db/schema.ts @@ -144,11 +144,11 @@ export const decks = pgTable( ), check( "validated_languages_check", - sql`validated_for_languages <@ ARRAY[${sql.raw(SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", "))}]::varchar[]`, + sql`validated_languages <@ ARRAY[${sql.raw(SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", "))}]::varchar[]`, ), check( "validated_languages_excludes_source", - sql`NOT (${table.source_language} = ANY(validated_for_languages))`, + sql`NOT (${table.source_language} = ANY(${table.validated_languages}))`, ), unique("unique_deck_name").on(table.name, 
table.source_language), ], diff --git a/packages/db/src/generating-deck.ts b/packages/db/src/generating-deck.ts new file mode 100644 index 0000000..cc43eb5 --- /dev/null +++ b/packages/db/src/generating-deck.ts @@ -0,0 +1,302 @@ +/* + * + * Builds the "top English nouns" deck from a curated wordlist of the 1000 most + * frequently used English nouns. The deck has English as its source language — + * meaning it was curated from an English-centric frequency list, and a separate + * deck would be needed for other source languages. For each word in the list, all + * matching term IDs are looked up in the database via the translations table + * (language: "en", POS: "noun") — homonyms are intentionally included as separate + * cards since the quiz UI displays a gloss alongside each word. Words from the + * list that have no DB match are skipped and written to a file for future + * reference. The script is idempotent: if the deck already exists, only terms + * present in the source but missing from the deck are added; terms already in the + * deck are left untouched; terms in the deck but absent from the source are never + * removed. After resolving all matched terms, the script determines + * validated_languages by checking which languages — excluding the source + * language — have full translation coverage across all matched terms, and updates + * the array on every run. 
+ */ + +/* + * roadmap + * + * [x] Setup - hardcoded path, name, description, source language, POS + * [x] Read wordlist - load and deduplicate the 1000 nouns + * [x] Query terms - match to database, collect all term IDs per word (including homonyms) + * [x] Write missing words to file for future reference + * [x] Determine validated_languages - find languages (excluding source) with full coverage across all matched terms + * [x] Check idempotency - if deck exists, diff matched terms against existing deck_terms + * [x] Create deck if it doesn't exist - insert with name, source_language, validated_languages + * [x] Add new terms - insert only term IDs present in source but missing from deck + * [x] Update validated_languages - recalculate and update on every run + * [x] Report - summary of words found, missing, added, and validated languages + */ + +import fs from "node:fs/promises"; +import { db } from "@glossa/db"; +import { translations, terms, decks, deck_terms } from "@glossa/db/schema"; +import { inArray, and, eq } from "drizzle-orm"; +import { SUPPORTED_LANGUAGE_CODES } from "@glossa/shared"; + +const pathToWordlist = "./src/data/wordlists/top1000englishnouns"; +const nameOfDeck = "top english nouns"; +const descriptionOfDeck = + "Most frequently used English nouns for vocabulary practice"; +const sourceLanguage = "en"; +const sourcePOS = "noun"; + +// new Set() automatically discards duplicate values, +// and spreading it back with ... converts it to a plain array again. +// So if "bank" appears twice in the file, +// the resulting array will only contain it once. 
+const readingFromWordlist = async () => { + const raw = await fs.readFile(pathToWordlist, "utf8"); + const words = [ + ...new Set( + raw + .split("\n") + .map((w) => w.trim().toLowerCase()) + .filter(Boolean), + ), + ]; + return words; +}; + +// Matches the wordlist against translations in the source language and POS. +// Returns the de-duplicated IDs of every matching term (homonyms included) and +// the words that matched nothing. +// NOTE(review): the wordlist is lower-cased but the DB filter compares the +// stored text as-is — confirm stored translations are lower-case, otherwise +// capitalised entries can never match. +const checkingSourceWordsAgainstDB = async (words: string[]) => { + const rows = await db + .select({ text: translations.text, termId: translations.term_id }) + .from(translations) + .innerJoin(terms, eq(translations.term_id, terms.id)) + .where( + and( + inArray(translations.text, words), + eq(translations.language_code, sourceLanguage), + eq(terms.pos, sourcePOS), + ), + ); + + const wordToTermIds = new Map<string, string[]>(); + for (const row of rows) { + const word = row.text.toLowerCase(); + const existing = wordToTermIds.get(word) ?? []; + wordToTermIds.set(word, [...existing, row.termId]); + } + const termIds = [...new Set(Array.from(wordToTermIds.values()).flat())]; + const missingWords = words.filter((w) => !wordToTermIds.has(w)); + + return { termIds, missingWords }; +}; + +// Writes the unmatched words, one per line, to "<wordlist path>-missing", +// replacing the file's previous content. +const writeMissingWordsToFile = async (missingWords: string[]) => { + const outputPath = `${pathToWordlist}-missing`; + await fs.writeFile(outputPath, missingWords.join("\n"), "utf8"); +}; + +// Returns the supported languages other than sourceLanguage in which every one +// of termIds has at least one translation. +const validateLanguages = async (sourceLanguage: string, termIds: string[]) => { + // With no matched terms, "0 of 0 covered" would wrongly validate every + // language, and the term filter below would receive an empty list. + if (termIds.length === 0) return []; + // create array of language code from the supported languages + // remove source language from it + const languages = SUPPORTED_LANGUAGE_CODES.filter( + (language) => language !== sourceLanguage, + ); + const validatedLanguages: string[] = []; + // For each remaining language, count how many of the termIds have a translation in that language + for (const language of languages) { + const rows = await db + .selectDistinct({ termId: translations.term_id }) + .from(translations) + .where( + and( + inArray(translations.term_id, termIds), + eq(translations.language_code, language), + ), + ); + if (rows.length === termIds.length) { + validatedLanguages.push(language); + } + } + + // If the count 
equals termIds.length → full coverage → include in result + // Return the array of fully covered languages + return validatedLanguages; +}; + +// Looks up the deck by name and source language. Returns its id and current +// validated languages, or null when no such deck exists yet. +const findExistingDeck = async () => { + const existing = await db + .select({ id: decks.id, validatedForLanguages: decks.validated_languages }) + .from(decks) + .where( + and( + eq(decks.name, nameOfDeck), + eq(decks.source_language, sourceLanguage), + ), + ); + return existing[0] ?? null; +}; + +// Logs, for each supported language other than the source language, how many of +// termIds have a translation and which source-language words are still +// untranslated. Diagnostic output only; nothing is written to the database. +const logLanguageCoverage = async (termIds: string[]) => { + // Nothing to report, and the term filters below need a non-empty list. + if (termIds.length === 0) return; + const languages = SUPPORTED_LANGUAGE_CODES.filter( + (language) => language !== sourceLanguage, + ); + for (const language of languages) { + const rows = await db + .selectDistinct({ termId: translations.term_id }) + .from(translations) + .where( + and( + inArray(translations.term_id, termIds), + eq(translations.language_code, language), + ), + ); + console.log( + ` ${language}: ${rows.length} / ${termIds.length} terms covered`, + ); + + const coveredIds = new Set(rows.map((r) => r.termId)); + const missingTermIds = termIds.filter((id) => !coveredIds.has(id)); + console.log(` missing term IDs count:`, missingTermIds.length); + + // Full coverage: skip the lookup instead of filtering on an empty ID list. + if (missingTermIds.length === 0) { + console.log(` missing words in ${language}:`, [], "\n"); + continue; + } + + // Resolve the uncovered terms to words in the deck's source language. + const missingSourceWords = await db + .selectDistinct({ text: translations.text }) + .from(translations) + .where( + and( + inArray(translations.term_id, missingTermIds), + eq(translations.language_code, sourceLanguage), + ), + ); + console.log( + ` missing words in ${language}:`, + missingSourceWords.map((r) => r.text), + "\n", + ); + } +}; + +// creating a deck +const createDeck = async (validatedLanguages: string[]) => { + const result = await db + .insert(decks) + .values({ + name: nameOfDeck, + description: descriptionOfDeck, + source_language: sourceLanguage, + validated_languages: validatedLanguages, + is_public: false, + }) + .returning({ id: decks.id }); + const created = result[0]; + if (!created) 
throw new Error("Failed to create deck: no row returned"); + return created.id; +}; + +// Diffs termIds against the existing deck_terms for this deck and inserts only +// the ones not already present. Returns the count of newly inserted terms. +const addTermsToDeck = async ( + deckId: string, + termIds: string[], +): Promise => { + const existingRows = await db + .select({ termId: deck_terms.term_id }) + .from(deck_terms) + .where(eq(deck_terms.deck_id, deckId)); + + const existingTermIds = new Set(existingRows.map((r) => r.termId)); + const newTermIds = termIds.filter((id) => !existingTermIds.has(id)); + + if (newTermIds.length === 0) return 0; + + await db + .insert(deck_terms) + .values(newTermIds.map((termId) => ({ deck_id: deckId, term_id: termId }))); + + return newTermIds.length; +}; + +// Recalculates and persists validated_languages on every run so the field stays +// accurate as translation coverage grows over time. +const updateValidatedLanguages = async ( + deckId: string, + validatedLanguages: string[], +): Promise => { + await db + .update(decks) + .set({ validated_languages: validatedLanguages }) + .where(eq(decks.id, deckId)); +}; + +const main = async () => { + console.log("📖 Reading word list..."); + const sourceWords = await readingFromWordlist(); + console.log(` ${sourceWords.length} words loaded\n`); + + console.log("🔍 Checking against database..."); + const { termIds, missingWords } = + await checkingSourceWordsAgainstDB(sourceWords); + console.log(` ${termIds.length} terms found`); + console.log(` ${missingWords.length} words not found in DB\n`); + + console.log("🖊️ Writing missing words to file...\n"); + await writeMissingWordsToFile(missingWords); + + console.log("✅ Validating languages..."); + const validatedLanguages = await validateLanguages(sourceLanguage, termIds); + console.log( + ` Validated languages: ${JSON.stringify(validatedLanguages)}\n`, + ); + + console.log("🔬 Language coverage breakdown..."); + await 
logLanguageCoverage(termIds); + + console.log("🃏 Looking for existing deck..."); + const existingDeck = await findExistingDeck(); + + let deckId: string; + let isNewDeck: boolean; + + if (!existingDeck) { + console.log(" No existing deck found, will create one\n"); + console.log("🆕 Creating deck..."); + deckId = await createDeck(validatedLanguages); + console.log(` Deck created with id: ${deckId}\n`); + isNewDeck = true; + } else { + console.log(` Found existing deck with id: ${existingDeck.id}\n`); + deckId = existingDeck.id; + isNewDeck = false; + } + + console.log("➕ Adding terms to deck..."); + const addedCount = await addTermsToDeck(deckId, termIds); + const alreadyPresentCount = termIds.length - addedCount; + console.log(` ${addedCount} terms added`); + console.log(` ${alreadyPresentCount} terms already in deck\n`); + + if (!isNewDeck) { + console.log("🔄 Updating validated languages..."); + await updateValidatedLanguages(deckId, validatedLanguages); + console.log(` Updated to: ${JSON.stringify(validatedLanguages)}\n`); + } + + console.log("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"); + console.log("📊 Summary"); + console.log("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"); + console.log(` Words loaded from wordlist : ${sourceWords.length}`); + console.log( + ` Words matched in DB : ${sourceWords.length - missingWords.length}`, + ); + console.log(` Words not found in DB : ${missingWords.length}`); + console.log(` Term IDs resolved : ${termIds.length}`); + console.log(` Terms added to deck : ${addedCount}`); + console.log(` Terms already in deck : ${alreadyPresentCount}`); + console.log( + ` Validated languages : ${validatedLanguages.length > 0 ? 
validatedLanguages.join(", ") : "none"}`, + ); + console.log("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"); +}; + +main().catch((error) => { + console.error(error); + process.exit(1); +}); diff --git a/packages/db/src/generating-decks.ts b/packages/db/src/generating-decks.ts deleted file mode 100644 index 42e14dc..0000000 --- a/packages/db/src/generating-decks.ts +++ /dev/null @@ -1,162 +0,0 @@ -/* - * - * Builds the "top English nouns" deck from a curated wordlist of the 1000 most - * frequently used English nouns. The deck has English as its source language — - * meaning it was curated from an English-centric frequency list, and a separate - * deck would be needed for other source languages. For each word in the list, all - * matching term IDs are looked up in the database via the translations table - * (language: "en", POS: "noun") — homonyms are intentionally included as separate - * cards since the quiz UI displays a gloss alongside each word. Words from the - * list that have no DB match are skipped and written to a file for future - * reference. The script is idempotent: if the deck already exists, only terms - * present in the source but missing from the deck are added; terms already in the - * deck are left untouched; terms in the deck but absent from the source are never - * removed. After resolving all matched terms, the script determines - * validated_for_languages by checking which languages — excluding the source - * language — have full translation coverage across all matched terms, and updates - * the array on every run. 
- */ - -/* - * roadmap - * - * [x] Setup — hardcoded path, name, description, source language, POS - * [x] Read wordlist — load and deduplicate the 1000 nouns - * [x] Query terms — match to database, collect all term IDs per word (including homonyms) - * [x] Write missing words to file for future reference - * [x] Determine validated_for_languages — find languages (excluding source) with full coverage across all matched terms - * [ ] Check idempotency — if deck exists, diff matched terms against existing deck_terms - * [ ] Create deck if it doesn't exist — insert with name, source_language, validated_for_languages - * [ ] Add new terms — insert only term IDs present in source but missing from deck - * [ ] Update validated_for_languages — recalculate and update on every run - * [ ] Report — summary of words found, missing, added, and validated languages - */ - -import fs from "node:fs/promises"; -import { db } from "@glossa/db"; -import { translations, terms, decks } from "@glossa/db/schema"; -import { inArray, and, eq } from "drizzle-orm"; -import { SUPPORTED_LANGUAGE_CODES } from "@glossa/shared"; - -const pathToWordlist = "./src/data/wordlists/top1000englishnouns"; -const nameOfDeck = "top english nouns"; -const descriptionOfDeck = - "Most frequently used English nouns for vocabulary practice"; -const sourceLanguage = "en"; -const sourcePOS = "noun"; - -// new Set() automatically discards duplicate values, -// and spreading it back with ... converts it to a plain array again. -// So if "bank" appears twice in the file, -// the resulting array will only contain it once. 
-const readingFromWordlist = async () => { - const raw = await fs.readFile(pathToWordlist, "utf8"); - const words = [ - ...new Set( - raw - .split("\n") - .map((w) => w.trim().toLowerCase()) - .filter(Boolean), - ), - ]; - return words; -}; - -const checkingSourceWordsAgainstDB = async (words: string[]) => { - const rows = await db - .select({ text: translations.text, termId: translations.term_id }) - .from(translations) - .innerJoin(terms, eq(translations.term_id, terms.id)) - .where( - and( - inArray(translations.text, words), - eq(translations.language_code, sourceLanguage), - eq(terms.pos, sourcePOS), - ), - ); - - const wordToTermIds = new Map(); - for (const row of rows) { - const word = row.text.toLowerCase(); - const existing = wordToTermIds.get(word) ?? []; - wordToTermIds.set(word, [...existing, row.termId]); - } - const termIds = Array.from(wordToTermIds.values()).flat(); - const missingWords = words.filter((w) => !wordToTermIds.has(w)); - - return { termIds, missingWords }; -}; - -const writeMissingWordsToFile = async (missingWords: string[]) => { - const outputPath = `${pathToWordlist}-missing`; - await fs.writeFile(outputPath, missingWords.join("\n"), "utf8"); -}; - -const validateLanguages = async (sourceLanguage: string, termIds: string[]) => { - // create array of language code from the supported languages - // remove source language from it - const languages = SUPPORTED_LANGUAGE_CODES.filter( - (language) => language !== sourceLanguage, - ); - const validatedLanguages: string[] = []; - // For each remaining language, count how many of the termIds have a translation in that language - for (const language of languages) { - const rows = await db - .select({ termId: translations.term_id }) - .from(translations) - .where( - and( - inArray(translations.term_id, termIds), - eq(translations.language_code, language), - ), - ); - if (rows.length === termIds.length) { - validatedLanguages.push(language); - } - } - - // If the count equals termIds.length → 
full coverage → include in result - // Return the array of fully covered languages - return validatedLanguages; -}; - -const findExistingDeck = async () => { - const existing = await db - .select({ id: decks.id, validatedForLanguages: decks.validated_languages }) - .from(decks) - .where( - and( - eq(decks.name, nameOfDeck), - eq(decks.source_language, sourceLanguage), - ), - ); - return existing[0] ?? null; -}; - -const main = async () => { - // reading from source file - console.log("📖 Reading word list..."); - const sourceWords = await readingFromWordlist(); - console.log(` ${sourceWords.length} words loaded\n`); - - // checking if sourceWords exist in database - console.log("🔍 Checking against database..."); - const { termIds, missingWords } = - await checkingSourceWordsAgainstDB(sourceWords); - console.log("words found in db: ", termIds.length); - console.log("words NOT found in db: ", missingWords.length, "\n"); - - // writing missing words to file - console.log("writing missing words to file...\n"); - await writeMissingWordsToFile(missingWords); - - // validating languages - console.log("validation languages..."); - const validatedLanguages = await validateLanguages(sourceLanguage, termIds); - console.log("validated these languages: ", validatedLanguages, "\n"); -}; - -main().catch((error) => { - console.error(error); - process.exit(1); -});