feat(db): complete deck generation script for top english nouns
- add deck_terms to schema imports - add addTermsToDeck — diffs source term IDs against existing deck_terms, inserts only new ones, returns count of inserted terms - add updateValidatedLanguages — recalculates and persists validated_languages on every run so coverage stays accurate as translation data grows - wire both functions into main with isNewDeck guard to avoid redundant validated_languages update on deck creation - add final summary report - fix possible undefined on result[0] in createDeck - tick off remaining roadmap items
This commit is contained in:
parent
7fdcedd1dd
commit
3bb8bfdb39
12 changed files with 442 additions and 875 deletions
|
|
@ -4,8 +4,6 @@
|
|||
|
||||
- pinning dependencies in package.json files
|
||||
- rethink organisation of datafiles and wordlists
|
||||
- add this to drizzle migrations file:
|
||||
✅ ALTER TABLE terms ADD CHECK (pos IN ('noun', 'verb', 'adjective', etc));
|
||||
|
||||
## openwordnet
|
||||
|
||||
|
|
|
|||
|
|
@ -29,10 +29,9 @@ Done when: `GET /api/decks/1/terms?limit=10` returns 10 terms from a specific de
|
|||
[x] Write and run migration (includes CHECK constraints for `pos`, `gloss_type`)
|
||||
[x] Write `packages/db/src/seed.ts` (imports ALL terms + translations, NO decks)
|
||||
[x] Download CEFR A1/A2 noun lists (from GitHub repos)
|
||||
[ ] Write `scripts/build_decks.ts` (reads external CEFR lists, matches to DB, creates decks)
|
||||
[ ] check notes.md
|
||||
[ ] Run `pnpm db:seed` → populates terms
|
||||
[ ] Run `pnpm db:build-decks` → creates curated decks
|
||||
[x] Write `scripts/build_decks.ts` (reads external CEFR lists, matches to DB, creates decks)
|
||||
[x] Run `pnpm db:seed` → populates terms
|
||||
[x] Run `pnpm db:build-deck` → creates curated decks
|
||||
[ ] Define Zod response schemas in `packages/shared`
|
||||
[ ] Implement `DeckRepository.getTerms(deckId, limit, offset)`
|
||||
[ ] Implement `QuizService.attachDistractors(terms)` — same POS, server-side, no duplicates
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
CREATE TABLE "deck_terms" (
|
||||
"deck_id" uuid NOT NULL,
|
||||
"term_id" uuid NOT NULL,
|
||||
"position" integer NOT NULL,
|
||||
"added_at" timestamp with time zone DEFAULT now() NOT NULL,
|
||||
CONSTRAINT "deck_terms_deck_id_term_id_pk" PRIMARY KEY("deck_id","term_id")
|
||||
);
|
||||
|
|
@ -10,11 +9,14 @@ CREATE TABLE "decks" (
|
|||
"id" uuid PRIMARY KEY DEFAULT gen_random_uuid() NOT NULL,
|
||||
"name" text NOT NULL,
|
||||
"description" text,
|
||||
"language_pair_id" uuid NOT NULL,
|
||||
"created_by" uuid NOT NULL,
|
||||
"source_language" varchar(10) NOT NULL,
|
||||
"validated_languages" varchar(10)[] DEFAULT '{}' NOT NULL,
|
||||
"is_public" boolean DEFAULT false NOT NULL,
|
||||
"created_at" timestamp with time zone DEFAULT now() NOT NULL,
|
||||
CONSTRAINT "unique_deck_name" UNIQUE("name","created_by")
|
||||
CONSTRAINT "unique_deck_name" UNIQUE("name","source_language"),
|
||||
CONSTRAINT "source_language_check" CHECK ("decks"."source_language" IN ('en', 'it')),
|
||||
CONSTRAINT "validated_languages_check" CHECK (validated_languages <@ ARRAY['en', 'it']::varchar[]),
|
||||
CONSTRAINT "validated_languages_excludes_source" CHECK (NOT ("decks"."source_language" = ANY("decks"."validated_languages")))
|
||||
);
|
||||
--> statement-breakpoint
|
||||
CREATE TABLE "language_pairs" (
|
||||
|
|
@ -71,14 +73,10 @@ CREATE TABLE "users" (
|
|||
--> statement-breakpoint
|
||||
ALTER TABLE "deck_terms" ADD CONSTRAINT "deck_terms_deck_id_decks_id_fk" FOREIGN KEY ("deck_id") REFERENCES "public"."decks"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
|
||||
ALTER TABLE "deck_terms" ADD CONSTRAINT "deck_terms_term_id_terms_id_fk" FOREIGN KEY ("term_id") REFERENCES "public"."terms"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
|
||||
ALTER TABLE "decks" ADD CONSTRAINT "decks_language_pair_id_language_pairs_id_fk" FOREIGN KEY ("language_pair_id") REFERENCES "public"."language_pairs"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
|
||||
ALTER TABLE "decks" ADD CONSTRAINT "decks_created_by_users_id_fk" FOREIGN KEY ("created_by") REFERENCES "public"."users"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
|
||||
ALTER TABLE "term_glosses" ADD CONSTRAINT "term_glosses_term_id_terms_id_fk" FOREIGN KEY ("term_id") REFERENCES "public"."terms"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
|
||||
ALTER TABLE "translations" ADD CONSTRAINT "translations_term_id_terms_id_fk" FOREIGN KEY ("term_id") REFERENCES "public"."terms"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
|
||||
CREATE INDEX "idx_deck_terms_term" ON "deck_terms" USING btree ("term_id");--> statement-breakpoint
|
||||
CREATE INDEX "idx_decks_created_by" ON "decks" USING btree ("created_by");--> statement-breakpoint
|
||||
CREATE INDEX "idx_decks_language_pair" ON "decks" USING btree ("language_pair_id");--> statement-breakpoint
|
||||
CREATE INDEX "idx_pairs_active" ON "language_pairs" USING btree ("active","source_language","target_language");--> statement-breakpoint
|
||||
CREATE INDEX "idx_term_glosses_term" ON "term_glosses" USING btree ("term_id");--> statement-breakpoint
|
||||
CREATE INDEX "idx_terms_pos" ON "terms" USING btree ("pos");--> statement-breakpoint
|
||||
CREATE INDEX "idx_translations_lang" ON "translations" USING btree ("language_code","term_id");
|
||||
CREATE INDEX "idx_translations_lang" ON "translations" USING btree ("language_code","term_id");
|
||||
|
|
@ -1,13 +0,0 @@
|
|||
ALTER TABLE "decks" DROP CONSTRAINT "unique_deck_name";--> statement-breakpoint
|
||||
ALTER TABLE "decks" DROP CONSTRAINT "decks_language_pair_id_language_pairs_id_fk";
|
||||
--> statement-breakpoint
|
||||
ALTER TABLE "decks" DROP CONSTRAINT "decks_created_by_users_id_fk";
|
||||
--> statement-breakpoint
|
||||
DROP INDEX "idx_decks_created_by";--> statement-breakpoint
|
||||
DROP INDEX "idx_decks_language_pair";--> statement-breakpoint
|
||||
ALTER TABLE "decks" ADD COLUMN "validated_for_languages" varchar(10)[] DEFAULT '{}' NOT NULL;--> statement-breakpoint
|
||||
ALTER TABLE "deck_terms" DROP COLUMN "position";--> statement-breakpoint
|
||||
ALTER TABLE "decks" DROP COLUMN "language_pair_id";--> statement-breakpoint
|
||||
ALTER TABLE "decks" DROP COLUMN "created_by";--> statement-breakpoint
|
||||
ALTER TABLE "decks" ADD CONSTRAINT "unique_deck_name" UNIQUE("name");--> statement-breakpoint
|
||||
ALTER TABLE "decks" ADD CONSTRAINT "validated_languages_check" CHECK (validated_for_languages <@ ARRAY['en', 'it']::varchar[]);
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
{
|
||||
"id": "5830ce3b-dc0e-44a7-83d6-bc74016ca4fd",
|
||||
"id": "9ef7c86d-9e64-42d6-9731-2c1794ab063e",
|
||||
"prevId": "00000000-0000-0000-0000-000000000000",
|
||||
"version": "7",
|
||||
"dialect": "postgresql",
|
||||
|
|
@ -20,12 +20,6 @@
|
|||
"primaryKey": false,
|
||||
"notNull": true
|
||||
},
|
||||
"position": {
|
||||
"name": "position",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": true
|
||||
},
|
||||
"added_at": {
|
||||
"name": "added_at",
|
||||
"type": "timestamp with time zone",
|
||||
|
|
@ -56,8 +50,12 @@
|
|||
"name": "deck_terms_deck_id_decks_id_fk",
|
||||
"tableFrom": "deck_terms",
|
||||
"tableTo": "decks",
|
||||
"columnsFrom": ["deck_id"],
|
||||
"columnsTo": ["id"],
|
||||
"columnsFrom": [
|
||||
"deck_id"
|
||||
],
|
||||
"columnsTo": [
|
||||
"id"
|
||||
],
|
||||
"onDelete": "cascade",
|
||||
"onUpdate": "no action"
|
||||
},
|
||||
|
|
@ -65,8 +63,12 @@
|
|||
"name": "deck_terms_term_id_terms_id_fk",
|
||||
"tableFrom": "deck_terms",
|
||||
"tableTo": "terms",
|
||||
"columnsFrom": ["term_id"],
|
||||
"columnsTo": ["id"],
|
||||
"columnsFrom": [
|
||||
"term_id"
|
||||
],
|
||||
"columnsTo": [
|
||||
"id"
|
||||
],
|
||||
"onDelete": "cascade",
|
||||
"onUpdate": "no action"
|
||||
}
|
||||
|
|
@ -74,7 +76,10 @@
|
|||
"compositePrimaryKeys": {
|
||||
"deck_terms_deck_id_term_id_pk": {
|
||||
"name": "deck_terms_deck_id_term_id_pk",
|
||||
"columns": ["deck_id", "term_id"]
|
||||
"columns": [
|
||||
"deck_id",
|
||||
"term_id"
|
||||
]
|
||||
}
|
||||
},
|
||||
"uniqueConstraints": {},
|
||||
|
|
@ -105,17 +110,18 @@
|
|||
"primaryKey": false,
|
||||
"notNull": false
|
||||
},
|
||||
"language_pair_id": {
|
||||
"name": "language_pair_id",
|
||||
"type": "uuid",
|
||||
"source_language": {
|
||||
"name": "source_language",
|
||||
"type": "varchar(10)",
|
||||
"primaryKey": false,
|
||||
"notNull": true
|
||||
},
|
||||
"created_by": {
|
||||
"name": "created_by",
|
||||
"type": "uuid",
|
||||
"validated_languages": {
|
||||
"name": "validated_languages",
|
||||
"type": "varchar(10)[]",
|
||||
"primaryKey": false,
|
||||
"notNull": true
|
||||
"notNull": true,
|
||||
"default": "'{}'"
|
||||
},
|
||||
"is_public": {
|
||||
"name": "is_public",
|
||||
|
|
@ -132,68 +138,34 @@
|
|||
"default": "now()"
|
||||
}
|
||||
},
|
||||
"indexes": {
|
||||
"idx_decks_created_by": {
|
||||
"name": "idx_decks_created_by",
|
||||
"columns": [
|
||||
{
|
||||
"expression": "created_by",
|
||||
"isExpression": false,
|
||||
"asc": true,
|
||||
"nulls": "last"
|
||||
}
|
||||
],
|
||||
"isUnique": false,
|
||||
"concurrently": false,
|
||||
"method": "btree",
|
||||
"with": {}
|
||||
},
|
||||
"idx_decks_language_pair": {
|
||||
"name": "idx_decks_language_pair",
|
||||
"columns": [
|
||||
{
|
||||
"expression": "language_pair_id",
|
||||
"isExpression": false,
|
||||
"asc": true,
|
||||
"nulls": "last"
|
||||
}
|
||||
],
|
||||
"isUnique": false,
|
||||
"concurrently": false,
|
||||
"method": "btree",
|
||||
"with": {}
|
||||
}
|
||||
},
|
||||
"foreignKeys": {
|
||||
"decks_language_pair_id_language_pairs_id_fk": {
|
||||
"name": "decks_language_pair_id_language_pairs_id_fk",
|
||||
"tableFrom": "decks",
|
||||
"tableTo": "language_pairs",
|
||||
"columnsFrom": ["language_pair_id"],
|
||||
"columnsTo": ["id"],
|
||||
"onDelete": "cascade",
|
||||
"onUpdate": "no action"
|
||||
},
|
||||
"decks_created_by_users_id_fk": {
|
||||
"name": "decks_created_by_users_id_fk",
|
||||
"tableFrom": "decks",
|
||||
"tableTo": "users",
|
||||
"columnsFrom": ["created_by"],
|
||||
"columnsTo": ["id"],
|
||||
"onDelete": "cascade",
|
||||
"onUpdate": "no action"
|
||||
}
|
||||
},
|
||||
"indexes": {},
|
||||
"foreignKeys": {},
|
||||
"compositePrimaryKeys": {},
|
||||
"uniqueConstraints": {
|
||||
"unique_deck_name": {
|
||||
"name": "unique_deck_name",
|
||||
"nullsNotDistinct": false,
|
||||
"columns": ["name", "created_by"]
|
||||
"columns": [
|
||||
"name",
|
||||
"source_language"
|
||||
]
|
||||
}
|
||||
},
|
||||
"policies": {},
|
||||
"checkConstraints": {},
|
||||
"checkConstraints": {
|
||||
"source_language_check": {
|
||||
"name": "source_language_check",
|
||||
"value": "\"decks\".\"source_language\" IN ('en', 'it')"
|
||||
},
|
||||
"validated_languages_check": {
|
||||
"name": "validated_languages_check",
|
||||
"value": "validated_languages <@ ARRAY['en', 'it']::varchar[]"
|
||||
},
|
||||
"validated_languages_excludes_source": {
|
||||
"name": "validated_languages_excludes_source",
|
||||
"value": "NOT (\"decks\".\"source_language\" = ANY(\"decks\".\"validated_languages\"))"
|
||||
}
|
||||
},
|
||||
"isRLSEnabled": false
|
||||
},
|
||||
"public.language_pairs": {
|
||||
|
|
@ -275,7 +247,10 @@
|
|||
"unique_source_target": {
|
||||
"name": "unique_source_target",
|
||||
"nullsNotDistinct": false,
|
||||
"columns": ["source_language", "target_language"]
|
||||
"columns": [
|
||||
"source_language",
|
||||
"target_language"
|
||||
]
|
||||
}
|
||||
},
|
||||
"policies": {},
|
||||
|
|
@ -354,8 +329,12 @@
|
|||
"name": "term_glosses_term_id_terms_id_fk",
|
||||
"tableFrom": "term_glosses",
|
||||
"tableTo": "terms",
|
||||
"columnsFrom": ["term_id"],
|
||||
"columnsTo": ["id"],
|
||||
"columnsFrom": [
|
||||
"term_id"
|
||||
],
|
||||
"columnsTo": [
|
||||
"id"
|
||||
],
|
||||
"onDelete": "cascade",
|
||||
"onUpdate": "no action"
|
||||
}
|
||||
|
|
@ -365,7 +344,11 @@
|
|||
"unique_term_gloss": {
|
||||
"name": "unique_term_gloss",
|
||||
"nullsNotDistinct": false,
|
||||
"columns": ["term_id", "language_code", "text"]
|
||||
"columns": [
|
||||
"term_id",
|
||||
"language_code",
|
||||
"text"
|
||||
]
|
||||
}
|
||||
},
|
||||
"policies": {},
|
||||
|
|
@ -426,7 +409,9 @@
|
|||
"terms_synset_id_unique": {
|
||||
"name": "terms_synset_id_unique",
|
||||
"nullsNotDistinct": false,
|
||||
"columns": ["synset_id"]
|
||||
"columns": [
|
||||
"synset_id"
|
||||
]
|
||||
}
|
||||
},
|
||||
"policies": {},
|
||||
|
|
@ -503,8 +488,12 @@
|
|||
"name": "translations_term_id_terms_id_fk",
|
||||
"tableFrom": "translations",
|
||||
"tableTo": "terms",
|
||||
"columnsFrom": ["term_id"],
|
||||
"columnsTo": ["id"],
|
||||
"columnsFrom": [
|
||||
"term_id"
|
||||
],
|
||||
"columnsTo": [
|
||||
"id"
|
||||
],
|
||||
"onDelete": "cascade",
|
||||
"onUpdate": "no action"
|
||||
}
|
||||
|
|
@ -514,7 +503,11 @@
|
|||
"unique_translations": {
|
||||
"name": "unique_translations",
|
||||
"nullsNotDistinct": false,
|
||||
"columns": ["term_id", "language_code", "text"]
|
||||
"columns": [
|
||||
"term_id",
|
||||
"language_code",
|
||||
"text"
|
||||
]
|
||||
}
|
||||
},
|
||||
"policies": {},
|
||||
|
|
@ -571,17 +564,23 @@
|
|||
"users_openauth_sub_unique": {
|
||||
"name": "users_openauth_sub_unique",
|
||||
"nullsNotDistinct": false,
|
||||
"columns": ["openauth_sub"]
|
||||
"columns": [
|
||||
"openauth_sub"
|
||||
]
|
||||
},
|
||||
"users_email_unique": {
|
||||
"name": "users_email_unique",
|
||||
"nullsNotDistinct": false,
|
||||
"columns": ["email"]
|
||||
"columns": [
|
||||
"email"
|
||||
]
|
||||
},
|
||||
"users_display_name_unique": {
|
||||
"name": "users_display_name_unique",
|
||||
"nullsNotDistinct": false,
|
||||
"columns": ["display_name"]
|
||||
"columns": [
|
||||
"display_name"
|
||||
]
|
||||
}
|
||||
},
|
||||
"policies": {},
|
||||
|
|
@ -595,5 +594,9 @@
|
|||
"roles": {},
|
||||
"policies": {},
|
||||
"views": {},
|
||||
"_meta": { "columns": {}, "schemas": {}, "tables": {} }
|
||||
}
|
||||
"_meta": {
|
||||
"columns": {},
|
||||
"schemas": {},
|
||||
"tables": {}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,587 +0,0 @@
|
|||
{
|
||||
"id": "d6bed73d-ee69-44b1-a3ce-3ae25898a6f0",
|
||||
"prevId": "5830ce3b-dc0e-44a7-83d6-bc74016ca4fd",
|
||||
"version": "7",
|
||||
"dialect": "postgresql",
|
||||
"tables": {
|
||||
"public.deck_terms": {
|
||||
"name": "deck_terms",
|
||||
"schema": "",
|
||||
"columns": {
|
||||
"deck_id": {
|
||||
"name": "deck_id",
|
||||
"type": "uuid",
|
||||
"primaryKey": false,
|
||||
"notNull": true
|
||||
},
|
||||
"term_id": {
|
||||
"name": "term_id",
|
||||
"type": "uuid",
|
||||
"primaryKey": false,
|
||||
"notNull": true
|
||||
},
|
||||
"added_at": {
|
||||
"name": "added_at",
|
||||
"type": "timestamp with time zone",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"default": "now()"
|
||||
}
|
||||
},
|
||||
"indexes": {
|
||||
"idx_deck_terms_term": {
|
||||
"name": "idx_deck_terms_term",
|
||||
"columns": [
|
||||
{
|
||||
"expression": "term_id",
|
||||
"isExpression": false,
|
||||
"asc": true,
|
||||
"nulls": "last"
|
||||
}
|
||||
],
|
||||
"isUnique": false,
|
||||
"concurrently": false,
|
||||
"method": "btree",
|
||||
"with": {}
|
||||
}
|
||||
},
|
||||
"foreignKeys": {
|
||||
"deck_terms_deck_id_decks_id_fk": {
|
||||
"name": "deck_terms_deck_id_decks_id_fk",
|
||||
"tableFrom": "deck_terms",
|
||||
"tableTo": "decks",
|
||||
"columnsFrom": [
|
||||
"deck_id"
|
||||
],
|
||||
"columnsTo": [
|
||||
"id"
|
||||
],
|
||||
"onDelete": "cascade",
|
||||
"onUpdate": "no action"
|
||||
},
|
||||
"deck_terms_term_id_terms_id_fk": {
|
||||
"name": "deck_terms_term_id_terms_id_fk",
|
||||
"tableFrom": "deck_terms",
|
||||
"tableTo": "terms",
|
||||
"columnsFrom": [
|
||||
"term_id"
|
||||
],
|
||||
"columnsTo": [
|
||||
"id"
|
||||
],
|
||||
"onDelete": "cascade",
|
||||
"onUpdate": "no action"
|
||||
}
|
||||
},
|
||||
"compositePrimaryKeys": {
|
||||
"deck_terms_deck_id_term_id_pk": {
|
||||
"name": "deck_terms_deck_id_term_id_pk",
|
||||
"columns": [
|
||||
"deck_id",
|
||||
"term_id"
|
||||
]
|
||||
}
|
||||
},
|
||||
"uniqueConstraints": {},
|
||||
"policies": {},
|
||||
"checkConstraints": {},
|
||||
"isRLSEnabled": false
|
||||
},
|
||||
"public.decks": {
|
||||
"name": "decks",
|
||||
"schema": "",
|
||||
"columns": {
|
||||
"id": {
|
||||
"name": "id",
|
||||
"type": "uuid",
|
||||
"primaryKey": true,
|
||||
"notNull": true,
|
||||
"default": "gen_random_uuid()"
|
||||
},
|
||||
"name": {
|
||||
"name": "name",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": true
|
||||
},
|
||||
"description": {
|
||||
"name": "description",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": false
|
||||
},
|
||||
"validated_for_languages": {
|
||||
"name": "validated_for_languages",
|
||||
"type": "varchar(10)[]",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"default": "'{}'"
|
||||
},
|
||||
"is_public": {
|
||||
"name": "is_public",
|
||||
"type": "boolean",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"default": false
|
||||
},
|
||||
"created_at": {
|
||||
"name": "created_at",
|
||||
"type": "timestamp with time zone",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"default": "now()"
|
||||
}
|
||||
},
|
||||
"indexes": {},
|
||||
"foreignKeys": {},
|
||||
"compositePrimaryKeys": {},
|
||||
"uniqueConstraints": {
|
||||
"unique_deck_name": {
|
||||
"name": "unique_deck_name",
|
||||
"nullsNotDistinct": false,
|
||||
"columns": [
|
||||
"name"
|
||||
]
|
||||
}
|
||||
},
|
||||
"policies": {},
|
||||
"checkConstraints": {
|
||||
"validated_languages_check": {
|
||||
"name": "validated_languages_check",
|
||||
"value": "validated_for_languages <@ ARRAY['en', 'it']::varchar[]"
|
||||
}
|
||||
},
|
||||
"isRLSEnabled": false
|
||||
},
|
||||
"public.language_pairs": {
|
||||
"name": "language_pairs",
|
||||
"schema": "",
|
||||
"columns": {
|
||||
"id": {
|
||||
"name": "id",
|
||||
"type": "uuid",
|
||||
"primaryKey": true,
|
||||
"notNull": true,
|
||||
"default": "gen_random_uuid()"
|
||||
},
|
||||
"source_language": {
|
||||
"name": "source_language",
|
||||
"type": "varchar(10)",
|
||||
"primaryKey": false,
|
||||
"notNull": true
|
||||
},
|
||||
"target_language": {
|
||||
"name": "target_language",
|
||||
"type": "varchar(10)",
|
||||
"primaryKey": false,
|
||||
"notNull": true
|
||||
},
|
||||
"label": {
|
||||
"name": "label",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": false
|
||||
},
|
||||
"active": {
|
||||
"name": "active",
|
||||
"type": "boolean",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"default": true
|
||||
},
|
||||
"created_at": {
|
||||
"name": "created_at",
|
||||
"type": "timestamp with time zone",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"default": "now()"
|
||||
}
|
||||
},
|
||||
"indexes": {
|
||||
"idx_pairs_active": {
|
||||
"name": "idx_pairs_active",
|
||||
"columns": [
|
||||
{
|
||||
"expression": "active",
|
||||
"isExpression": false,
|
||||
"asc": true,
|
||||
"nulls": "last"
|
||||
},
|
||||
{
|
||||
"expression": "source_language",
|
||||
"isExpression": false,
|
||||
"asc": true,
|
||||
"nulls": "last"
|
||||
},
|
||||
{
|
||||
"expression": "target_language",
|
||||
"isExpression": false,
|
||||
"asc": true,
|
||||
"nulls": "last"
|
||||
}
|
||||
],
|
||||
"isUnique": false,
|
||||
"concurrently": false,
|
||||
"method": "btree",
|
||||
"with": {}
|
||||
}
|
||||
},
|
||||
"foreignKeys": {},
|
||||
"compositePrimaryKeys": {},
|
||||
"uniqueConstraints": {
|
||||
"unique_source_target": {
|
||||
"name": "unique_source_target",
|
||||
"nullsNotDistinct": false,
|
||||
"columns": [
|
||||
"source_language",
|
||||
"target_language"
|
||||
]
|
||||
}
|
||||
},
|
||||
"policies": {},
|
||||
"checkConstraints": {
|
||||
"source_language_check": {
|
||||
"name": "source_language_check",
|
||||
"value": "\"language_pairs\".\"source_language\" IN ('en', 'it')"
|
||||
},
|
||||
"target_language_check": {
|
||||
"name": "target_language_check",
|
||||
"value": "\"language_pairs\".\"target_language\" IN ('en', 'it')"
|
||||
},
|
||||
"no_self_pair": {
|
||||
"name": "no_self_pair",
|
||||
"value": "\"language_pairs\".\"source_language\" != \"language_pairs\".\"target_language\""
|
||||
}
|
||||
},
|
||||
"isRLSEnabled": false
|
||||
},
|
||||
"public.term_glosses": {
|
||||
"name": "term_glosses",
|
||||
"schema": "",
|
||||
"columns": {
|
||||
"id": {
|
||||
"name": "id",
|
||||
"type": "uuid",
|
||||
"primaryKey": true,
|
||||
"notNull": true,
|
||||
"default": "gen_random_uuid()"
|
||||
},
|
||||
"term_id": {
|
||||
"name": "term_id",
|
||||
"type": "uuid",
|
||||
"primaryKey": false,
|
||||
"notNull": true
|
||||
},
|
||||
"language_code": {
|
||||
"name": "language_code",
|
||||
"type": "varchar(10)",
|
||||
"primaryKey": false,
|
||||
"notNull": true
|
||||
},
|
||||
"text": {
|
||||
"name": "text",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": true
|
||||
},
|
||||
"created_at": {
|
||||
"name": "created_at",
|
||||
"type": "timestamp with time zone",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"default": "now()"
|
||||
}
|
||||
},
|
||||
"indexes": {
|
||||
"idx_term_glosses_term": {
|
||||
"name": "idx_term_glosses_term",
|
||||
"columns": [
|
||||
{
|
||||
"expression": "term_id",
|
||||
"isExpression": false,
|
||||
"asc": true,
|
||||
"nulls": "last"
|
||||
}
|
||||
],
|
||||
"isUnique": false,
|
||||
"concurrently": false,
|
||||
"method": "btree",
|
||||
"with": {}
|
||||
}
|
||||
},
|
||||
"foreignKeys": {
|
||||
"term_glosses_term_id_terms_id_fk": {
|
||||
"name": "term_glosses_term_id_terms_id_fk",
|
||||
"tableFrom": "term_glosses",
|
||||
"tableTo": "terms",
|
||||
"columnsFrom": [
|
||||
"term_id"
|
||||
],
|
||||
"columnsTo": [
|
||||
"id"
|
||||
],
|
||||
"onDelete": "cascade",
|
||||
"onUpdate": "no action"
|
||||
}
|
||||
},
|
||||
"compositePrimaryKeys": {},
|
||||
"uniqueConstraints": {
|
||||
"unique_term_gloss": {
|
||||
"name": "unique_term_gloss",
|
||||
"nullsNotDistinct": false,
|
||||
"columns": [
|
||||
"term_id",
|
||||
"language_code",
|
||||
"text"
|
||||
]
|
||||
}
|
||||
},
|
||||
"policies": {},
|
||||
"checkConstraints": {},
|
||||
"isRLSEnabled": false
|
||||
},
|
||||
"public.terms": {
|
||||
"name": "terms",
|
||||
"schema": "",
|
||||
"columns": {
|
||||
"id": {
|
||||
"name": "id",
|
||||
"type": "uuid",
|
||||
"primaryKey": true,
|
||||
"notNull": true,
|
||||
"default": "gen_random_uuid()"
|
||||
},
|
||||
"synset_id": {
|
||||
"name": "synset_id",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": true
|
||||
},
|
||||
"pos": {
|
||||
"name": "pos",
|
||||
"type": "varchar(20)",
|
||||
"primaryKey": false,
|
||||
"notNull": true
|
||||
},
|
||||
"created_at": {
|
||||
"name": "created_at",
|
||||
"type": "timestamp with time zone",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"default": "now()"
|
||||
}
|
||||
},
|
||||
"indexes": {
|
||||
"idx_terms_pos": {
|
||||
"name": "idx_terms_pos",
|
||||
"columns": [
|
||||
{
|
||||
"expression": "pos",
|
||||
"isExpression": false,
|
||||
"asc": true,
|
||||
"nulls": "last"
|
||||
}
|
||||
],
|
||||
"isUnique": false,
|
||||
"concurrently": false,
|
||||
"method": "btree",
|
||||
"with": {}
|
||||
}
|
||||
},
|
||||
"foreignKeys": {},
|
||||
"compositePrimaryKeys": {},
|
||||
"uniqueConstraints": {
|
||||
"terms_synset_id_unique": {
|
||||
"name": "terms_synset_id_unique",
|
||||
"nullsNotDistinct": false,
|
||||
"columns": [
|
||||
"synset_id"
|
||||
]
|
||||
}
|
||||
},
|
||||
"policies": {},
|
||||
"checkConstraints": {
|
||||
"pos_check": {
|
||||
"name": "pos_check",
|
||||
"value": "\"terms\".\"pos\" IN ('noun')"
|
||||
}
|
||||
},
|
||||
"isRLSEnabled": false
|
||||
},
|
||||
"public.translations": {
|
||||
"name": "translations",
|
||||
"schema": "",
|
||||
"columns": {
|
||||
"id": {
|
||||
"name": "id",
|
||||
"type": "uuid",
|
||||
"primaryKey": true,
|
||||
"notNull": true,
|
||||
"default": "gen_random_uuid()"
|
||||
},
|
||||
"term_id": {
|
||||
"name": "term_id",
|
||||
"type": "uuid",
|
||||
"primaryKey": false,
|
||||
"notNull": true
|
||||
},
|
||||
"language_code": {
|
||||
"name": "language_code",
|
||||
"type": "varchar(10)",
|
||||
"primaryKey": false,
|
||||
"notNull": true
|
||||
},
|
||||
"text": {
|
||||
"name": "text",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": true
|
||||
},
|
||||
"created_at": {
|
||||
"name": "created_at",
|
||||
"type": "timestamp with time zone",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"default": "now()"
|
||||
}
|
||||
},
|
||||
"indexes": {
|
||||
"idx_translations_lang": {
|
||||
"name": "idx_translations_lang",
|
||||
"columns": [
|
||||
{
|
||||
"expression": "language_code",
|
||||
"isExpression": false,
|
||||
"asc": true,
|
||||
"nulls": "last"
|
||||
},
|
||||
{
|
||||
"expression": "term_id",
|
||||
"isExpression": false,
|
||||
"asc": true,
|
||||
"nulls": "last"
|
||||
}
|
||||
],
|
||||
"isUnique": false,
|
||||
"concurrently": false,
|
||||
"method": "btree",
|
||||
"with": {}
|
||||
}
|
||||
},
|
||||
"foreignKeys": {
|
||||
"translations_term_id_terms_id_fk": {
|
||||
"name": "translations_term_id_terms_id_fk",
|
||||
"tableFrom": "translations",
|
||||
"tableTo": "terms",
|
||||
"columnsFrom": [
|
||||
"term_id"
|
||||
],
|
||||
"columnsTo": [
|
||||
"id"
|
||||
],
|
||||
"onDelete": "cascade",
|
||||
"onUpdate": "no action"
|
||||
}
|
||||
},
|
||||
"compositePrimaryKeys": {},
|
||||
"uniqueConstraints": {
|
||||
"unique_translations": {
|
||||
"name": "unique_translations",
|
||||
"nullsNotDistinct": false,
|
||||
"columns": [
|
||||
"term_id",
|
||||
"language_code",
|
||||
"text"
|
||||
]
|
||||
}
|
||||
},
|
||||
"policies": {},
|
||||
"checkConstraints": {},
|
||||
"isRLSEnabled": false
|
||||
},
|
||||
"public.users": {
|
||||
"name": "users",
|
||||
"schema": "",
|
||||
"columns": {
|
||||
"id": {
|
||||
"name": "id",
|
||||
"type": "uuid",
|
||||
"primaryKey": true,
|
||||
"notNull": true,
|
||||
"default": "gen_random_uuid()"
|
||||
},
|
||||
"openauth_sub": {
|
||||
"name": "openauth_sub",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": true
|
||||
},
|
||||
"email": {
|
||||
"name": "email",
|
||||
"type": "varchar(255)",
|
||||
"primaryKey": false,
|
||||
"notNull": false
|
||||
},
|
||||
"display_name": {
|
||||
"name": "display_name",
|
||||
"type": "varchar(100)",
|
||||
"primaryKey": false,
|
||||
"notNull": false
|
||||
},
|
||||
"created_at": {
|
||||
"name": "created_at",
|
||||
"type": "timestamp with time zone",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"default": "now()"
|
||||
},
|
||||
"last_login_at": {
|
||||
"name": "last_login_at",
|
||||
"type": "timestamp with time zone",
|
||||
"primaryKey": false,
|
||||
"notNull": false
|
||||
}
|
||||
},
|
||||
"indexes": {},
|
||||
"foreignKeys": {},
|
||||
"compositePrimaryKeys": {},
|
||||
"uniqueConstraints": {
|
||||
"users_openauth_sub_unique": {
|
||||
"name": "users_openauth_sub_unique",
|
||||
"nullsNotDistinct": false,
|
||||
"columns": [
|
||||
"openauth_sub"
|
||||
]
|
||||
},
|
||||
"users_email_unique": {
|
||||
"name": "users_email_unique",
|
||||
"nullsNotDistinct": false,
|
||||
"columns": [
|
||||
"email"
|
||||
]
|
||||
},
|
||||
"users_display_name_unique": {
|
||||
"name": "users_display_name_unique",
|
||||
"nullsNotDistinct": false,
|
||||
"columns": [
|
||||
"display_name"
|
||||
]
|
||||
}
|
||||
},
|
||||
"policies": {},
|
||||
"checkConstraints": {},
|
||||
"isRLSEnabled": false
|
||||
}
|
||||
},
|
||||
"enums": {},
|
||||
"schemas": {},
|
||||
"sequences": {},
|
||||
"roles": {},
|
||||
"policies": {},
|
||||
"views": {},
|
||||
"_meta": {
|
||||
"columns": {},
|
||||
"schemas": {},
|
||||
"tables": {}
|
||||
}
|
||||
}
|
||||
|
|
@ -5,15 +5,8 @@
|
|||
{
|
||||
"idx": 0,
|
||||
"version": "7",
|
||||
"when": 1774721919883,
|
||||
"tag": "0000_bitter_turbo",
|
||||
"breakpoints": true
|
||||
},
|
||||
{
|
||||
"idx": 1,
|
||||
"version": "7",
|
||||
"when": 1774970553186,
|
||||
"tag": "0001_medical_fabian_cortez",
|
||||
"when": 1775053965903,
|
||||
"tag": "0000_faithful_oracle",
|
||||
"breakpoints": true
|
||||
}
|
||||
]
|
||||
|
|
|
|||
|
|
@ -6,7 +6,9 @@
|
|||
"scripts": {
|
||||
"build": "tsc",
|
||||
"generate": "drizzle-kit generate",
|
||||
"migrate": "drizzle-kit migrate"
|
||||
"migrate": "drizzle-kit migrate",
|
||||
"db:seed": "npx tsx src/seeding-datafiles.ts",
|
||||
"db:build-deck": "npx tsx src/generating-deck.ts"
|
||||
},
|
||||
"dependencies": {
|
||||
"@glossa/shared": "workspace:*",
|
||||
|
|
|
|||
34
packages/db/src/data/wordlists/top1000englishnouns-missing
Normal file
34
packages/db/src/data/wordlists/top1000englishnouns-missing
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
a
|
||||
other
|
||||
us
|
||||
may
|
||||
st
|
||||
paul
|
||||
new
|
||||
software
|
||||
oxford
|
||||
english
|
||||
mary
|
||||
japan
|
||||
while
|
||||
pp
|
||||
membership
|
||||
manchester
|
||||
tony
|
||||
alan
|
||||
jones
|
||||
un
|
||||
northern
|
||||
simon
|
||||
behalf
|
||||
co
|
||||
graham
|
||||
joe
|
||||
guy
|
||||
lewis
|
||||
jane
|
||||
taylor
|
||||
co-operation
|
||||
travel
|
||||
self
|
||||
thatcher
|
||||
|
|
@ -144,11 +144,11 @@ export const decks = pgTable(
|
|||
),
|
||||
check(
|
||||
"validated_languages_check",
|
||||
sql`validated_for_languages <@ ARRAY[${sql.raw(SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", "))}]::varchar[]`,
|
||||
sql`validated_languages <@ ARRAY[${sql.raw(SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", "))}]::varchar[]`,
|
||||
),
|
||||
check(
|
||||
"validated_languages_excludes_source",
|
||||
sql`NOT (${table.source_language} = ANY(validated_for_languages))`,
|
||||
sql`NOT (${table.source_language} = ANY(${table.validated_languages}))`,
|
||||
),
|
||||
unique("unique_deck_name").on(table.name, table.source_language),
|
||||
],
|
||||
|
|
|
|||
302
packages/db/src/generating-deck.ts
Normal file
302
packages/db/src/generating-deck.ts
Normal file
|
|
@ -0,0 +1,302 @@
|
|||
/*
|
||||
*
|
||||
* Builds the "top English nouns" deck from a curated wordlist of the 1000 most
|
||||
* frequently used English nouns. The deck has English as its source language —
|
||||
* meaning it was curated from an English-centric frequency list, and a separate
|
||||
* deck would be needed for other source languages. For each word in the list, all
|
||||
* matching term IDs are looked up in the database via the translations table
|
||||
* (language: "en", POS: "noun") — homonyms are intentionally included as separate
|
||||
* cards since the quiz UI displays a gloss alongside each word. Words from the
|
||||
* list that have no DB match are skipped and written to a file for future
|
||||
* reference. The script is idempotent: if the deck already exists, only terms
|
||||
* present in the source but missing from the deck are added; terms already in the
|
||||
* deck are left untouched; terms in the deck but absent from the source are never
|
||||
* removed. After resolving all matched terms, the script determines
|
||||
* validated_languages by checking which languages — excluding the source
|
||||
* language — have full translation coverage across all matched terms, and updates
|
||||
* the array on every run.
|
||||
*/
|
||||
|
||||
/*
|
||||
* roadmap
|
||||
*
|
||||
* [x] Setup - hardcoded path, name, description, source language, POS
|
||||
* [x] Read wordlist - load and deduplicate the 1000 nouns
|
||||
* [x] Query terms - match to database, collect all term IDs per word (including homonyms)
|
||||
* [x] Write missing words to file for future reference
|
||||
* [x] Determine validated_languages - find languages (excluding source) with full coverage across all matched terms
|
||||
* [x] Check idempotency - if deck exists, diff matched terms against existing deck_terms
|
||||
* [x] Create deck if it doesn't exist - insert with name, source_language, validated_languages
|
||||
* [x] Add new terms - insert only term IDs present in source but missing from deck
|
||||
* [x] Update validated_languages - recalculate and update on every run
|
||||
* [x] Report - summary of words found, missing, added, and validated languages
|
||||
*/
|
||||
|
||||
import fs from "node:fs/promises";
|
||||
import { db } from "@glossa/db";
|
||||
import { translations, terms, decks, deck_terms } from "@glossa/db/schema";
|
||||
import { inArray, and, eq } from "drizzle-orm";
|
||||
import { SUPPORTED_LANGUAGE_CODES } from "@glossa/shared";
|
||||
|
||||
const pathToWordlist = "./src/data/wordlists/top1000englishnouns";
|
||||
const nameOfDeck = "top english nouns";
|
||||
const descriptionOfDeck =
|
||||
"Most frequently used English nouns for vocabulary practice";
|
||||
const sourceLanguage = "en";
|
||||
const sourcePOS = "noun";
|
||||
|
||||
// Loads the wordlist file and returns its entries as a de-duplicated array.
// Every line is trimmed and lower-cased and empty lines are dropped. A Set
// silently ignores values it already holds, so if "bank" appears twice in the
// file the resulting array contains it only once.
const readingFromWordlist = async () => {
  const fileContents = await fs.readFile(pathToWordlist, "utf8");

  const uniqueWords = new Set<string>();
  for (const line of fileContents.split("\n")) {
    const normalized = line.trim().toLowerCase();
    if (normalized) {
      uniqueWords.add(normalized);
    }
  }

  return Array.from(uniqueWords);
};
|
||||
|
||||
// Resolves the wordlist against the database.
//
// Looks up every translation row whose text is one of `words`, whose language
// is the source language and whose term has the source part of speech.
//
// Returns:
//   termIds      - the unique IDs of all matched terms; a word may contribute
//                  several IDs (homonyms)
//   missingWords - the entries of `words` for which no row was found
const checkingSourceWordsAgainstDB = async (words: string[]) => {
  // Nothing to look up: skip the query instead of sending an empty IN () list,
  // which some drizzle-orm versions reject with an error.
  if (words.length === 0) {
    return { termIds: [] as string[], missingWords: [] as string[] };
  }

  // NOTE(review): the comparison on translations.text is done by the database
  // as-is, while `words` arrive lower-cased from the wordlist reader — rows
  // stored with capitals would presumably not match; confirm against the data.
  const rows = await db
    .select({ text: translations.text, termId: translations.term_id })
    .from(translations)
    .innerJoin(terms, eq(translations.term_id, terms.id))
    .where(
      and(
        inArray(translations.text, words),
        eq(translations.language_code, sourceLanguage),
        eq(terms.pos, sourcePOS),
      ),
    );

  // Group term IDs per (lower-cased) word, appending in place rather than
  // copying the array for every row.
  const wordToTermIds = new Map<string, string[]>();
  for (const row of rows) {
    const word = row.text.toLowerCase();
    const idsForWord = wordToTermIds.get(word);
    if (idsForWord) {
      idsForWord.push(row.termId);
    } else {
      wordToTermIds.set(word, [row.termId]);
    }
  }

  // The same term can be reached through more than one word, so de-duplicate.
  const termIds = [...new Set(Array.from(wordToTermIds.values()).flat())];
  const missingWords = words.filter((w) => !wordToTermIds.has(w));

  return { termIds, missingWords };
};
|
||||
|
||||
// Stores the words that had no DB match for future reference. The file sits
// next to the wordlist (same path plus a "-missing" suffix) and holds one word
// per line.
const writeMissingWordsToFile = async (missingWords: string[]) => {
  const fileBody = missingWords.join("\n");
  await fs.writeFile(pathToWordlist + "-missing", fileBody, "utf8");
};
|
||||
|
||||
// Determines which supported languages fully cover the given terms.
//
// sourceLanguage - language code to leave out of the check
// termIds        - IDs of the terms that must all be translated
//
// Returns the language codes (source language excluded) for which every term
// in `termIds` has at least one translation. An empty `termIds` yields an
// empty result: with nothing to cover, no language is reported as validated.
const validateLanguages = async (sourceLanguage: string, termIds: string[]) => {
  const validatedLanguages: string[] = [];

  // Coverage is decided by comparing counts of distinct term IDs, so the
  // reference list must be free of duplicates as well.
  const uniqueTermIds = [...new Set(termIds)];
  if (uniqueTermIds.length === 0) {
    // Also avoids sending an empty IN () list, which some drizzle-orm
    // versions reject with an error.
    return validatedLanguages;
  }

  // Every supported language except the source language is a candidate.
  const candidateLanguages = SUPPORTED_LANGUAGE_CODES.filter(
    (language) => language !== sourceLanguage,
  );

  for (const language of candidateLanguages) {
    // Distinct term IDs that have a translation in this language.
    const rows = await db
      .selectDistinct({ termId: translations.term_id })
      .from(translations)
      .where(
        and(
          inArray(translations.term_id, uniqueTermIds),
          eq(translations.language_code, language),
        ),
      );

    // Same count as the reference list → full coverage → include the language.
    if (rows.length === uniqueTermIds.length) {
      validatedLanguages.push(language);
    }
  }

  return validatedLanguages;
};
|
||||
|
||||
// Idempotency check: looks the deck up by its name and source language so a
// re-run can reuse it. Resolves to the first matching row
// ({ id, validatedForLanguages }) or to null when there is none.
const findExistingDeck = async () => {
  const isTargetDeck = and(
    eq(decks.name, nameOfDeck),
    eq(decks.source_language, sourceLanguage),
  );

  const [firstMatch] = await db
    .select({ id: decks.id, validatedForLanguages: decks.validated_languages })
    .from(decks)
    .where(isTargetDeck);

  return firstMatch ?? null;
};
|
||||
|
||||
// Logs the translation coverage per language across all matched terms.
//
// For every supported language other than the source language this prints how
// many of `termIds` have a translation, how many do not, and the
// source-language words of the terms that do not. Nothing is printed when
// `termIds` is empty.
const logLanguageCoverage = async (termIds: string[]) => {
  // No terms means nothing to report; this also avoids an empty IN () list,
  // which some drizzle-orm versions reject with an error.
  if (termIds.length === 0) return;

  const languages = SUPPORTED_LANGUAGE_CODES.filter(
    (language) => language !== sourceLanguage,
  );

  for (const language of languages) {
    const rows = await db
      .selectDistinct({ termId: translations.term_id })
      .from(translations)
      .where(
        and(
          inArray(translations.term_id, termIds),
          eq(translations.language_code, language),
        ),
      );
    console.log(
      ` ${language}: ${rows.length} / ${termIds.length} terms covered`,
    );

    const coveredIds = new Set(rows.map((r) => r.termId));
    const missingTermIds = termIds.filter((id) => !coveredIds.has(id));
    console.log(` missing term IDs count:`, missingTermIds.length);

    // A fully covered language has no missing terms; skip the lookup then
    // rather than querying with an empty ID list.
    const missingWords: string[] = [];
    if (missingTermIds.length > 0) {
      // Show the missing terms as words of the deck's source language.
      const missingSourceRows = await db
        .selectDistinct({ text: translations.text })
        .from(translations)
        .where(
          and(
            inArray(translations.term_id, missingTermIds),
            eq(translations.language_code, sourceLanguage),
          ),
        );
      for (const row of missingSourceRows) {
        missingWords.push(row.text);
      }
    }
    console.log(` missing words in ${language}:`, missingWords, "\n");
  }
};
|
||||
|
||||
// Inserts the deck row (non-public, with the given validated languages) and
// resolves to the id of the new deck. Throws when the insert returns no row.
const createDeck = async (validatedLanguages: string[]) => {
  const newDeck = {
    name: nameOfDeck,
    description: descriptionOfDeck,
    source_language: sourceLanguage,
    validated_languages: validatedLanguages,
    is_public: false,
  };

  const [insertedDeck] = await db
    .insert(decks)
    .values(newDeck)
    .returning({ id: decks.id });

  if (!insertedDeck) {
    throw new Error("Failed to create deck: no row returned");
  }
  return insertedDeck.id;
};
|
||||
|
||||
// Adds to the deck only those terms it does not contain yet.
//
// Reads the deck's current deck_terms rows, filters `termIds` down to the IDs
// that are not among them and inserts one row per remaining ID. Resolves to
// the number of rows inserted (0 when there was nothing to add).
const addTermsToDeck = async (
  deckId: string,
  termIds: string[],
): Promise<number> => {
  const currentRows = await db
    .select({ termId: deck_terms.term_id })
    .from(deck_terms)
    .where(eq(deck_terms.deck_id, deckId));

  const alreadyInDeck = new Set<string>();
  for (const { termId } of currentRows) {
    alreadyInDeck.add(termId);
  }

  const rowsToInsert = termIds
    .filter((termId) => !alreadyInDeck.has(termId))
    .map((termId) => ({ deck_id: deckId, term_id: termId }));

  if (rowsToInsert.length > 0) {
    await db.insert(deck_terms).values(rowsToInsert);
  }
  return rowsToInsert.length;
};
|
||||
|
||||
// Overwrites the deck's validated_languages with the freshly computed list, so
// the stored value keeps up as translation coverage grows between runs.
const updateValidatedLanguages = async (
  deckId: string,
  validatedLanguages: string[],
): Promise<void> => {
  const targetDeck = eq(decks.id, deckId);
  const changes = { validated_languages: validatedLanguages };

  await db.update(decks).set(changes).where(targetDeck);
};
|
||||
|
||||
// Runs the whole deck build, logging progress along the way:
//   1. read the wordlist and match it against the database
//   2. write the unmatched words to a file
//   3. compute the validated languages and print the coverage breakdown
//   4. reuse the existing deck or create it
//   5. add the terms the deck does not contain yet
//   6. refresh validated_languages on a pre-existing deck (a new deck already
//      received the current value when it was created)
//   7. print a summary
const main = async () => {
  console.log("📖 Reading word list...");
  const sourceWords = await readingFromWordlist();
  console.log(` ${sourceWords.length} words loaded\n`);

  console.log("🔍 Checking against database...");
  const lookup = await checkingSourceWordsAgainstDB(sourceWords);
  const { termIds, missingWords } = lookup;
  console.log(` ${termIds.length} terms found`);
  console.log(` ${missingWords.length} words not found in DB\n`);

  console.log("🖊️ Writing missing words to file...\n");
  await writeMissingWordsToFile(missingWords);

  console.log("✅ Validating languages...");
  const validatedLanguages = await validateLanguages(sourceLanguage, termIds);
  const validatedAsJson = JSON.stringify(validatedLanguages);
  console.log(` Validated languages: ${validatedAsJson}\n`);

  console.log("🔬 Language coverage breakdown...");
  await logLanguageCoverage(termIds);

  console.log("🃏 Looking for existing deck...");
  const existingDeck = await findExistingDeck();

  let deckId: string;
  if (existingDeck) {
    console.log(` Found existing deck with id: ${existingDeck.id}\n`);
    deckId = existingDeck.id;
  } else {
    console.log(" No existing deck found, will create one\n");
    console.log("🆕 Creating deck...");
    deckId = await createDeck(validatedLanguages);
    console.log(` Deck created with id: ${deckId}\n`);
  }
  const isNewDeck = !existingDeck;

  console.log("➕ Adding terms to deck...");
  const addedCount = await addTermsToDeck(deckId, termIds);
  const alreadyPresentCount = termIds.length - addedCount;
  console.log(` ${addedCount} terms added`);
  console.log(` ${alreadyPresentCount} terms already in deck\n`);

  // A deck created during this run was inserted with the current value.
  if (!isNewDeck) {
    console.log("🔄 Updating validated languages...");
    await updateValidatedLanguages(deckId, validatedLanguages);
    console.log(` Updated to: ${JSON.stringify(validatedLanguages)}\n`);
  }

  const divider = "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━";
  const matchedWordCount = sourceWords.length - missingWords.length;
  const validatedSummary =
    validatedLanguages.length > 0 ? validatedLanguages.join(", ") : "none";

  const report = [
    divider,
    "📊 Summary",
    divider,
    ` Words loaded from wordlist : ${sourceWords.length}`,
    ` Words matched in DB : ${matchedWordCount}`,
    ` Words not found in DB : ${missingWords.length}`,
    ` Term IDs resolved : ${termIds.length}`,
    ` Terms added to deck : ${addedCount}`,
    ` Terms already in deck : ${alreadyPresentCount}`,
    ` Validated languages : ${validatedSummary}`,
    divider,
  ];
  for (const line of report) {
    console.log(line);
  }
};
|
||||
|
||||
// Script entry point: a rejected run is printed to stderr and ends the process
// with exit code 1.
main().catch((failure) => {
  console.error(failure);
  process.exit(1);
});
|
||||
|
|
@ -1,162 +0,0 @@
|
|||
/*
|
||||
*
|
||||
* Builds the "top English nouns" deck from a curated wordlist of the 1000 most
|
||||
* frequently used English nouns. The deck has English as its source language —
|
||||
* meaning it was curated from an English-centric frequency list, and a separate
|
||||
* deck would be needed for other source languages. For each word in the list, all
|
||||
* matching term IDs are looked up in the database via the translations table
|
||||
* (language: "en", POS: "noun") — homonyms are intentionally included as separate
|
||||
* cards since the quiz UI displays a gloss alongside each word. Words from the
|
||||
* list that have no DB match are skipped and written to a file for future
|
||||
* reference. The script is idempotent: if the deck already exists, only terms
|
||||
* present in the source but missing from the deck are added; terms already in the
|
||||
* deck are left untouched; terms in the deck but absent from the source are never
|
||||
* removed. After resolving all matched terms, the script determines
|
||||
* validated_languages by checking which languages — excluding the source
|
||||
* language — have full translation coverage across all matched terms, and updates
|
||||
* the array on every run.
|
||||
*/
|
||||
|
||||
/*
|
||||
* roadmap
|
||||
*
|
||||
* [x] Setup — hardcoded path, name, description, source language, POS
|
||||
* [x] Read wordlist — load and deduplicate the 1000 nouns
|
||||
* [x] Query terms — match to database, collect all term IDs per word (including homonyms)
|
||||
* [x] Write missing words to file for future reference
|
||||
* [x] Determine validated_for_languages — find languages (excluding source) with full coverage across all matched terms
|
||||
* [ ] Check idempotency — if deck exists, diff matched terms against existing deck_terms
|
||||
* [ ] Create deck if it doesn't exist — insert with name, source_language, validated_for_languages
|
||||
* [ ] Add new terms — insert only term IDs present in source but missing from deck
|
||||
* [ ] Update validated_for_languages — recalculate and update on every run
|
||||
* [ ] Report — summary of words found, missing, added, and validated languages
|
||||
*/
|
||||
|
||||
import fs from "node:fs/promises";
|
||||
import { db } from "@glossa/db";
|
||||
import { translations, terms, decks } from "@glossa/db/schema";
|
||||
import { inArray, and, eq } from "drizzle-orm";
|
||||
import { SUPPORTED_LANGUAGE_CODES } from "@glossa/shared";
|
||||
|
||||
const pathToWordlist = "./src/data/wordlists/top1000englishnouns";
|
||||
const nameOfDeck = "top english nouns";
|
||||
const descriptionOfDeck =
|
||||
"Most frequently used English nouns for vocabulary practice";
|
||||
const sourceLanguage = "en";
|
||||
const sourcePOS = "noun";
|
||||
|
||||
// Reads the wordlist and returns its words without duplicates. Lines are
// trimmed and lower-cased, blank lines are skipped, and a Set drops repeated
// values — so if "bank" appears twice in the file, the resulting array
// contains it only once.
const readingFromWordlist = async () => {
  const fileContents = await fs.readFile(pathToWordlist, "utf8");
  const lines = fileContents.split("\n");

  const seen = new Set<string>();
  for (const line of lines) {
    const word = line.trim().toLowerCase();
    if (word.length > 0) {
      seen.add(word);
    }
  }

  return [...seen];
};
|
||||
|
||||
// Resolves the wordlist against the database.
//
// Looks up every translation row whose text is one of `words`, whose language
// is the source language and whose term has the source part of speech.
//
// Returns:
//   termIds      - the unique IDs of all matched terms; a word may contribute
//                  several IDs (homonyms)
//   missingWords - the entries of `words` for which no row was found
const checkingSourceWordsAgainstDB = async (words: string[]) => {
  // Nothing to look up: skip the query instead of sending an empty IN () list,
  // which some drizzle-orm versions reject with an error.
  if (words.length === 0) {
    return { termIds: [] as string[], missingWords: [] as string[] };
  }

  const rows = await db
    .select({ text: translations.text, termId: translations.term_id })
    .from(translations)
    .innerJoin(terms, eq(translations.term_id, terms.id))
    .where(
      and(
        inArray(translations.text, words),
        eq(translations.language_code, sourceLanguage),
        eq(terms.pos, sourcePOS),
      ),
    );

  // Group term IDs per (lower-cased) word, appending in place rather than
  // copying the array for every row.
  const wordToTermIds = new Map<string, string[]>();
  for (const row of rows) {
    const word = row.text.toLowerCase();
    const idsForWord = wordToTermIds.get(word);
    if (idsForWord) {
      idsForWord.push(row.termId);
    } else {
      wordToTermIds.set(word, [row.termId]);
    }
  }

  // The same term can be reached through more than one word; without
  // de-duplication it would be listed twice, skewing every count derived from
  // this list.
  const termIds = [...new Set(Array.from(wordToTermIds.values()).flat())];
  const missingWords = words.filter((w) => !wordToTermIds.has(w));

  return { termIds, missingWords };
};
|
||||
|
||||
// Writes the unmatched words, one per line, to a file named after the wordlist
// with a "-missing" suffix, so they can be looked at later.
const writeMissingWordsToFile = async (missingWords: string[]) => {
  const target = [pathToWordlist, "missing"].join("-");
  const contents = missingWords.join("\n");
  await fs.writeFile(target, contents, "utf8");
};
|
||||
|
||||
// Determines which supported languages fully cover the given terms.
//
// sourceLanguage - language code to leave out of the check
// termIds        - IDs of the terms that must all be translated
//
// Returns the language codes (source language excluded) for which every term
// in `termIds` has at least one translation. An empty `termIds` yields an
// empty result: with nothing to cover, no language is reported as validated.
const validateLanguages = async (sourceLanguage: string, termIds: string[]) => {
  const validatedLanguages: string[] = [];

  // Coverage is decided by comparing counts of distinct term IDs, so the
  // reference list must be free of duplicates as well.
  const uniqueTermIds = [...new Set(termIds)];
  if (uniqueTermIds.length === 0) {
    // Also avoids sending an empty IN () list, which some drizzle-orm
    // versions reject with an error.
    return validatedLanguages;
  }

  // Every supported language except the source language is a candidate.
  const candidateLanguages = SUPPORTED_LANGUAGE_CODES.filter(
    (language) => language !== sourceLanguage,
  );

  for (const language of candidateLanguages) {
    // selectDistinct returns each term ID once. A plain select would return
    // one row per translation, so a term with several translations in this
    // language would inflate the count and the equality below would fail.
    const rows = await db
      .selectDistinct({ termId: translations.term_id })
      .from(translations)
      .where(
        and(
          inArray(translations.term_id, uniqueTermIds),
          eq(translations.language_code, language),
        ),
      );

    // Same count as the reference list → full coverage → include the language.
    if (rows.length === uniqueTermIds.length) {
      validatedLanguages.push(language);
    }
  }

  return validatedLanguages;
};
|
||||
|
||||
// Fetches the deck that has the configured name and source language. Resolves
// to its first matching row ({ id, validatedForLanguages }) or to null when no
// such deck is stored.
const findExistingDeck = async () => {
  const sameName = eq(decks.name, nameOfDeck);
  const sameSourceLanguage = eq(decks.source_language, sourceLanguage);

  const matches = await db
    .select({ id: decks.id, validatedForLanguages: decks.validated_languages })
    .from(decks)
    .where(and(sameName, sameSourceLanguage));

  const [firstMatch] = matches;
  return firstMatch ?? null;
};
|
||||
|
||||
// Runs the implemented part of the deck build and logs each step: read the
// wordlist, match it against the database, write the unmatched words to a
// file, and compute the validated languages.
const main = async () => {
  // reading from source file
  console.log("📖 Reading word list...");
  const sourceWords = await readingFromWordlist();
  console.log(` ${sourceWords.length} words loaded\n`);

  // checking if sourceWords exist in database
  console.log("🔍 Checking against database...");
  const { termIds, missingWords } =
    await checkingSourceWordsAgainstDB(sourceWords);
  // termIds counts matched terms, not words: one word can resolve to several
  // terms (homonyms), so the label says "terms".
  console.log("terms found in db: ", termIds.length);
  console.log("words NOT found in db: ", missingWords.length, "\n");

  // writing missing words to file
  console.log("writing missing words to file...\n");
  await writeMissingWordsToFile(missingWords);

  // validating languages
  console.log("validating languages...");
  const validatedLanguages = await validateLanguages(sourceLanguage, termIds);
  console.log("validated these languages: ", validatedLanguages, "\n");
};
|
||||
|
||||
// Script entry point: when the run rejects, print the error and terminate with
// exit code 1.
main().catch((failure) => {
  console.error(failure);
  process.exit(1);
});
|
||||
Loading…
Add table
Add a link
Reference in a new issue