diff --git a/packages/db/drizzle/0001_medical_fabian_cortez.sql b/packages/db/drizzle/0001_medical_fabian_cortez.sql new file mode 100644 index 0000000..f202780 --- /dev/null +++ b/packages/db/drizzle/0001_medical_fabian_cortez.sql @@ -0,0 +1,13 @@ +ALTER TABLE "decks" DROP CONSTRAINT "unique_deck_name";--> statement-breakpoint +ALTER TABLE "decks" DROP CONSTRAINT "decks_language_pair_id_language_pairs_id_fk"; +--> statement-breakpoint +ALTER TABLE "decks" DROP CONSTRAINT "decks_created_by_users_id_fk"; +--> statement-breakpoint +DROP INDEX "idx_decks_created_by";--> statement-breakpoint +DROP INDEX "idx_decks_language_pair";--> statement-breakpoint +ALTER TABLE "decks" ADD COLUMN "validated_for_languages" varchar(10)[] DEFAULT '{}' NOT NULL;--> statement-breakpoint +ALTER TABLE "deck_terms" DROP COLUMN "position";--> statement-breakpoint +ALTER TABLE "decks" DROP COLUMN "language_pair_id";--> statement-breakpoint +ALTER TABLE "decks" DROP COLUMN "created_by";--> statement-breakpoint +ALTER TABLE "decks" ADD CONSTRAINT "unique_deck_name" UNIQUE("name");--> statement-breakpoint +ALTER TABLE "decks" ADD CONSTRAINT "validated_languages_check" CHECK (validated_for_languages <@ ARRAY['en', 'it']::varchar[]); \ No newline at end of file diff --git a/packages/db/drizzle/meta/0001_snapshot.json b/packages/db/drizzle/meta/0001_snapshot.json new file mode 100644 index 0000000..41a8dbd --- /dev/null +++ b/packages/db/drizzle/meta/0001_snapshot.json @@ -0,0 +1,587 @@ +{ + "id": "d6bed73d-ee69-44b1-a3ce-3ae25898a6f0", + "prevId": "5830ce3b-dc0e-44a7-83d6-bc74016ca4fd", + "version": "7", + "dialect": "postgresql", + "tables": { + "public.deck_terms": { + "name": "deck_terms", + "schema": "", + "columns": { + "deck_id": { + "name": "deck_id", + "type": "uuid", + "primaryKey": false, + "notNull": true + }, + "term_id": { + "name": "term_id", + "type": "uuid", + "primaryKey": false, + "notNull": true + }, + "added_at": { + "name": "added_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_deck_terms_term": { + "name": "idx_deck_terms_term", + "columns": [ + { + "expression": "term_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "deck_terms_deck_id_decks_id_fk": { + "name": "deck_terms_deck_id_decks_id_fk", + "tableFrom": "deck_terms", + "tableTo": "decks", + "columnsFrom": [ + "deck_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "deck_terms_term_id_terms_id_fk": { + "name": "deck_terms_term_id_terms_id_fk", + "tableFrom": "deck_terms", + "tableTo": "terms", + "columnsFrom": [ + "term_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": { + "deck_terms_deck_id_term_id_pk": { + "name": "deck_terms_deck_id_term_id_pk", + "columns": [ + "deck_id", + "term_id" + ] + } + }, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.decks": { + "name": "decks", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "uuid", + "primaryKey": true, + "notNull": true, + "default": "gen_random_uuid()" + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "description": { + "name": "description", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "validated_for_languages": { + "name": "validated_for_languages", + "type": "varchar(10)[]", + "primaryKey": false, + "notNull": true, + "default": "'{}'" + }, + "is_public": { + "name": "is_public", + "type": "boolean", + "primaryKey": false, + "notNull": true, + "default": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "unique_deck_name": { + "name": "unique_deck_name", + "nullsNotDistinct": false, + "columns": [ + "name" + ] + } + }, + "policies": {}, + "checkConstraints": { + "validated_languages_check": { + "name": "validated_languages_check", + "value": "validated_for_languages <@ ARRAY['en', 'it']::varchar[]" + } + }, + "isRLSEnabled": false + }, + "public.language_pairs": { + "name": "language_pairs", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "uuid", + "primaryKey": true, + "notNull": true, + "default": "gen_random_uuid()" + }, + "source_language": { + "name": "source_language", + "type": "varchar(10)", + "primaryKey": false, + "notNull": true + }, + "target_language": { + "name": "target_language", + "type": "varchar(10)", + "primaryKey": false, + "notNull": true + }, + "label": { + "name": "label", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "active": { + "name": "active", + "type": "boolean", + "primaryKey": false, + "notNull": true, + "default": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_pairs_active": { + "name": "idx_pairs_active", + "columns": [ + { + "expression": "active", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "source_language", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "target_language", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "unique_source_target": { + "name": "unique_source_target", + "nullsNotDistinct": false, + "columns": [ + "source_language", + "target_language" + ] + } + }, + "policies": {}, + "checkConstraints": { + "source_language_check": { + "name": "source_language_check", + "value": "\"language_pairs\".\"source_language\" IN ('en', 'it')" + }, + "target_language_check": { + "name": "target_language_check", + "value": "\"language_pairs\".\"target_language\" IN ('en', 'it')" + }, + "no_self_pair": { + "name": "no_self_pair", + "value": "\"language_pairs\".\"source_language\" != \"language_pairs\".\"target_language\"" + } + }, + "isRLSEnabled": false + }, + "public.term_glosses": { + "name": "term_glosses", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "uuid", + "primaryKey": true, + "notNull": true, + "default": "gen_random_uuid()" + }, + "term_id": { + "name": "term_id", + "type": "uuid", + "primaryKey": false, + "notNull": true + }, + "language_code": { + "name": "language_code", + "type": "varchar(10)", + "primaryKey": false, + "notNull": true + }, + "text": { + "name": "text", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_term_glosses_term": { + "name": "idx_term_glosses_term", + "columns": [ + { + "expression": "term_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "term_glosses_term_id_terms_id_fk": { + "name": "term_glosses_term_id_terms_id_fk", + "tableFrom": "term_glosses", + "tableTo": "terms", + "columnsFrom": [ + "term_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "unique_term_gloss": { + "name": "unique_term_gloss", + "nullsNotDistinct": false, + "columns": [ + "term_id", + "language_code", + "text" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.terms": { + "name": "terms", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "uuid", + "primaryKey": true, + "notNull": true, + "default": "gen_random_uuid()" + }, + "synset_id": { + "name": "synset_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "pos": { + "name": "pos", + "type": "varchar(20)", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_terms_pos": { + "name": "idx_terms_pos", + "columns": [ + { + "expression": "pos", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "terms_synset_id_unique": { + "name": "terms_synset_id_unique", + "nullsNotDistinct": false, + "columns": [ + "synset_id" + ] + } + }, + "policies": {}, + "checkConstraints": { + "pos_check": { + "name": "pos_check", + "value": "\"terms\".\"pos\" IN ('noun')" + } + }, + "isRLSEnabled": false + }, + "public.translations": { + "name": "translations", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "uuid", + "primaryKey": true, + "notNull": true, + "default": "gen_random_uuid()" + }, + "term_id": { + "name": "term_id", + "type": "uuid", + "primaryKey": false, + "notNull": true + }, + "language_code": { + "name": "language_code", + "type": "varchar(10)", + "primaryKey": false, + "notNull": true + }, + "text": { + "name": "text", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_translations_lang": { + "name": "idx_translations_lang", + "columns": [ + { + "expression": "language_code", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "term_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "translations_term_id_terms_id_fk": { + "name": "translations_term_id_terms_id_fk", + "tableFrom": "translations", + "tableTo": "terms", + "columnsFrom": [ + "term_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "unique_translations": { + "name": "unique_translations", + "nullsNotDistinct": false, + "columns": [ + "term_id", + "language_code", + "text" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.users": { + "name": "users", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "uuid", + "primaryKey": true, + "notNull": true, + "default": "gen_random_uuid()" + }, + "openauth_sub": { + "name": "openauth_sub", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "email": { + "name": "email", + "type": "varchar(255)", + "primaryKey": false, + "notNull": false + }, + "display_name": { + "name": "display_name", + "type": "varchar(100)", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "last_login_at": { + "name": "last_login_at", + "type": "timestamp with time zone", + "primaryKey": false, + "notNull": false + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "users_openauth_sub_unique": { + "name": "users_openauth_sub_unique", + "nullsNotDistinct": false, + "columns": [ + "openauth_sub" + ] + }, + "users_email_unique": { + "name": "users_email_unique", + "nullsNotDistinct": false, + "columns": [ + "email" + ] + }, + "users_display_name_unique": { + "name": "users_display_name_unique", + "nullsNotDistinct": false, + "columns": [ + "display_name" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + } + }, + "enums": {}, + "schemas": {}, + "sequences": {}, + "roles": {}, + "policies": {}, + "views": {}, + "_meta": { + "columns": {}, + "schemas": {}, + "tables": {} + } +} \ No newline at end of file diff --git a/packages/db/drizzle/meta/_journal.json b/packages/db/drizzle/meta/_journal.json index e436fd8..ac05b81 100644 --- a/packages/db/drizzle/meta/_journal.json +++ b/packages/db/drizzle/meta/_journal.json @@ -8,6 +8,13 @@ "when": 1774721919883, "tag": "0000_bitter_turbo", "breakpoints": true + }, + { + "idx": 1, + "version": "7", + "when": 1774970553186, + "tag": "0001_medical_fabian_cortez", + "breakpoints": true } ] -} +} \ No newline at end of file diff --git a/packages/db/package.json b/packages/db/package.json index bfd1ad2..3fbdc05 100644 --- a/packages/db/package.json +++ b/packages/db/package.json @@ -5,6 +5,7 @@ "type": "module", "scripts": { "build": "tsc", + "generate": "drizzle-kit generate", "migrate": "drizzle-kit migrate" }, "dependencies": { diff --git a/packages/db/src/check-noun-coverage.ts b/packages/db/src/check-noun-coverage.ts deleted file mode 100644 index b2b4631..0000000 --- a/packages/db/src/check-noun-coverage.ts +++ /dev/null @@ -1,55 +0,0 @@ -import fs from "node:fs/promises"; -import { db } from "@glossa/db"; -import { translations } from "@glossa/db/schema"; -import { inArray } from "drizzle-orm"; - -const wordlistPath = "./src/data/wordlists/top1000englishnouns"; -const unmatchedOutputPath = - "./src/data/wordlists/top1000englishnouns-unmatched"; - -const main = async () => { - // 1. Read and normalise the word list - console.log("πŸ“– Reading word list..."); - const raw = await fs.readFile(wordlistPath, "utf8"); - const words = raw - .split("\n") - .map((w) => w.trim().toLowerCase()) - .filter(Boolean); - console.log(` ${words.length} words loaded\n`); - - // 2. Query DB for matches - console.log("πŸ” Checking against database..."); - const rows = await db - .select({ text: translations.text }) - .from(translations) - .where(inArray(translations.text, words)); - - const matchedSet = new Set(rows.map((r) => r.text.toLowerCase())); - - // 3. Split into matched / unmatched - const matched = words.filter((w) => matchedSet.has(w)); - const unmatched = words.filter((w) => !matchedSet.has(w)); - - // 4. Terminal output - console.log(`βœ… Matched: ${matched.length}/${words.length}`); - console.log(`❌ Unmatched: ${unmatched.length}/${words.length}`); - console.log( - `πŸ“Š Coverage: ${((matched.length / words.length) * 100).toFixed(1)}%\n`, - ); - - if (unmatched.length > 0) { - console.log("❌ Unmatched words:"); - for (const w of unmatched) { - console.log(` ${w}`); - } - } - - // 5. Write unmatched to file - // await fs.writeFile(unmatchedOutputPath, unmatched.join("\n"), "utf8"); - console.log(`\nπŸ’Ύ Unmatched words written to ${unmatchedOutputPath}`); -}; - -main().catch((error) => { - console.error(error); - process.exit(1); -}); diff --git a/packages/db/src/generating-decks.ts b/packages/db/src/generating-decks.ts index 9a15bab..10f84b2 100644 --- a/packages/db/src/generating-decks.ts +++ b/packages/db/src/generating-decks.ts @@ -1,46 +1,77 @@ /* -Parse CLI args β†’ resolve the word list file path -Connect to the database -Read the word list file into an ordered array of strings -Look up the enβ†’it language pair ID from language_pairs -Batch-fetch all matching rows from translations where language_code = 'en' and text IN (words) -Build a word β†’ termId map from the results -Walk the ordered word list β†’ split into hits (word found, capture position) and misses (skip) -Check if a deck with this name already exists β†’ if so, delete its deck_terms then the deck itself -Insert the new decks row -Insert all deck_terms rows in batches (deckId, termId, position) -Log the skipped words -Close the DB connection +- [x] Setup β€” hardcoded path, name, description, source language, POS +- [x] Read wordlist β€” load the 1000 nouns +- [x] Query terms β€” match to database, find which ones have translations +- [ ] Validation β€” determine validated languages from the data (e.g., ["en", "it"] if all matched terms have both) +- [ ] Check idempotency β€” skip if deck exists +- [ ] Create deck β€” insert with discovered validated_for_languages +- [ ] Link terms β€” insert deck_terms +- [ ] Report β€” summary */ import fs from "node:fs/promises"; import { db } from "@glossa/db"; -import { translations } from "@glossa/db/schema"; -import { inArray } from "drizzle-orm"; +import { translations, terms } from "@glossa/db/schema"; +import { inArray, and, eq } from "drizzle-orm"; -const wordlistPath = "./src/data/wordlists/top1000englishnouns"; +const pathToWordlist = "./src/data/wordlists/top1000englishnouns"; +const nameOfDeck = "top english nouns"; +const descriptionOfDeck = + "Most frequently used English nouns for vocabulary practice"; +const sourceLanguage = "en"; +const sourcePOS = "noun"; -const main = async () => { - // Read and normalise the word list - console.log("πŸ“– Reading word list..."); - const raw = await fs.readFile(wordlistPath, "utf8"); +const readingFromWordlist = async () => { + const raw = await fs.readFile(pathToWordlist, "utf8"); const words = raw .split("\n") .map((w) => w.trim().toLowerCase()) .filter(Boolean); - console.log(` ${words.length} words loaded\n`); + return words; +}; - // Query DB for matches - console.log("πŸ” Checking against database..."); +const checkingSourceWordsAgainstDB = async (words: string[]) => { const rows = await db .select({ text: translations.text, termId: translations.term_id }) .from(translations) - .where(inArray(translations.text, words)); + .innerJoin(terms, eq(translations.term_id, terms.id)) + .where( + and( + inArray(translations.text, words), + eq(translations.language_code, sourceLanguage), + eq(terms.pos, sourcePOS), + ), + ); - const matchedSet = new Set(rows.map((r) => r.text.toLowerCase())); - const wordsInDb = words.filter((w) => matchedSet.has(w)); + // map word text to term_id + const wordToTermId = new Map(); + for (const row of rows) { + const word = row.text.toLowerCase(); + if (!wordToTermId.has(word)) { + wordToTermId.set(word, row.termId); + } + } + const termIds = Array.from(wordToTermId.values()); + const missingWords = words.filter((w) => !wordToTermId.has(w)); - console.log("wordsInDb: ", wordsInDb); + return { termIds, missingWords }; +}; + +const writeMissingWordsToFile = async (words: string[]) => {}; + +const main = async () => { + // Read and normalise the word list + console.log("πŸ“– Reading word list..."); + const sourceWords = await readingFromWordlist(); + console.log(` ${sourceWords.length} words loaded\n`); + // check if sourceWords exist in database + console.log("πŸ” Checking against database..."); + const { termIds, missingWords } = + await checkingSourceWordsAgainstDB(sourceWords); + console.log("words found in db: ", termIds.length); + console.log("words NOT found in db: ", missingWords.length); + // write missing words to file + await writeMissingWordsToFile(missingWords); }; main().catch((error) => {