This commit is contained in:
lila 2026-03-31 18:28:29 +02:00
parent 9d1a82bdf0
commit 488f0dab11
6 changed files with 666 additions and 82 deletions

View file

@ -0,0 +1,13 @@
-- Reshapes "decks": removes the "language_pair_id" and "created_by" columns
-- together with the constraints and indexes named after them, adds
-- "validated_for_languages", and drops "deck_terms"."position".
-- NOTE(review): the previous definition of "unique_deck_name" is not visible
-- here; it is dropped first and re-created on "name" alone near the end.
ALTER TABLE "decks" DROP CONSTRAINT "unique_deck_name";--> statement-breakpoint
-- Remove the foreign keys (presumably on "language_pair_id" and "created_by",
-- judging by the constraint names — confirm against the prior schema).
ALTER TABLE "decks" DROP CONSTRAINT "decks_language_pair_id_language_pairs_id_fk";
--> statement-breakpoint
ALTER TABLE "decks" DROP CONSTRAINT "decks_created_by_users_id_fk";
--> statement-breakpoint
-- Remove the two "decks" indexes named after the columns being dropped below.
DROP INDEX "idx_decks_created_by";--> statement-breakpoint
DROP INDEX "idx_decks_language_pair";--> statement-breakpoint
-- New NOT NULL array column; the '{}' default gives existing rows an empty array.
ALTER TABLE "decks" ADD COLUMN "validated_for_languages" varchar(10)[] DEFAULT '{}' NOT NULL;--> statement-breakpoint
-- Destructive: the data stored in these three columns is discarded.
ALTER TABLE "deck_terms" DROP COLUMN "position";--> statement-breakpoint
ALTER TABLE "decks" DROP COLUMN "language_pair_id";--> statement-breakpoint
ALTER TABLE "decks" DROP COLUMN "created_by";--> statement-breakpoint
-- Deck names become unique across the whole table.
-- NOTE(review): this statement fails if existing rows share a name — check the data before applying.
ALTER TABLE "decks" ADD CONSTRAINT "unique_deck_name" UNIQUE("name");--> statement-breakpoint
-- Every element of "validated_for_languages" must be 'en' or 'it' ("<@" means
-- "is contained by"); the empty array satisfies the check.
ALTER TABLE "decks" ADD CONSTRAINT "validated_languages_check" CHECK (validated_for_languages <@ ARRAY['en', 'it']::varchar[]);

View file

@ -0,0 +1,587 @@
{
"id": "d6bed73d-ee69-44b1-a3ce-3ae25898a6f0",
"prevId": "5830ce3b-dc0e-44a7-83d6-bc74016ca4fd",
"version": "7",
"dialect": "postgresql",
"tables": {
"public.deck_terms": {
"name": "deck_terms",
"schema": "",
"columns": {
"deck_id": {
"name": "deck_id",
"type": "uuid",
"primaryKey": false,
"notNull": true
},
"term_id": {
"name": "term_id",
"type": "uuid",
"primaryKey": false,
"notNull": true
},
"added_at": {
"name": "added_at",
"type": "timestamp with time zone",
"primaryKey": false,
"notNull": true,
"default": "now()"
}
},
"indexes": {
"idx_deck_terms_term": {
"name": "idx_deck_terms_term",
"columns": [
{
"expression": "term_id",
"isExpression": false,
"asc": true,
"nulls": "last"
}
],
"isUnique": false,
"concurrently": false,
"method": "btree",
"with": {}
}
},
"foreignKeys": {
"deck_terms_deck_id_decks_id_fk": {
"name": "deck_terms_deck_id_decks_id_fk",
"tableFrom": "deck_terms",
"tableTo": "decks",
"columnsFrom": [
"deck_id"
],
"columnsTo": [
"id"
],
"onDelete": "cascade",
"onUpdate": "no action"
},
"deck_terms_term_id_terms_id_fk": {
"name": "deck_terms_term_id_terms_id_fk",
"tableFrom": "deck_terms",
"tableTo": "terms",
"columnsFrom": [
"term_id"
],
"columnsTo": [
"id"
],
"onDelete": "cascade",
"onUpdate": "no action"
}
},
"compositePrimaryKeys": {
"deck_terms_deck_id_term_id_pk": {
"name": "deck_terms_deck_id_term_id_pk",
"columns": [
"deck_id",
"term_id"
]
}
},
"uniqueConstraints": {},
"policies": {},
"checkConstraints": {},
"isRLSEnabled": false
},
"public.decks": {
"name": "decks",
"schema": "",
"columns": {
"id": {
"name": "id",
"type": "uuid",
"primaryKey": true,
"notNull": true,
"default": "gen_random_uuid()"
},
"name": {
"name": "name",
"type": "text",
"primaryKey": false,
"notNull": true
},
"description": {
"name": "description",
"type": "text",
"primaryKey": false,
"notNull": false
},
"validated_for_languages": {
"name": "validated_for_languages",
"type": "varchar(10)[]",
"primaryKey": false,
"notNull": true,
"default": "'{}'"
},
"is_public": {
"name": "is_public",
"type": "boolean",
"primaryKey": false,
"notNull": true,
"default": false
},
"created_at": {
"name": "created_at",
"type": "timestamp with time zone",
"primaryKey": false,
"notNull": true,
"default": "now()"
}
},
"indexes": {},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {
"unique_deck_name": {
"name": "unique_deck_name",
"nullsNotDistinct": false,
"columns": [
"name"
]
}
},
"policies": {},
"checkConstraints": {
"validated_languages_check": {
"name": "validated_languages_check",
"value": "validated_for_languages <@ ARRAY['en', 'it']::varchar[]"
}
},
"isRLSEnabled": false
},
"public.language_pairs": {
"name": "language_pairs",
"schema": "",
"columns": {
"id": {
"name": "id",
"type": "uuid",
"primaryKey": true,
"notNull": true,
"default": "gen_random_uuid()"
},
"source_language": {
"name": "source_language",
"type": "varchar(10)",
"primaryKey": false,
"notNull": true
},
"target_language": {
"name": "target_language",
"type": "varchar(10)",
"primaryKey": false,
"notNull": true
},
"label": {
"name": "label",
"type": "text",
"primaryKey": false,
"notNull": false
},
"active": {
"name": "active",
"type": "boolean",
"primaryKey": false,
"notNull": true,
"default": true
},
"created_at": {
"name": "created_at",
"type": "timestamp with time zone",
"primaryKey": false,
"notNull": true,
"default": "now()"
}
},
"indexes": {
"idx_pairs_active": {
"name": "idx_pairs_active",
"columns": [
{
"expression": "active",
"isExpression": false,
"asc": true,
"nulls": "last"
},
{
"expression": "source_language",
"isExpression": false,
"asc": true,
"nulls": "last"
},
{
"expression": "target_language",
"isExpression": false,
"asc": true,
"nulls": "last"
}
],
"isUnique": false,
"concurrently": false,
"method": "btree",
"with": {}
}
},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {
"unique_source_target": {
"name": "unique_source_target",
"nullsNotDistinct": false,
"columns": [
"source_language",
"target_language"
]
}
},
"policies": {},
"checkConstraints": {
"source_language_check": {
"name": "source_language_check",
"value": "\"language_pairs\".\"source_language\" IN ('en', 'it')"
},
"target_language_check": {
"name": "target_language_check",
"value": "\"language_pairs\".\"target_language\" IN ('en', 'it')"
},
"no_self_pair": {
"name": "no_self_pair",
"value": "\"language_pairs\".\"source_language\" != \"language_pairs\".\"target_language\""
}
},
"isRLSEnabled": false
},
"public.term_glosses": {
"name": "term_glosses",
"schema": "",
"columns": {
"id": {
"name": "id",
"type": "uuid",
"primaryKey": true,
"notNull": true,
"default": "gen_random_uuid()"
},
"term_id": {
"name": "term_id",
"type": "uuid",
"primaryKey": false,
"notNull": true
},
"language_code": {
"name": "language_code",
"type": "varchar(10)",
"primaryKey": false,
"notNull": true
},
"text": {
"name": "text",
"type": "text",
"primaryKey": false,
"notNull": true
},
"created_at": {
"name": "created_at",
"type": "timestamp with time zone",
"primaryKey": false,
"notNull": true,
"default": "now()"
}
},
"indexes": {
"idx_term_glosses_term": {
"name": "idx_term_glosses_term",
"columns": [
{
"expression": "term_id",
"isExpression": false,
"asc": true,
"nulls": "last"
}
],
"isUnique": false,
"concurrently": false,
"method": "btree",
"with": {}
}
},
"foreignKeys": {
"term_glosses_term_id_terms_id_fk": {
"name": "term_glosses_term_id_terms_id_fk",
"tableFrom": "term_glosses",
"tableTo": "terms",
"columnsFrom": [
"term_id"
],
"columnsTo": [
"id"
],
"onDelete": "cascade",
"onUpdate": "no action"
}
},
"compositePrimaryKeys": {},
"uniqueConstraints": {
"unique_term_gloss": {
"name": "unique_term_gloss",
"nullsNotDistinct": false,
"columns": [
"term_id",
"language_code",
"text"
]
}
},
"policies": {},
"checkConstraints": {},
"isRLSEnabled": false
},
"public.terms": {
"name": "terms",
"schema": "",
"columns": {
"id": {
"name": "id",
"type": "uuid",
"primaryKey": true,
"notNull": true,
"default": "gen_random_uuid()"
},
"synset_id": {
"name": "synset_id",
"type": "text",
"primaryKey": false,
"notNull": true
},
"pos": {
"name": "pos",
"type": "varchar(20)",
"primaryKey": false,
"notNull": true
},
"created_at": {
"name": "created_at",
"type": "timestamp with time zone",
"primaryKey": false,
"notNull": true,
"default": "now()"
}
},
"indexes": {
"idx_terms_pos": {
"name": "idx_terms_pos",
"columns": [
{
"expression": "pos",
"isExpression": false,
"asc": true,
"nulls": "last"
}
],
"isUnique": false,
"concurrently": false,
"method": "btree",
"with": {}
}
},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {
"terms_synset_id_unique": {
"name": "terms_synset_id_unique",
"nullsNotDistinct": false,
"columns": [
"synset_id"
]
}
},
"policies": {},
"checkConstraints": {
"pos_check": {
"name": "pos_check",
"value": "\"terms\".\"pos\" IN ('noun')"
}
},
"isRLSEnabled": false
},
"public.translations": {
"name": "translations",
"schema": "",
"columns": {
"id": {
"name": "id",
"type": "uuid",
"primaryKey": true,
"notNull": true,
"default": "gen_random_uuid()"
},
"term_id": {
"name": "term_id",
"type": "uuid",
"primaryKey": false,
"notNull": true
},
"language_code": {
"name": "language_code",
"type": "varchar(10)",
"primaryKey": false,
"notNull": true
},
"text": {
"name": "text",
"type": "text",
"primaryKey": false,
"notNull": true
},
"created_at": {
"name": "created_at",
"type": "timestamp with time zone",
"primaryKey": false,
"notNull": true,
"default": "now()"
}
},
"indexes": {
"idx_translations_lang": {
"name": "idx_translations_lang",
"columns": [
{
"expression": "language_code",
"isExpression": false,
"asc": true,
"nulls": "last"
},
{
"expression": "term_id",
"isExpression": false,
"asc": true,
"nulls": "last"
}
],
"isUnique": false,
"concurrently": false,
"method": "btree",
"with": {}
}
},
"foreignKeys": {
"translations_term_id_terms_id_fk": {
"name": "translations_term_id_terms_id_fk",
"tableFrom": "translations",
"tableTo": "terms",
"columnsFrom": [
"term_id"
],
"columnsTo": [
"id"
],
"onDelete": "cascade",
"onUpdate": "no action"
}
},
"compositePrimaryKeys": {},
"uniqueConstraints": {
"unique_translations": {
"name": "unique_translations",
"nullsNotDistinct": false,
"columns": [
"term_id",
"language_code",
"text"
]
}
},
"policies": {},
"checkConstraints": {},
"isRLSEnabled": false
},
"public.users": {
"name": "users",
"schema": "",
"columns": {
"id": {
"name": "id",
"type": "uuid",
"primaryKey": true,
"notNull": true,
"default": "gen_random_uuid()"
},
"openauth_sub": {
"name": "openauth_sub",
"type": "text",
"primaryKey": false,
"notNull": true
},
"email": {
"name": "email",
"type": "varchar(255)",
"primaryKey": false,
"notNull": false
},
"display_name": {
"name": "display_name",
"type": "varchar(100)",
"primaryKey": false,
"notNull": false
},
"created_at": {
"name": "created_at",
"type": "timestamp with time zone",
"primaryKey": false,
"notNull": true,
"default": "now()"
},
"last_login_at": {
"name": "last_login_at",
"type": "timestamp with time zone",
"primaryKey": false,
"notNull": false
}
},
"indexes": {},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {
"users_openauth_sub_unique": {
"name": "users_openauth_sub_unique",
"nullsNotDistinct": false,
"columns": [
"openauth_sub"
]
},
"users_email_unique": {
"name": "users_email_unique",
"nullsNotDistinct": false,
"columns": [
"email"
]
},
"users_display_name_unique": {
"name": "users_display_name_unique",
"nullsNotDistinct": false,
"columns": [
"display_name"
]
}
},
"policies": {},
"checkConstraints": {},
"isRLSEnabled": false
}
},
"enums": {},
"schemas": {},
"sequences": {},
"roles": {},
"policies": {},
"views": {},
"_meta": {
"columns": {},
"schemas": {},
"tables": {}
}
}

View file

@ -8,6 +8,13 @@
"when": 1774721919883, "when": 1774721919883,
"tag": "0000_bitter_turbo", "tag": "0000_bitter_turbo",
"breakpoints": true "breakpoints": true
},
{
"idx": 1,
"version": "7",
"when": 1774970553186,
"tag": "0001_medical_fabian_cortez",
"breakpoints": true
} }
] ]
} }

View file

@ -5,6 +5,7 @@
"type": "module", "type": "module",
"scripts": { "scripts": {
"build": "tsc", "build": "tsc",
"generate": "drizzle-kit generate",
"migrate": "drizzle-kit migrate" "migrate": "drizzle-kit migrate"
}, },
"dependencies": { "dependencies": {

View file

@ -1,55 +0,0 @@
import fs from "node:fs/promises";
import { db } from "@glossa/db";
import { translations } from "@glossa/db/schema";
import { inArray } from "drizzle-orm";
/** Word list read by the script; a relative path, so it resolves against the current working directory. */
const wordlistPath = "./src/data/wordlists/top1000englishnouns";

/** Destination path reported for the words that have no match in the database. */
const unmatchedOutputPath = "./src/data/wordlists/top1000englishnouns-unmatched";
/**
 * Reports how many words from the word list have a row in `translations`
 * and saves the words that do not.
 *
 * Steps: read and normalise the list (trim, lowercase, drop blank lines),
 * fetch the `translations` rows whose `text` is in the list, print the
 * matched/unmatched counts and the coverage percentage, list the unmatched
 * words, and write them (one per line) to `unmatchedOutputPath`.
 *
 * An empty word list ends the run early: there is nothing to query, and the
 * coverage ratio would otherwise be 0/0.
 *
 * @returns A promise that settles once the report is printed and the
 *   unmatched-words file has been written.
 * @throws Rejects with any file-system or database error; nothing is caught here.
 */
const main = async (): Promise<void> => {
  // 1. Read and normalise the word list
  console.log("📖 Reading word list...");
  const raw = await fs.readFile(wordlistPath, "utf8");
  const words = raw
    .split("\n")
    .map((w) => w.trim().toLowerCase())
    .filter(Boolean);
  console.log(` ${words.length} words loaded\n`);

  // Guard: an empty list would reach `inArray` with no values and would make
  // the coverage below print "NaN%".
  if (words.length === 0) {
    console.warn("⚠️ Word list is empty, nothing to check.");
    return;
  }

  // 2. Query DB for matches
  // NOTE(review): `words` is lowercased but the SQL comparison presumably is
  // case-sensitive, so rows stored with capitals may never be fetched — confirm.
  console.log("🔍 Checking against database...");
  const rows = await db
    .select({ text: translations.text })
    .from(translations)
    .where(inArray(translations.text, words));
  const matchedSet = new Set(rows.map((r) => r.text.toLowerCase()));

  // 3. Split into matched / unmatched
  const matched = words.filter((w) => matchedSet.has(w));
  const unmatched = words.filter((w) => !matchedSet.has(w));

  // 4. Terminal output
  console.log(`✅ Matched: ${matched.length}/${words.length}`);
  console.log(`❌ Unmatched: ${unmatched.length}/${words.length}`);
  console.log(
    `📊 Coverage: ${((matched.length / words.length) * 100).toFixed(1)}%\n`,
  );
  if (unmatched.length > 0) {
    console.log("❌ Unmatched words:");
    for (const w of unmatched) {
      console.log(` ${w}`);
    }
  }

  // 5. Write unmatched to file. The write has to happen before the message
  // below, which states that the file was written.
  await fs.writeFile(unmatchedOutputPath, unmatched.join("\n"), "utf8");
  console.log(`\n💾 Unmatched words written to ${unmatchedOutputPath}`);
};
// Script entry point: run `main`; if it rejects, print the error and
// terminate the process with exit status 1.
const exitOnFailure = (error: unknown): never => {
  console.error(error);
  process.exit(1);
};

main().catch(exitOnFailure);

View file

@ -1,46 +1,77 @@
/* /*
Parse CLI args resolve the word list file path - [x] Setup hardcoded path, name, description, source language, POS
Connect to the database - [x] Read wordlist load the 1000 nouns
Read the word list file into an ordered array of strings - [x] Query terms match to database, find which ones have translations
Look up the en→it language pair ID from language_pairs - [ ] Validation determine validated languages from the data (e.g., ["en", "it"] if all matched terms have both)
Batch-fetch all matching rows from translations where language_code = 'en' and text IN (words) - [ ] Check idempotency skip if deck exists
Build a word termId map from the results - [ ] Create deck insert with discovered validated_for_languages
Walk the ordered word list split into hits (word found, capture position) and misses (skip) - [ ] Link terms insert deck_terms
Check if a deck with this name already exists if so, delete its deck_terms then the deck itself - [ ] Report summary
Insert the new decks row
Insert all deck_terms rows in batches (deckId, termId, position)
Log the skipped words
Close the DB connection
*/ */
import fs from "node:fs/promises"; import fs from "node:fs/promises";
import { db } from "@glossa/db"; import { db } from "@glossa/db";
import { translations } from "@glossa/db/schema"; import { translations, terms } from "@glossa/db/schema";
import { inArray } from "drizzle-orm"; import { inArray, and, eq } from "drizzle-orm";
const wordlistPath = "./src/data/wordlists/top1000englishnouns"; const pathToWordlist = "./src/data/wordlists/top1000englishnouns";
const nameOfDeck = "top english nouns";
const descriptionOfDeck =
"Most frequently used English nouns for vocabulary practice";
const sourceLanguage = "en";
const sourcePOS = "noun";
const main = async () => { const readingFromWordlist = async () => {
// Read and normalise the word list const raw = await fs.readFile(pathToWordlist, "utf8");
console.log("📖 Reading word list...");
const raw = await fs.readFile(wordlistPath, "utf8");
const words = raw const words = raw
.split("\n") .split("\n")
.map((w) => w.trim().toLowerCase()) .map((w) => w.trim().toLowerCase())
.filter(Boolean); .filter(Boolean);
console.log(` ${words.length} words loaded\n`); return words;
};
// Query DB for matches const checkingSourceWordsAgainstDB = async (words: string[]) => {
console.log("🔍 Checking against database...");
const rows = await db const rows = await db
.select({ text: translations.text, termId: translations.term_id }) .select({ text: translations.text, termId: translations.term_id })
.from(translations) .from(translations)
.where(inArray(translations.text, words)); .innerJoin(terms, eq(translations.term_id, terms.id))
.where(
and(
inArray(translations.text, words),
eq(translations.language_code, sourceLanguage),
eq(terms.pos, sourcePOS),
),
);
const matchedSet = new Set(rows.map((r) => r.text.toLowerCase())); // map word text to term_id
const wordsInDb = words.filter((w) => matchedSet.has(w)); const wordToTermId = new Map<string, string>();
for (const row of rows) {
const word = row.text.toLowerCase();
if (!wordToTermId.has(word)) {
wordToTermId.set(word, row.termId);
}
}
const termIds = Array.from(wordToTermId.values());
const missingWords = words.filter((w) => !wordToTermId.has(w));
console.log("wordsInDb: ", wordsInDb); return { termIds, missingWords };
};
const writeMissingWordsToFile = async (words: string[]) => {};
const main = async () => {
// Read and normalise the word list
console.log("📖 Reading word list...");
const sourceWords = await readingFromWordlist();
console.log(` ${sourceWords.length} words loaded\n`);
// check if sourceWords exist in database
console.log("🔍 Checking against database...");
const { termIds, missingWords } =
await checkingSourceWordsAgainstDB(sourceWords);
console.log("words found in db: ", termIds.length);
console.log("words NOT found in db: ", missingWords.length);
// write missing words to file
await writeMissingWordsToFile(missingWords);
}; };
main().catch((error) => { main().catch((error) => {