This commit is contained in:
lila 2026-03-31 18:28:29 +02:00
parent 9d1a82bdf0
commit 488f0dab11
6 changed files with 666 additions and 82 deletions

View file

@ -0,0 +1,13 @@
-- Migration: removes the language-pair and creator links from "decks", drops
-- "deck_terms"."position", and adds a per-deck "validated_for_languages" array.
-- Drop the current unique constraint; it is re-created as UNIQUE("name") near the end.
ALTER TABLE "decks" DROP CONSTRAINT "unique_deck_name";--> statement-breakpoint
-- Remove the foreign keys and indexes tied to the columns that are dropped below.
ALTER TABLE "decks" DROP CONSTRAINT "decks_language_pair_id_language_pairs_id_fk";
--> statement-breakpoint
ALTER TABLE "decks" DROP CONSTRAINT "decks_created_by_users_id_fk";
--> statement-breakpoint
DROP INDEX "idx_decks_created_by";--> statement-breakpoint
DROP INDEX "idx_decks_language_pair";--> statement-breakpoint
-- New column: NOT NULL with an empty-array default, so existing rows are back-filled with '{}'.
ALTER TABLE "decks" ADD COLUMN "validated_for_languages" varchar(10)[] DEFAULT '{}' NOT NULL;--> statement-breakpoint
-- NOTE: the DROP COLUMN statements below discard whatever data those columns hold.
ALTER TABLE "deck_terms" DROP COLUMN "position";--> statement-breakpoint
ALTER TABLE "decks" DROP COLUMN "language_pair_id";--> statement-breakpoint
ALTER TABLE "decks" DROP COLUMN "created_by";--> statement-breakpoint
-- NOTE: adding this constraint fails if existing rows share the same "name".
ALTER TABLE "decks" ADD CONSTRAINT "unique_deck_name" UNIQUE("name");--> statement-breakpoint
-- "<@" is the array "is contained by" operator: every element must be 'en' or 'it'.
ALTER TABLE "decks" ADD CONSTRAINT "validated_languages_check" CHECK (validated_for_languages <@ ARRAY['en', 'it']::varchar[]);

View file

@ -0,0 +1,587 @@
{
"id": "d6bed73d-ee69-44b1-a3ce-3ae25898a6f0",
"prevId": "5830ce3b-dc0e-44a7-83d6-bc74016ca4fd",
"version": "7",
"dialect": "postgresql",
"tables": {
"public.deck_terms": {
"name": "deck_terms",
"schema": "",
"columns": {
"deck_id": {
"name": "deck_id",
"type": "uuid",
"primaryKey": false,
"notNull": true
},
"term_id": {
"name": "term_id",
"type": "uuid",
"primaryKey": false,
"notNull": true
},
"added_at": {
"name": "added_at",
"type": "timestamp with time zone",
"primaryKey": false,
"notNull": true,
"default": "now()"
}
},
"indexes": {
"idx_deck_terms_term": {
"name": "idx_deck_terms_term",
"columns": [
{
"expression": "term_id",
"isExpression": false,
"asc": true,
"nulls": "last"
}
],
"isUnique": false,
"concurrently": false,
"method": "btree",
"with": {}
}
},
"foreignKeys": {
"deck_terms_deck_id_decks_id_fk": {
"name": "deck_terms_deck_id_decks_id_fk",
"tableFrom": "deck_terms",
"tableTo": "decks",
"columnsFrom": [
"deck_id"
],
"columnsTo": [
"id"
],
"onDelete": "cascade",
"onUpdate": "no action"
},
"deck_terms_term_id_terms_id_fk": {
"name": "deck_terms_term_id_terms_id_fk",
"tableFrom": "deck_terms",
"tableTo": "terms",
"columnsFrom": [
"term_id"
],
"columnsTo": [
"id"
],
"onDelete": "cascade",
"onUpdate": "no action"
}
},
"compositePrimaryKeys": {
"deck_terms_deck_id_term_id_pk": {
"name": "deck_terms_deck_id_term_id_pk",
"columns": [
"deck_id",
"term_id"
]
}
},
"uniqueConstraints": {},
"policies": {},
"checkConstraints": {},
"isRLSEnabled": false
},
"public.decks": {
"name": "decks",
"schema": "",
"columns": {
"id": {
"name": "id",
"type": "uuid",
"primaryKey": true,
"notNull": true,
"default": "gen_random_uuid()"
},
"name": {
"name": "name",
"type": "text",
"primaryKey": false,
"notNull": true
},
"description": {
"name": "description",
"type": "text",
"primaryKey": false,
"notNull": false
},
"validated_for_languages": {
"name": "validated_for_languages",
"type": "varchar(10)[]",
"primaryKey": false,
"notNull": true,
"default": "'{}'"
},
"is_public": {
"name": "is_public",
"type": "boolean",
"primaryKey": false,
"notNull": true,
"default": false
},
"created_at": {
"name": "created_at",
"type": "timestamp with time zone",
"primaryKey": false,
"notNull": true,
"default": "now()"
}
},
"indexes": {},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {
"unique_deck_name": {
"name": "unique_deck_name",
"nullsNotDistinct": false,
"columns": [
"name"
]
}
},
"policies": {},
"checkConstraints": {
"validated_languages_check": {
"name": "validated_languages_check",
"value": "validated_for_languages <@ ARRAY['en', 'it']::varchar[]"
}
},
"isRLSEnabled": false
},
"public.language_pairs": {
"name": "language_pairs",
"schema": "",
"columns": {
"id": {
"name": "id",
"type": "uuid",
"primaryKey": true,
"notNull": true,
"default": "gen_random_uuid()"
},
"source_language": {
"name": "source_language",
"type": "varchar(10)",
"primaryKey": false,
"notNull": true
},
"target_language": {
"name": "target_language",
"type": "varchar(10)",
"primaryKey": false,
"notNull": true
},
"label": {
"name": "label",
"type": "text",
"primaryKey": false,
"notNull": false
},
"active": {
"name": "active",
"type": "boolean",
"primaryKey": false,
"notNull": true,
"default": true
},
"created_at": {
"name": "created_at",
"type": "timestamp with time zone",
"primaryKey": false,
"notNull": true,
"default": "now()"
}
},
"indexes": {
"idx_pairs_active": {
"name": "idx_pairs_active",
"columns": [
{
"expression": "active",
"isExpression": false,
"asc": true,
"nulls": "last"
},
{
"expression": "source_language",
"isExpression": false,
"asc": true,
"nulls": "last"
},
{
"expression": "target_language",
"isExpression": false,
"asc": true,
"nulls": "last"
}
],
"isUnique": false,
"concurrently": false,
"method": "btree",
"with": {}
}
},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {
"unique_source_target": {
"name": "unique_source_target",
"nullsNotDistinct": false,
"columns": [
"source_language",
"target_language"
]
}
},
"policies": {},
"checkConstraints": {
"source_language_check": {
"name": "source_language_check",
"value": "\"language_pairs\".\"source_language\" IN ('en', 'it')"
},
"target_language_check": {
"name": "target_language_check",
"value": "\"language_pairs\".\"target_language\" IN ('en', 'it')"
},
"no_self_pair": {
"name": "no_self_pair",
"value": "\"language_pairs\".\"source_language\" != \"language_pairs\".\"target_language\""
}
},
"isRLSEnabled": false
},
"public.term_glosses": {
"name": "term_glosses",
"schema": "",
"columns": {
"id": {
"name": "id",
"type": "uuid",
"primaryKey": true,
"notNull": true,
"default": "gen_random_uuid()"
},
"term_id": {
"name": "term_id",
"type": "uuid",
"primaryKey": false,
"notNull": true
},
"language_code": {
"name": "language_code",
"type": "varchar(10)",
"primaryKey": false,
"notNull": true
},
"text": {
"name": "text",
"type": "text",
"primaryKey": false,
"notNull": true
},
"created_at": {
"name": "created_at",
"type": "timestamp with time zone",
"primaryKey": false,
"notNull": true,
"default": "now()"
}
},
"indexes": {
"idx_term_glosses_term": {
"name": "idx_term_glosses_term",
"columns": [
{
"expression": "term_id",
"isExpression": false,
"asc": true,
"nulls": "last"
}
],
"isUnique": false,
"concurrently": false,
"method": "btree",
"with": {}
}
},
"foreignKeys": {
"term_glosses_term_id_terms_id_fk": {
"name": "term_glosses_term_id_terms_id_fk",
"tableFrom": "term_glosses",
"tableTo": "terms",
"columnsFrom": [
"term_id"
],
"columnsTo": [
"id"
],
"onDelete": "cascade",
"onUpdate": "no action"
}
},
"compositePrimaryKeys": {},
"uniqueConstraints": {
"unique_term_gloss": {
"name": "unique_term_gloss",
"nullsNotDistinct": false,
"columns": [
"term_id",
"language_code",
"text"
]
}
},
"policies": {},
"checkConstraints": {},
"isRLSEnabled": false
},
"public.terms": {
"name": "terms",
"schema": "",
"columns": {
"id": {
"name": "id",
"type": "uuid",
"primaryKey": true,
"notNull": true,
"default": "gen_random_uuid()"
},
"synset_id": {
"name": "synset_id",
"type": "text",
"primaryKey": false,
"notNull": true
},
"pos": {
"name": "pos",
"type": "varchar(20)",
"primaryKey": false,
"notNull": true
},
"created_at": {
"name": "created_at",
"type": "timestamp with time zone",
"primaryKey": false,
"notNull": true,
"default": "now()"
}
},
"indexes": {
"idx_terms_pos": {
"name": "idx_terms_pos",
"columns": [
{
"expression": "pos",
"isExpression": false,
"asc": true,
"nulls": "last"
}
],
"isUnique": false,
"concurrently": false,
"method": "btree",
"with": {}
}
},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {
"terms_synset_id_unique": {
"name": "terms_synset_id_unique",
"nullsNotDistinct": false,
"columns": [
"synset_id"
]
}
},
"policies": {},
"checkConstraints": {
"pos_check": {
"name": "pos_check",
"value": "\"terms\".\"pos\" IN ('noun')"
}
},
"isRLSEnabled": false
},
"public.translations": {
"name": "translations",
"schema": "",
"columns": {
"id": {
"name": "id",
"type": "uuid",
"primaryKey": true,
"notNull": true,
"default": "gen_random_uuid()"
},
"term_id": {
"name": "term_id",
"type": "uuid",
"primaryKey": false,
"notNull": true
},
"language_code": {
"name": "language_code",
"type": "varchar(10)",
"primaryKey": false,
"notNull": true
},
"text": {
"name": "text",
"type": "text",
"primaryKey": false,
"notNull": true
},
"created_at": {
"name": "created_at",
"type": "timestamp with time zone",
"primaryKey": false,
"notNull": true,
"default": "now()"
}
},
"indexes": {
"idx_translations_lang": {
"name": "idx_translations_lang",
"columns": [
{
"expression": "language_code",
"isExpression": false,
"asc": true,
"nulls": "last"
},
{
"expression": "term_id",
"isExpression": false,
"asc": true,
"nulls": "last"
}
],
"isUnique": false,
"concurrently": false,
"method": "btree",
"with": {}
}
},
"foreignKeys": {
"translations_term_id_terms_id_fk": {
"name": "translations_term_id_terms_id_fk",
"tableFrom": "translations",
"tableTo": "terms",
"columnsFrom": [
"term_id"
],
"columnsTo": [
"id"
],
"onDelete": "cascade",
"onUpdate": "no action"
}
},
"compositePrimaryKeys": {},
"uniqueConstraints": {
"unique_translations": {
"name": "unique_translations",
"nullsNotDistinct": false,
"columns": [
"term_id",
"language_code",
"text"
]
}
},
"policies": {},
"checkConstraints": {},
"isRLSEnabled": false
},
"public.users": {
"name": "users",
"schema": "",
"columns": {
"id": {
"name": "id",
"type": "uuid",
"primaryKey": true,
"notNull": true,
"default": "gen_random_uuid()"
},
"openauth_sub": {
"name": "openauth_sub",
"type": "text",
"primaryKey": false,
"notNull": true
},
"email": {
"name": "email",
"type": "varchar(255)",
"primaryKey": false,
"notNull": false
},
"display_name": {
"name": "display_name",
"type": "varchar(100)",
"primaryKey": false,
"notNull": false
},
"created_at": {
"name": "created_at",
"type": "timestamp with time zone",
"primaryKey": false,
"notNull": true,
"default": "now()"
},
"last_login_at": {
"name": "last_login_at",
"type": "timestamp with time zone",
"primaryKey": false,
"notNull": false
}
},
"indexes": {},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {
"users_openauth_sub_unique": {
"name": "users_openauth_sub_unique",
"nullsNotDistinct": false,
"columns": [
"openauth_sub"
]
},
"users_email_unique": {
"name": "users_email_unique",
"nullsNotDistinct": false,
"columns": [
"email"
]
},
"users_display_name_unique": {
"name": "users_display_name_unique",
"nullsNotDistinct": false,
"columns": [
"display_name"
]
}
},
"policies": {},
"checkConstraints": {},
"isRLSEnabled": false
}
},
"enums": {},
"schemas": {},
"sequences": {},
"roles": {},
"policies": {},
"views": {},
"_meta": {
"columns": {},
"schemas": {},
"tables": {}
}
}

View file

@ -8,6 +8,13 @@
"when": 1774721919883,
"tag": "0000_bitter_turbo",
"breakpoints": true
},
{
"idx": 1,
"version": "7",
"when": 1774970553186,
"tag": "0001_medical_fabian_cortez",
"breakpoints": true
}
]
}

View file

@ -5,6 +5,7 @@
"type": "module",
"scripts": {
"build": "tsc",
"generate": "drizzle-kit generate",
"migrate": "drizzle-kit migrate"
},
"dependencies": {

View file

@ -1,55 +0,0 @@
import fs from "node:fs/promises";
import { db } from "@glossa/db";
import { translations } from "@glossa/db/schema";
import { inArray } from "drizzle-orm";
// Newline-separated word list that main() reads and normalises.
const wordlistPath = "./src/data/wordlists/top1000englishnouns";
// Path named in main()'s final log line; the writeFile call that would use it
// is currently commented out there.
const unmatchedOutputPath =
"./src/data/wordlists/top1000englishnouns-unmatched";
/**
 * Reports how many entries of the word list have a row in `translations`.
 *
 * Reads the newline-separated list at `wordlistPath`, trims and lower-cases
 * each entry, looks the entries up in `translations.text`, and prints the
 * matched/unmatched counts, the coverage percentage and the unmatched words.
 *
 * NOTE(review): the lookup uses `inArray` on the lower-cased words, so rows
 * whose stored text contains capitals are presumably not matched — confirm
 * against the column's collation.
 */
const main = async () => {
  // 1. Read and normalise the word list
  console.log("📖 Reading word list...");
  const raw = await fs.readFile(wordlistPath, "utf8");
  const words = raw
    .split("\n")
    .map((w) => w.trim().toLowerCase())
    .filter(Boolean);
  console.log(` ${words.length} words loaded\n`);

  // Nothing to look up: avoids an empty IN (...) list and a NaN coverage figure.
  if (words.length === 0) {
    console.log("⚠️ Word list is empty, nothing to check.");
    return;
  }

  // 2. Query DB for matches
  console.log("🔍 Checking against database...");
  const rows = await db
    .select({ text: translations.text })
    .from(translations)
    .where(inArray(translations.text, words));
  const matchedSet = new Set(rows.map((r) => r.text.toLowerCase()));

  // 3. Split into matched / unmatched
  const matched = words.filter((w) => matchedSet.has(w));
  const unmatched = words.filter((w) => !matchedSet.has(w));

  // 4. Terminal output
  console.log(`✅ Matched: ${matched.length}/${words.length}`);
  console.log(`❌ Unmatched: ${unmatched.length}/${words.length}`);
  console.log(
    `📊 Coverage: ${((matched.length / words.length) * 100).toFixed(1)}%\n`,
  );
  if (unmatched.length > 0) {
    console.log("❌ Unmatched words:");
    for (const w of unmatched) {
      console.log(` ${w}`);
    }
  }

  // 5. The write step is disabled, so say so instead of claiming the file
  //    was written. Re-enable the call below to restore the export.
  // await fs.writeFile(unmatchedOutputPath, unmatched.join("\n"), "utf8");
  console.log(
    `\n💾 Skipped writing unmatched words to ${unmatchedOutputPath} (write step is disabled)`,
  );
};
// Entry point: run main(); on rejection print the error and exit with status 1.
main().catch((err) => {
  console.error(err);
  process.exit(1);
});

View file

@ -1,46 +1,77 @@
/*
Parse CLI args → resolve the word list file path
Connect to the database
Read the word list file into an ordered array of strings
Look up the en→it language pair ID from language_pairs
Batch-fetch all matching rows from translations where language_code = 'en' and text IN (words)
Build a word → termId map from the results
Walk the ordered word list → split into hits (word found, capture position) and misses (skip)
Check if a deck with this name already exists → if so, delete its deck_terms, then the deck itself
Insert the new decks row
Insert all deck_terms rows in batches (deckId, termId, position)
Log the skipped words
Close the DB connection
- [x] Setup: hardcoded path, name, description, source language, POS
- [x] Read wordlist: load the 1000 nouns
- [x] Query terms: match to database, find which ones have translations
- [ ] Validation: determine validated languages from the data (e.g., ["en", "it"] if all matched terms have both)
- [ ] Check idempotency: skip if deck exists
- [ ] Create deck: insert with discovered validated_for_languages
- [ ] Link terms: insert deck_terms
- [ ] Report: summary
*/
import fs from "node:fs/promises";
import { db } from "@glossa/db";
import { translations } from "@glossa/db/schema";
import { inArray } from "drizzle-orm";
import { translations, terms } from "@glossa/db/schema";
import { inArray, and, eq } from "drizzle-orm";
const wordlistPath = "./src/data/wordlists/top1000englishnouns";
const pathToWordlist = "./src/data/wordlists/top1000englishnouns";
const nameOfDeck = "top english nouns";
const descriptionOfDeck =
"Most frequently used English nouns for vocabulary practice";
const sourceLanguage = "en";
const sourcePOS = "noun";
const main = async () => {
// Read and normalise the word list
console.log("📖 Reading word list...");
const raw = await fs.readFile(wordlistPath, "utf8");
const readingFromWordlist = async () => {
const raw = await fs.readFile(pathToWordlist, "utf8");
const words = raw
.split("\n")
.map((w) => w.trim().toLowerCase())
.filter(Boolean);
console.log(` ${words.length} words loaded\n`);
return words;
};
// Query DB for matches
console.log("🔍 Checking against database...");
const checkingSourceWordsAgainstDB = async (words: string[]) => {
const rows = await db
.select({ text: translations.text, termId: translations.term_id })
.from(translations)
.where(inArray(translations.text, words));
.innerJoin(terms, eq(translations.term_id, terms.id))
.where(
and(
inArray(translations.text, words),
eq(translations.language_code, sourceLanguage),
eq(terms.pos, sourcePOS),
),
);
const matchedSet = new Set(rows.map((r) => r.text.toLowerCase()));
const wordsInDb = words.filter((w) => matchedSet.has(w));
// map word text to term_id
const wordToTermId = new Map<string, string>();
for (const row of rows) {
const word = row.text.toLowerCase();
if (!wordToTermId.has(word)) {
wordToTermId.set(word, row.termId);
}
}
const termIds = Array.from(wordToTermId.values());
const missingWords = words.filter((w) => !wordToTermId.has(w));
console.log("wordsInDb: ", wordsInDb);
return { termIds, missingWords };
};
// Placeholder: accepts the list of words not found in the database but currently
// does nothing with it (empty body). TODO: implement the actual file write.
const writeMissingWordsToFile = async (words: string[]) => {};
/**
 * Script entry point: loads the word list, looks the words up in the
 * database, prints how many were found or missing, and hands the missing
 * words to `writeMissingWordsToFile`.
 */
const main = async () => {
  // Step 1: load and normalise the word list.
  console.log("📖 Reading word list...");
  const loadedWords = await readingFromWordlist();
  console.log(` ${loadedWords.length} words loaded\n`);

  // Step 2: find out which of the loaded words exist in the database.
  console.log("🔍 Checking against database...");
  const lookup = await checkingSourceWordsAgainstDB(loadedWords);
  const foundTermIds = lookup.termIds;
  const notFound = lookup.missingWords;
  console.log("words found in db: ", foundTermIds.length);
  console.log("words NOT found in db: ", notFound.length);

  // Step 3: persist the words that had no match.
  await writeMissingWordsToFile(notFound);
};
main().catch((error) => {