wip
This commit is contained in:
parent
a49bce4a5a
commit
7fdcedd1dd
2 changed files with 106 additions and 35 deletions
|
|
@ -133,10 +133,7 @@ export const decks = pgTable(
|
||||||
name: text().notNull(),
|
name: text().notNull(),
|
||||||
description: text(),
|
description: text(),
|
||||||
source_language: varchar({ length: 10 }).notNull(),
|
source_language: varchar({ length: 10 }).notNull(),
|
||||||
validated_for_languages: varchar({ length: 10 })
|
validated_languages: varchar({ length: 10 }).array().notNull().default([]),
|
||||||
.array()
|
|
||||||
.notNull()
|
|
||||||
.default([]),
|
|
||||||
is_public: boolean().default(false).notNull(),
|
is_public: boolean().default(false).notNull(),
|
||||||
created_at: timestamp({ withTimezone: true }).defaultNow().notNull(),
|
created_at: timestamp({ withTimezone: true }).defaultNow().notNull(),
|
||||||
},
|
},
|
||||||
|
|
|
||||||
|
|
@ -1,24 +1,42 @@
|
||||||
/*
|
/*
|
||||||
- [x] Setup — hardcoded path, name, description, source language, POS
|
*
|
||||||
- [x] Read wordlist — load the 1000 nouns
|
* Builds the "top English nouns" deck from a curated wordlist of the 1000 most
|
||||||
- [x] Query terms — match to database, find which ones have translations
|
* frequently used English nouns. The deck has English as its source language —
|
||||||
- [ ] writing missing words to textfile for future use
|
* meaning it was curated from an English-centric frequency list, and a separate
|
||||||
- [ ] Validation — determine validated languages from the data (e.g., ["en", "it"] if all matched terms have both)
|
* deck would be needed for other source languages. For each word in the list, all
|
||||||
- [ ] Check idempotency — skip if deck exists
|
* matching term IDs are looked up in the database via the translations table
|
||||||
- [ ] Create deck — insert with discovered validated_for_languages
|
* (language: "en", POS: "noun") — homonyms are intentionally included as separate
|
||||||
- [ ] Link terms — insert deck_terms
|
* cards since the quiz UI displays a gloss alongside each word. Words from the
|
||||||
- [ ] Report — summary
|
* list that have no DB match are skipped and written to a file for future
|
||||||
*/
|
* reference. The script is idempotent: if the deck already exists, only terms
|
||||||
|
* present in the source but missing from the deck are added; terms already in the
|
||||||
|
* deck are left untouched; terms in the deck but absent from the source are never
|
||||||
|
* removed. After resolving all matched terms, the script determines
|
||||||
|
 * validated_languages by checking which languages — excluding the source
|
||||||
|
* language — have full translation coverage across all matched terms, and updates
|
||||||
|
* the array on every run.
|
||||||
|
*/
|
||||||
|
|
||||||
// TODO: Wordlist contains 1000 lines but only 999 unique words (965 found + 34 missing = 999).
|
/*
|
||||||
// Likely cause: duplicate entry in top1000englishnouns file.
|
* roadmap
|
||||||
// Investigate with: const unique = new Set(words); console.log(words.length - unique.size);
|
*
|
||||||
// Fix either by deduplicating in code ([...new Set(words)]) or cleaning the source file.
|
* [x] Setup — hardcoded path, name, description, source language, POS
|
||||||
|
* [x] Read wordlist — load and deduplicate the 1000 nouns
|
||||||
|
* [x] Query terms — match to database, collect all term IDs per word (including homonyms)
|
||||||
|
* [x] Write missing words to file for future reference
|
||||||
|
 * [x] Determine validated_languages — find languages (excluding source) with full coverage across all matched terms
|
||||||
|
* [ ] Check idempotency — if deck exists, diff matched terms against existing deck_terms
|
||||||
|
 * [ ] Create deck if it doesn't exist — insert with name, source_language, validated_languages
|
||||||
|
* [ ] Add new terms — insert only term IDs present in source but missing from deck
|
||||||
|
 * [ ] Update validated_languages — recalculate and update on every run
|
||||||
|
* [ ] Report — summary of words found, missing, added, and validated languages
|
||||||
|
*/
|
||||||
|
|
||||||
import fs from "node:fs/promises";
|
import fs from "node:fs/promises";
|
||||||
import { db } from "@glossa/db";
|
import { db } from "@glossa/db";
|
||||||
import { translations, terms } from "@glossa/db/schema";
|
import { translations, terms, decks } from "@glossa/db/schema";
|
||||||
import { inArray, and, eq } from "drizzle-orm";
|
import { inArray, and, eq } from "drizzle-orm";
|
||||||
|
import { SUPPORTED_LANGUAGE_CODES } from "@glossa/shared";
|
||||||
|
|
||||||
const pathToWordlist = "./src/data/wordlists/top1000englishnouns";
|
const pathToWordlist = "./src/data/wordlists/top1000englishnouns";
|
||||||
const nameOfDeck = "top english nouns";
|
const nameOfDeck = "top english nouns";
|
||||||
|
|
@ -27,12 +45,20 @@ const descriptionOfDeck =
|
||||||
const sourceLanguage = "en";
|
const sourceLanguage = "en";
|
||||||
const sourcePOS = "noun";
|
const sourcePOS = "noun";
|
||||||
|
|
||||||
|
// new Set() automatically discards duplicate values,
|
||||||
|
// and spreading it back with ... converts it to a plain array again.
|
||||||
|
// So if "bank" appears twice in the file,
|
||||||
|
// the resulting array will only contain it once.
|
||||||
const readingFromWordlist = async () => {
|
const readingFromWordlist = async () => {
|
||||||
const raw = await fs.readFile(pathToWordlist, "utf8");
|
const raw = await fs.readFile(pathToWordlist, "utf8");
|
||||||
const words = raw
|
const words = [
|
||||||
.split("\n")
|
...new Set(
|
||||||
.map((w) => w.trim().toLowerCase())
|
raw
|
||||||
.filter(Boolean);
|
.split("\n")
|
||||||
|
.map((w) => w.trim().toLowerCase())
|
||||||
|
.filter(Boolean),
|
||||||
|
),
|
||||||
|
];
|
||||||
return words;
|
return words;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
@ -49,37 +75,85 @@ const checkingSourceWordsAgainstDB = async (words: string[]) => {
|
||||||
),
|
),
|
||||||
);
|
);
|
||||||
|
|
||||||
// map word text to term_id
|
const wordToTermIds = new Map<string, string[]>();
|
||||||
const wordToTermId = new Map<string, string>();
|
|
||||||
for (const row of rows) {
|
for (const row of rows) {
|
||||||
const word = row.text.toLowerCase();
|
const word = row.text.toLowerCase();
|
||||||
if (!wordToTermId.has(word)) {
|
const existing = wordToTermIds.get(word) ?? [];
|
||||||
wordToTermId.set(word, row.termId);
|
wordToTermIds.set(word, [...existing, row.termId]);
|
||||||
}
|
|
||||||
}
|
}
|
||||||
const termIds = Array.from(wordToTermId.values());
|
const termIds = Array.from(wordToTermIds.values()).flat();
|
||||||
const missingWords = words.filter((w) => !wordToTermId.has(w));
|
const missingWords = words.filter((w) => !wordToTermIds.has(w));
|
||||||
|
|
||||||
return { termIds, missingWords };
|
return { termIds, missingWords };
|
||||||
};
|
};
|
||||||
|
|
||||||
const writeMissingWordsToFile = async (words: string[]) => {};
|
/**
 * Persists the words that had no database match so they can be reviewed later.
 *
 * The output file sits next to the wordlist and uses the wordlist path with a
 * "-missing" suffix. Words are written one per line; any existing file at that
 * path is overwritten.
 *
 * @param missingWords - Words from the wordlist that were not found in the database.
 */
const writeMissingWordsToFile = async (missingWords: string[]) => {
  const fileContents = missingWords.join("\n");
  await fs.writeFile(`${pathToWordlist}-missing`, fileContents, "utf8");
};
|
||||||
|
|
||||||
|
/**
 * Determines which supported languages fully cover a set of terms.
 *
 * A language is reported when every distinct term ID in `termIds` has at least
 * one row in the translations table for that language. The source language is
 * never reported.
 *
 * @param sourceLanguage - Language code to exclude from the check.
 * @param termIds - Term IDs that must all be translated; duplicates are ignored.
 * @returns Codes of the fully covered languages, in the order they appear in
 *   SUPPORTED_LANGUAGE_CODES. Empty when `termIds` is empty.
 */
const validateLanguages = async (
  sourceLanguage: string,
  termIds: string[],
): Promise<string[]> => {
  // Work on distinct IDs so a repeated ID cannot skew the coverage target.
  const uniqueTermIds = [...new Set(termIds)];

  // With nothing to check, no language can claim coverage; returning early also
  // avoids issuing a query with an empty IN list.
  if (uniqueTermIds.length === 0) {
    return [];
  }

  // Every supported language except the source is a candidate.
  const candidateLanguages = SUPPORTED_LANGUAGE_CODES.filter(
    (language) => language !== sourceLanguage,
  );

  const validatedLanguages: string[] = [];

  for (const language of candidateLanguages) {
    const rows = await db
      .select({ termId: translations.term_id })
      .from(translations)
      .where(
        and(
          inArray(translations.term_id, uniqueTermIds),
          eq(translations.language_code, language),
        ),
      );

    // Count distinct terms rather than rows: a term may have several
    // translations in one language, and comparing raw row counts would let
    // those extra rows hide a term that has none (or reject a complete set).
    const coveredTermIds = new Set(rows.map((row) => row.termId));

    if (coveredTermIds.size === uniqueTermIds.length) {
      validatedLanguages.push(language);
    }
  }

  return validatedLanguages;
};
|
||||||
|
|
||||||
|
/**
 * Looks up the deck managed by this script.
 *
 * Matches on both the deck name and the source language configured at the top
 * of the file.
 *
 * @returns The first matching deck's id and validated languages, or `null`
 *   when no deck matches.
 */
const findExistingDeck = async () => {
  const matchesThisDeck = and(
    eq(decks.name, nameOfDeck),
    eq(decks.source_language, sourceLanguage),
  );

  const [firstMatch] = await db
    .select({ id: decks.id, validatedForLanguages: decks.validated_languages })
    .from(decks)
    .where(matchesThisDeck);

  return firstMatch ?? null;
};
|
||||||
|
|
||||||
const main = async () => {
|
const main = async () => {
|
||||||
// Read and normalise the word list
|
// reading from source file
|
||||||
console.log("📖 Reading word list...");
|
console.log("📖 Reading word list...");
|
||||||
const sourceWords = await readingFromWordlist();
|
const sourceWords = await readingFromWordlist();
|
||||||
console.log(` ${sourceWords.length} words loaded\n`);
|
console.log(` ${sourceWords.length} words loaded\n`);
|
||||||
|
|
||||||
// check if sourceWords exist in database
|
// checking if sourceWords exist in database
|
||||||
console.log("🔍 Checking against database...");
|
console.log("🔍 Checking against database...");
|
||||||
const { termIds, missingWords } =
|
const { termIds, missingWords } =
|
||||||
await checkingSourceWordsAgainstDB(sourceWords);
|
await checkingSourceWordsAgainstDB(sourceWords);
|
||||||
console.log("words found in db: ", termIds.length);
|
console.log("words found in db: ", termIds.length);
|
||||||
console.log("words NOT found in db: ", missingWords.length);
|
console.log("words NOT found in db: ", missingWords.length, "\n");
|
||||||
|
|
||||||
// write missing words to file
|
// writing missing words to file
|
||||||
|
console.log("writing missing words to file...\n");
|
||||||
await writeMissingWordsToFile(missingWords);
|
await writeMissingWordsToFile(missingWords);
|
||||||
|
|
||||||
|
// validating languages
|
||||||
|
console.log("validation languages...");
|
||||||
|
const validatedLanguages = await validateLanguages(sourceLanguage, termIds);
|
||||||
|
console.log("validated these languages: ", validatedLanguages, "\n");
|
||||||
};
|
};
|
||||||
|
|
||||||
main().catch((error) => {
|
main().catch((error) => {
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue