adding datafiles and seeding script

This commit is contained in:
lila 2026-03-31 10:05:36 +02:00
parent 068949b4cb
commit 20fa6a9331
7 changed files with 349852 additions and 3 deletions

View file

@ -0,0 +1,55 @@
import fs from "node:fs/promises";
import { db } from "@glossa/db";
import { translations } from "@glossa/db/schema";
import { inArray } from "drizzle-orm";
// Input word list: plain text, one word per line. The relative path is
// resolved against the directory the script is started from.
const wordlistPath = "./src/data/wordlists/top1000englishnouns";
// Destination file for the words that were not found in the database.
const unmatchedOutputPath =
"./src/data/wordlists/top1000englishnouns-unmatched";
/**
 * Reports how many words of the word list exist in the `translations` table.
 *
 * Reads the file at `wordlistPath`, normalises every line (trim + lowercase,
 * blank lines dropped), looks the words up by `translations.text`, and prints
 * the matched/unmatched counts, the coverage percentage and each unmatched
 * word.
 *
 * @returns A promise that resolves once the report has been printed.
 * @throws Rejects when the word list cannot be read or the database query fails.
 */
const main = async () => {
  // 1. Read and normalise the word list
  console.log("📖 Reading word list...");
  const raw = await fs.readFile(wordlistPath, "utf8");
  const words = raw
    .split("\n")
    .map((w) => w.trim().toLowerCase())
    .filter(Boolean);
  console.log(` ${words.length} words loaded\n`);

  // 2. Query DB for matches
  console.log("🔍 Checking against database...");
  // An empty list cannot match anything, and some drizzle-orm versions reject
  // an empty array passed to inArray, so the query is skipped in that case.
  const rows =
    words.length > 0
      ? await db
          .select({ text: translations.text })
          .from(translations)
          .where(inArray(translations.text, words))
      : [];
  // NOTE(review): the lookup list is lowercased, so whether mixed-case rows are
  // returned at all depends on the column's collation — confirm.
  const matchedSet = new Set(rows.map((r) => r.text.toLowerCase()));

  // 3. Split into matched / unmatched
  const matched = words.filter((w) => matchedSet.has(w));
  const unmatched = words.filter((w) => !matchedSet.has(w));

  // 4. Terminal output
  // Guard the division so an empty word list reports 0.0% instead of NaN%.
  const coverage =
    words.length > 0 ? (matched.length / words.length) * 100 : 0;
  console.log(`✅ Matched: ${matched.length}/${words.length}`);
  console.log(`❌ Unmatched: ${unmatched.length}/${words.length}`);
  console.log(`📊 Coverage: ${coverage.toFixed(1)}%\n`);
  if (unmatched.length > 0) {
    console.log("❌ Unmatched words:");
    for (const w of unmatched) {
      console.log(` ${w}`);
    }
  }

  // 5. Write unmatched to file
  // The write is currently disabled, so the log must not claim that the file
  // was written. Re-enable the line below and restore a "written" message
  // together when the output file should be regenerated.
  // await fs.writeFile(unmatchedOutputPath, unmatched.join("\n"), "utf8");
  console.log(
    `\n💾 Write step disabled: unmatched words NOT written to ${unmatchedOutputPath}`,
  );
};
// Script entry point: run main and turn any rejection into a logged error
// plus a non-zero exit code.
main().catch((err) => {
  console.error(err);
  process.exit(1);
});

View file

@ -0,0 +1,49 @@
/*
Parse CLI args → resolve the word list file path
Connect to the database
Read the word list file into an ordered array of strings
Look up the en→it language pair ID from language_pairs
Batch-fetch all matching rows from translations where language_code = 'en' and text IN (words)
Build a word → termId map from the results
Walk the ordered word list → split into hits (word found, capture position) and misses (skip)
Check if a deck with this name already exists → if so, delete its deck_terms, then the deck itself
Insert the new decks row
Insert all deck_terms rows in batches (deckId, termId, position)
Log the skipped words
Close the DB connection
*/
import fs from "node:fs/promises";
import { db } from "@glossa/db";
import { translations } from "@glossa/db/schema";
import { inArray, and, eq } from "drizzle-orm";
// Input word list: plain text, one word per line. The relative path is
// resolved against the directory the script is started from.
const wordlistPath = "./src/data/wordlists/top1000englishnouns";
/**
 * Seeding entry point (work in progress).
 *
 * Currently reads the word list at `wordlistPath`, normalises every line
 * (trim + lowercase, blank lines dropped), fetches the `translations` rows
 * whose text is in the list, and prints the words that were found.
 *
 * NOTE(review): the step list at the top of this file restricts the lookup to
 * language_code = 'en'; this query does not filter by language yet — confirm
 * the column name and add the filter before the results are used for seeding.
 *
 * @returns A promise that resolves once the matched words have been printed.
 * @throws Rejects when the word list cannot be read or the database query fails.
 */
const main = async () => {
  // Read and normalise the word list
  console.log("📖 Reading word list...");
  const raw = await fs.readFile(wordlistPath, "utf8");
  const words = raw
    .split("\n")
    .map((w) => w.trim().toLowerCase())
    .filter(Boolean);
  console.log(` ${words.length} words loaded\n`);

  // Query DB for matches
  console.log("🔍 Checking against database...");
  // An empty list cannot match anything, and some drizzle-orm versions reject
  // an empty array passed to inArray, so the query is skipped in that case.
  // termId is selected but not consumed yet.
  const rows =
    words.length > 0
      ? await db
          .select({ text: translations.text, termId: translations.term_id })
          .from(translations)
          .where(inArray(translations.text, words))
      : [];
  const matchedSet = new Set(rows.map((r) => r.text.toLowerCase()));

  // Keep the word-list order (and any duplicates) for the words that matched.
  const wordsInDb = words.filter((w) => matchedSet.has(w));
  console.log("wordsInDb: ", wordsInDb);
};
// Script entry point: run main and turn any rejection into a logged error
// plus a non-zero exit code.
main().catch((err) => {
  console.error(err);
  process.exit(1);
});

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,34 @@
a
other
us
may
st
paul
new
software
oxford
english
mary
japan
while
pp
membership
manchester
tony
alan
jones
un
northern
simon
behalf
co
graham
joe
guy
lewis
jane
taylor
co-operation
travel
self
thatcher

View file

@ -21,7 +21,7 @@ type FileName = {
pos: POS;
};
const dataDir = "../../scripts/datafiles/";
const dataDir = "./src/data/datafiles/";
const parseFilename = (filename: string): FileName => {
const parts = filename.replace(".json", "").split("-");

View file

@ -5,7 +5,7 @@
"moduleResolution": "NodeNext",
"outDir": "./dist",
"resolveJsonModule": true,
"types": ["vitest/globals"]
"types": ["vitest/globals"],
},
"include": ["src", "vitest.config.ts"]
"include": ["src", "vitest.config.ts"],
}