adding datafiles and seeding script
This commit is contained in:
parent
068949b4cb
commit
20fa6a9331
7 changed files with 349852 additions and 3 deletions
55
packages/db/src/check-noun-coverage.ts
Normal file
55
packages/db/src/check-noun-coverage.ts
Normal file
|
|
@ -0,0 +1,55 @@
|
||||||
|
import fs from "node:fs/promises";
|
||||||
|
import { db } from "@glossa/db";
|
||||||
|
import { translations } from "@glossa/db/schema";
|
||||||
|
import { inArray } from "drizzle-orm";
|
||||||
|
|
||||||
|
// Input word list read by `main`; one entry per line.
// Relative paths are resolved against the current working directory.
const wordlistPath = "./src/data/wordlists/top1000englishnouns";

// Destination file for the words that have no match in the database.
const unmatchedOutputPath = "./src/data/wordlists/top1000englishnouns-unmatched";
|
||||||
|
|
||||||
|
/**
 * Reports how many entries of the word list have a matching row in the
 * `translations` table.
 *
 * Steps: read and normalise the word list, look the words up in the database,
 * print matched / unmatched counts plus the coverage percentage, list the
 * unmatched words, and - only when the script is started with `--write` -
 * save the unmatched words to `unmatchedOutputPath`.
 *
 * @returns A promise that resolves once the report has been printed (and the
 *   file written, if requested). It rejects when reading the list, querying
 *   the database or writing the file fails.
 */
const main = async () => {
  // 1. Read and normalise the word list
  console.log("📖 Reading word list...");
  const raw = await fs.readFile(wordlistPath, "utf8");
  const words = raw
    .split("\n")
    .map((w) => w.trim().toLowerCase())
    .filter(Boolean);
  console.log(` ${words.length} words loaded\n`);

  // Nothing to check: avoids an empty IN (...) lookup and a NaN% coverage
  // figure caused by dividing by zero.
  if (words.length === 0) {
    console.log("⚠️ Word list is empty - nothing to check.");
    return;
  }

  // 2. Query DB for matches
  // NOTE(review): the words are lower-cased before the lookup while the stored
  // text is compared as-is; if that comparison is case-sensitive, capitalised
  // entries in the table will be reported as unmatched - confirm.
  console.log("🔍 Checking against database...");
  const rows = await db
    .select({ text: translations.text })
    .from(translations)
    .where(inArray(translations.text, words));

  const matchedSet = new Set(rows.map((r) => r.text.toLowerCase()));

  // 3. Split into matched / unmatched (word-list order is preserved)
  const matched = words.filter((w) => matchedSet.has(w));
  const unmatched = words.filter((w) => !matchedSet.has(w));

  // 4. Terminal output
  console.log(`✅ Matched: ${matched.length}/${words.length}`);
  console.log(`❌ Unmatched: ${unmatched.length}/${words.length}`);
  console.log(
    `📊 Coverage: ${((matched.length / words.length) * 100).toFixed(1)}%\n`,
  );

  if (unmatched.length > 0) {
    console.log("❌ Unmatched words:");
    for (const w of unmatched) {
      console.log(` ${w}`);
    }
  }

  // 5. Write unmatched to file - opt-in, so a routine check never overwrites
  //    an existing list, and the log only claims a write that really happened.
  if (process.argv.includes("--write")) {
    await fs.writeFile(unmatchedOutputPath, unmatched.join("\n"), "utf8");
    console.log(`\n💾 Unmatched words written to ${unmatchedOutputPath}`);
  } else {
    console.log(
      `\nℹ️ Unmatched words not written (pass --write to save them to ${unmatchedOutputPath})`,
    );
  }
};
|
||||||
|
|
||||||
|
// Script entry point: run `main`; if it rejects, print the error and exit
// with status code 1.
main().catch((err) => {
  console.error(err);
  process.exit(1);
});
|
||||||
49
packages/db/src/create-test-deck.ts
Normal file
49
packages/db/src/create-test-deck.ts
Normal file
|
|
@ -0,0 +1,49 @@
|
||||||
|
/*
|
||||||
|
Parse CLI args → resolve the word list file path
|
||||||
|
Connect to the database
|
||||||
|
Read the word list file into an ordered array of strings
|
||||||
|
Look up the en→it language pair ID from language_pairs
|
||||||
|
Batch-fetch all matching rows from translations where language_code = 'en' and text IN (words)
|
||||||
|
Build a word → termId map from the results
|
||||||
|
Walk the ordered word list → split into hits (word found, capture position) and misses (skip)
|
||||||
|
Check if a deck with this name already exists → if so, delete its deck_terms then the deck itself
|
||||||
|
Insert the new decks row
|
||||||
|
Insert all deck_terms rows in batches (deckId, termId, position)
|
||||||
|
Log the skipped words
|
||||||
|
Close the DB connection
|
||||||
|
*/
|
||||||
|
|
||||||
|
import fs from "node:fs/promises";
|
||||||
|
import { db } from "@glossa/db";
|
||||||
|
import { translations } from "@glossa/db/schema";
|
||||||
|
import { inArray, and, eq } from "drizzle-orm";
|
||||||
|
|
||||||
|
// Input word list read by `main`; one entry per line.
// Relative paths are resolved against the current working directory.
const wordlistPath = "./src/data/wordlists/top1000englishnouns";
|
||||||
|
|
||||||
|
/**
 * Reads the word list, fetches the English rows of the `translations` table
 * whose text is in the list, and prints the words that were found.
 *
 * Only the lookup part of the deck-creation plan is implemented here; no
 * deck rows are created.
 *
 * @returns A promise that resolves once the matching words have been logged.
 *   It rejects when reading the list or querying the database fails.
 */
const main = async () => {
  // Read and normalise the word list
  console.log("📖 Reading word list...");
  const raw = await fs.readFile(wordlistPath, "utf8");
  const words = raw
    .split("\n")
    .map((w) => w.trim().toLowerCase())
    .filter(Boolean);
  console.log(` ${words.length} words loaded\n`);

  // Nothing to look up: skip the query (an empty IN (...) list is rejected by
  // older drizzle-orm releases) and report an empty result.
  if (words.length === 0) {
    const wordsInDb: string[] = [];
    console.log("wordsInDb: ", wordsInDb);
    return;
  }

  // Query DB for matches, restricted to English rows so that a translation in
  // another language that happens to share the spelling is not counted.
  // NOTE(review): the `language_code` column name is taken from the plan in
  // this file's header comment - confirm against the schema.
  console.log("🔍 Checking against database...");
  const rows = await db
    .select({ text: translations.text, termId: translations.term_id })
    .from(translations)
    .where(
      and(
        eq(translations.language_code, "en"),
        inArray(translations.text, words),
      ),
    );

  const matchedSet = new Set(rows.map((r) => r.text.toLowerCase()));
  // Keep word-list order for the hits
  const wordsInDb = words.filter((w) => matchedSet.has(w));

  console.log("wordsInDb: ", wordsInDb);
};
|
||||||
|
|
||||||
|
// Script entry point: run `main`; if it rejects, print the error and exit
// with status code 1.
main().catch((err) => {
  console.error(err);
  process.exit(1);
});
|
||||||
348711
packages/db/src/data/datafiles/en-it-noun.json
Normal file
348711
packages/db/src/data/datafiles/en-it-noun.json
Normal file
File diff suppressed because it is too large
Load diff
1000
packages/db/src/data/wordlists/top1000englishnouns
Normal file
1000
packages/db/src/data/wordlists/top1000englishnouns
Normal file
File diff suppressed because it is too large
Load diff
34
packages/db/src/data/wordlists/top1000englishnouns-unmatched
Normal file
34
packages/db/src/data/wordlists/top1000englishnouns-unmatched
Normal file
|
|
@ -0,0 +1,34 @@
|
||||||
|
a
|
||||||
|
other
|
||||||
|
us
|
||||||
|
may
|
||||||
|
st
|
||||||
|
paul
|
||||||
|
new
|
||||||
|
software
|
||||||
|
oxford
|
||||||
|
english
|
||||||
|
mary
|
||||||
|
japan
|
||||||
|
while
|
||||||
|
pp
|
||||||
|
membership
|
||||||
|
manchester
|
||||||
|
tony
|
||||||
|
alan
|
||||||
|
jones
|
||||||
|
un
|
||||||
|
northern
|
||||||
|
simon
|
||||||
|
behalf
|
||||||
|
co
|
||||||
|
graham
|
||||||
|
joe
|
||||||
|
guy
|
||||||
|
lewis
|
||||||
|
jane
|
||||||
|
taylor
|
||||||
|
co-operation
|
||||||
|
travel
|
||||||
|
self
|
||||||
|
thatcher
|
||||||
|
|
@ -21,7 +21,7 @@ type FileName = {
|
||||||
pos: POS;
|
pos: POS;
|
||||||
};
|
};
|
||||||
|
|
||||||
const dataDir = "../../scripts/datafiles/";
|
const dataDir = "./src/data/datafiles/";
|
||||||
|
|
||||||
const parseFilename = (filename: string): FileName => {
|
const parseFilename = (filename: string): FileName => {
|
||||||
const parts = filename.replace(".json", "").split("-");
|
const parts = filename.replace(".json", "").split("-");
|
||||||
|
|
|
||||||
|
|
@ -5,7 +5,7 @@
|
||||||
"moduleResolution": "NodeNext",
|
"moduleResolution": "NodeNext",
|
||||||
"outDir": "./dist",
|
"outDir": "./dist",
|
||||||
"resolveJsonModule": true,
|
"resolveJsonModule": true,
|
||||||
"types": ["vitest/globals"]
|
"types": ["vitest/globals"],
|
||||||
},
|
},
|
||||||
"include": ["src", "vitest.config.ts"]
|
"include": ["src", "vitest.config.ts"],
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue