87 lines
2.4 KiB
TypeScript
87 lines
2.4 KiB
TypeScript
import Database from "better-sqlite3";
|
|
import path from "node:path";
|
|
import fs from "node:fs";
|
|
import { fileURLToPath } from "node:url";
|
|
|
|
// ES modules do not provide `__dirname`, so derive it from this module's URL.
const modulePath = fileURLToPath(import.meta.url);
const __dirname = path.dirname(modulePath);

// The database file is resolved relative to this script's own directory.
const DB_PATH = path.join(__dirname, "db/pipeline.db");

// Open the database read-only.
const db = new Database(DB_PATH, { readonly: true });
|
|
|
|
// Pull up to 50 synsets that have at least one German translation, spread
// evenly across parts of speech (about 12 each when there are four POS values).
//
// A plain `ORDER BY RANDOM() LIMIT 50` samples each POS in proportion to how
// many synsets it has, which does not give an even split. Instead, every
// candidate gets a random rank inside its own POS and rows are taken rank by
// rank (all rank-1 rows, then rank-2, ...) until 50 are collected. Ties within
// a rank are broken randomly so no POS is systematically favoured at the cutoff.
//
// Requires SQLite window-function support (SQLite 3.25 or newer).
const synsets = db
  .prepare(
    `
    WITH candidates AS (
      SELECT DISTINCT s.source_id, s.pos
      FROM synsets s
      JOIN translations t ON t.source_id = s.source_id
      WHERE t.language = 'de'
    ),
    ranked AS (
      SELECT
        source_id,
        pos,
        ROW_NUMBER() OVER (PARTITION BY pos ORDER BY RANDOM()) AS pos_rank
      FROM candidates
    )
    SELECT source_id, pos
    FROM ranked
    ORDER BY pos_rank, RANDOM()
    LIMIT 50
    `,
  )
  .all() as { source_id: string; pos: string }[];
|
|
|
|
// Compile each statement once and reuse it for every synset instead of
// re-preparing the same SQL on each loop iteration.
const glossesStmt = db.prepare(
  "SELECT language, text FROM glosses WHERE source_id = ?",
);
const wordsStmt = db.prepare(
  "SELECT word FROM translations WHERE source_id = ? AND language = ?",
);

/** Returns the translation words stored for `sourceId` in `language`. */
function wordsFor(sourceId: string, language: string): string[] {
  const rows = wordsStmt.all(sourceId, language) as { word: string }[];
  return rows.map((row) => row.word);
}

// One formatted audit entry per sampled synset, numbered from 1 in sample order.
const results = synsets.map((synset, i) => {
  const glosses = glossesStmt.all(synset.source_id) as {
    language: string;
    text: string;
  }[];

  // A missing gloss is rendered as an em dash rather than left blank.
  const glossIn = (language: string): string =>
    glosses.find((g) => g.language === language)?.text ?? "—";

  const enWords = wordsFor(synset.source_id, "en");
  const deWords = wordsFor(synset.source_id, "de");

  return [
    `${String(i + 1).padStart(2, " ")}. [${synset.pos}] ${synset.source_id}`,
    ` EN gloss: ${glossIn("en")}`,
    ` DE gloss: ${glossIn("de")}`,
    ` EN words: ${enWords.join(", ")}`,
    ` DE words: ${deWords.join(", ")}`,
    ` QUALITY: ___`,
    ``,
  ].join("\n");
});
|
|
|
|
// Fixed preamble of the audit sheet: title, reviewer instructions, and the
// legend for the QUALITY field, terminated by a horizontal rule.
const reportHeader: string[] = [
  "# OMW German Translation Quality Audit",
  "",
  "Instructions: for each entry, check if the German translations",
  "match the meaning described by the English gloss.",
  "",
  "Mark QUALITY as:",
  " OK — all German translations fit the meaning",
  " PARTIAL — some fit, some don't",
  " BAD — none of the German translations fit",
  " USELESS — translations are correct but useless for learners",
  "",
  "---",
  "",
];

// Header lines followed by the per-synset entries, one item per line group.
const report = reportHeader.concat(results).join("\n");

// The sheet is written next to this script as audit.md.
const reportPath = path.join(__dirname, "audit.md");
fs.writeFileSync(reportPath, report, "utf-8");
console.log(`Wrote ${synsets.length} entries → ${reportPath}`);

// All queries are done; release the database handle.
db.close();
|