lila/data-pipeline/audit.ts

87 lines
2.4 KiB
TypeScript

import Database from "better-sqlite3";
import path from "node:path";
import fs from "node:fs";
import { fileURLToPath } from "node:url";
// ES modules have no built-in __dirname, so derive it from this module's URL.
const moduleFile = fileURLToPath(import.meta.url);
const __dirname = path.dirname(moduleFile);

// The pipeline database is expected under db/ next to this script. This script
// only runs SELECTs against it, so the connection is opened read-only.
const DB_PATH = path.join(__dirname, "db/pipeline.db");
const openOptions = { readonly: true };
const db = new Database(DB_PATH, openOptions);
// Sample up to 50 synsets that have at least one German translation, balanced
// across parts of speech.
//
// A plain `ORDER BY RANDOM() LIMIT 50` mirrors the POS distribution of the
// table, so the sample would not be spread evenly over the parts of speech.
// Instead, each candidate gets a random rank inside its own POS and rows are
// taken rank by rank (all rank-1 rows, then rank-2, ...) until 50 are
// collected. Every POS therefore contributes roughly the same number of
// entries, and a POS with few candidates simply lets the others fill the
// remaining slots.
//
// Requires SQLite window functions (SQLite 3.25+).
const synsets = db
  .prepare(
    `
WITH candidates AS (
  SELECT DISTINCT s.source_id, s.pos
  FROM synsets s
  JOIN translations t ON t.source_id = s.source_id
  WHERE t.language = 'de'
),
ranked AS (
  SELECT
    source_id,
    pos,
    ROW_NUMBER() OVER (PARTITION BY pos ORDER BY RANDOM()) AS pos_rank
  FROM candidates
)
SELECT source_id, pos
FROM ranked
ORDER BY pos_rank, RANDOM()
LIMIT 50
`,
  )
  .all() as { source_id: string; pos: string }[];
// Build one pre-formatted audit entry per sampled synset.

// Prepare each lookup once up front; the statements are reused for every
// synset instead of being recompiled on every loop iteration.
const glossStmt = db.prepare(
  "SELECT language, text FROM glosses WHERE source_id = ?",
);
const wordsStmt = db.prepare(
  "SELECT word FROM translations WHERE source_id = ? AND language = ?",
);

// Placeholder printed when a gloss or word list is missing.
const MISSING = "—";

/**
 * Comma-separated words attached to a synset in one language.
 *
 * @param sourceId - `source_id` of the synset.
 * @param language - Language code as stored in `translations.language`.
 * @returns The joined words, or the "—" placeholder when there are none, so an
 *   empty list is rendered the same way as a missing gloss.
 */
const listWords = (sourceId: string, language: string): string => {
  const rows = wordsStmt.all(sourceId, language) as { word: string }[];
  return rows.length > 0 ? rows.map((row) => row.word).join(", ") : MISSING;
};

const results: string[] = [];
for (const [position, synset] of synsets.entries()) {
  const glosses = glossStmt.all(synset.source_id) as {
    language: string;
    text: string;
  }[];
  const enGloss = glosses.find((g) => g.language === "en")?.text ?? MISSING;
  const deGloss = glosses.find((g) => g.language === "de")?.text ?? MISSING;

  // Entries are numbered from 1 and right-aligned to two characters.
  const entryNumber = String(position + 1).padStart(2, " ");

  results.push(
    [
      `${entryNumber}. [${synset.pos}] ${synset.source_id}`,
      ` EN gloss: ${enGloss}`,
      ` DE gloss: ${deGloss}`,
      ` EN words: ${listWords(synset.source_id, "en")}`,
      ` DE words: ${listWords(synset.source_id, "de")}`,
      ` QUALITY: ___`,
      ``,
    ].join("\n"),
  );
}
// Fixed preamble of the audit sheet: title, reviewer instructions, the legend
// for the QUALITY field, and a horizontal rule separating it from the entries.
const reportHeader: string[] = [
  "# OMW German Translation Quality Audit",
  "",
  "Instructions: for each entry, check if the German translations",
  "match the meaning described by the English gloss.",
  "",
  "Mark QUALITY as:",
  " OK — all German translations fit the meaning",
  " PARTIAL — some fit, some don't",
  " BAD — none of the German translations fit",
  " USELESS — translations are correct but useless for learners",
  "",
  "---",
  "",
];

// Full document: the preamble followed by the pre-formatted synset entries,
// one array element per line group.
const output = reportHeader.concat(results).join("\n");
// Save the audit sheet next to this script, report how many entries it holds,
// then release the database handle.
const outPath = path.join(__dirname, "audit.md");
fs.writeFileSync(outPath, output, { encoding: "utf-8" });

const entryCount = synsets.length;
console.log(`Wrote ${entryCount} entries → ${outPath}`);

db.close();