docs: rewrite data-pipeline.md for Kaikki migration
This commit is contained in:
parent
87aeb072c5
commit
38d8b85228
4 changed files with 615 additions and 313 deletions
87
data-pipeline/audit.ts
Normal file
87
data-pipeline/audit.ts
Normal file
|
|
@ -0,0 +1,87 @@
|
|||
import Database from "better-sqlite3";
|
||||
import path from "node:path";
|
||||
import fs from "node:fs";
|
||||
import { fileURLToPath } from "node:url";
|
||||
|
||||
/**
 * Translation-quality audit sheet generator.
 *
 * Reads a random sample of synsets that have at least one German translation
 * from the pipeline database, collects their English/German glosses and
 * translation words, and writes a Markdown worksheet (`audit.md`, next to this
 * script) in which a reviewer fills in a QUALITY verdict for each entry.
 */

// ESM modules have no built-in __dirname; derive it from the module URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const DB_PATH = path.join(__dirname, "db/pipeline.db");

const db = new Database(DB_PATH, { readonly: true });

const results: string[] = [];

try {
  // Pull 50 random synsets, all of which must have German translations.
  // NOTE(review): the sample is NOT stratified by part of speech — the query
  // is a plain ORDER BY RANDOM() LIMIT 50, so the per-POS split varies from
  // run to run. If an even ~12-per-POS split is required, the query needs a
  // per-POS partition; confirm the intended behavior.
  const synsets = db
    .prepare(
      `
      SELECT DISTINCT s.source_id, s.pos
      FROM synsets s
      JOIN translations t ON t.source_id = s.source_id
      WHERE t.language = 'de'
      ORDER BY RANDOM()
      LIMIT 50
    `,
    )
    .all() as { source_id: string; pos: string }[];

  // Prepare the per-synset statements once; they are loop-invariant and are
  // only re-bound with a different source_id (and language) on each use.
  const glossStmt = db.prepare(
    "SELECT language, text FROM glosses WHERE source_id = ?",
  );
  const wordStmt = db.prepare(
    "SELECT word FROM translations WHERE source_id = ? AND language = ?",
  );

  /** Returns the translation words of one synset in the given language. */
  const wordsFor = (sourceId: string, language: string): string[] =>
    (wordStmt.all(sourceId, language) as { word: string }[]).map(
      (row) => row.word,
    );

  synsets.forEach((synset, i) => {
    const glosses = glossStmt.all(synset.source_id) as {
      language: string;
      text: string;
    }[];

    // Fall back to an em dash when a synset has no gloss in that language.
    const enGloss = glosses.find((g) => g.language === "en")?.text ?? "—";
    const deGloss = glosses.find((g) => g.language === "de")?.text ?? "—";

    const deWords = wordsFor(synset.source_id, "de");
    const enWords = wordsFor(synset.source_id, "en");

    // Entries are numbered from 1, right-aligned to two characters.
    results.push(
      [
        `${String(i + 1).padStart(2, " ")}. [${synset.pos}] ${synset.source_id}`,
        `    EN gloss: ${enGloss}`,
        `    DE gloss: ${deGloss}`,
        `    EN words: ${enWords.join(", ")}`,
        `    DE words: ${deWords.join(", ")}`,
        `    QUALITY: ___`,
        ``,
      ].join("\n"),
    );
  });
} finally {
  // Release the database handle even when a query throws.
  db.close();
}

const output = [
  "# OMW German Translation Quality Audit",
  "",
  "Instructions: for each entry, check if the German translations",
  "match the meaning described by the English gloss.",
  "",
  "Mark QUALITY as:",
  "  OK      — all German translations fit the meaning",
  "  PARTIAL — some fit, some don't",
  "  BAD     — none of the German translations fit",
  "  USELESS — translations are correct but useless for learners",
  "",
  "---",
  "",
  ...results,
].join("\n");

const outPath = path.join(__dirname, "audit.md");
fs.writeFileSync(outPath, output, "utf-8");
console.log(`Wrote ${results.length} entries → ${outPath}`);
|
||||
Loading…
Add table
Add a link
Reference in a new issue