import fs from "node:fs/promises";
import path from "node:path";

import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";

// ── Types ─────────────────────────────────────────────────────────────────────

/** A usage example together with the corpus it was taken from. */
type Example = { text: string; source: "omw" | "cefr" };

/**
 * One annotated synset as stored in `<lang>.json` under `PATHS.annotatedDir`.
 *
 * Every per-language field is partial: a language with no data is absent.
 *
 * NOTE(review): this script never inspects the element types of
 * `translations` and `glosses`, nor the values stored inside a language's
 * vote object, so those types are kept deliberately loose here — confirm
 * against the stage-2 annotate output schema and tighten if needed.
 */
type AnnotatedRecord = {
  source_id: string;
  pos: SupportedPos;
  translations: Partial<Record<SupportedLanguageCode, string[]>>;
  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
  examples: Partial<Record<SupportedLanguageCode, Example[]>>;
  votes: Partial<Record<SupportedLanguageCode, Record<string, unknown>>>;
};

/** An annotated record tagged with the name of the bucket it was sampled for. */
type SampleRecord = AnnotatedRecord & { _sample_bucket: string };

// ── Constants ─────────────────────────────────────────────────────────────────

const PATHS = {
  annotatedDir: "stage-2-annotate/output",
  output: "test/output/sample.json",
};

/** Maximum number of records sampled per bucket. */
const BUCKET_SIZE = 20;

/** Language whose file provides the base record set that others merge into. */
const BASE_LANGUAGE: SupportedLanguageCode = "en";

/** Name of the bucket that is sampled per part of speech instead of by predicate. */
const POS_SPREAD_BUCKET = "pos_spread";

// ── Bucket predicates ─────────────────────────────────────────────────────────

type Bucket = { name: string; predicate: (record: AnnotatedRecord) => boolean };

const BUCKETS: Bucket[] = [
  {
    // At least one language has a non-empty vote object.
    name: "has_cefr_vote",
    predicate: (r) =>
      Object.values(r.votes).some(
        (langVotes) => Object.keys(langVotes ?? {}).length > 0,
      ),
  },
  {
    // Every language present has an empty (or missing) vote object.
    name: "no_cefr_vote",
    predicate: (r) =>
      Object.values(r.votes).every(
        (langVotes) => Object.keys(langVotes ?? {}).length === 0,
      ),
  },
  {
    // At least one language key exists in both `glosses` and `examples`
    // (not necessarily the same language in each).
    name: "has_glosses_and_examples",
    predicate: (r) =>
      Object.keys(r.glosses).length > 0 && Object.keys(r.examples).length > 0,
  },
  {
    // Only "fr" and "es" are checked, and votes must be absent as well.
    // NOTE(review): presumably these are the languages of interest for this
    // sample; confirm the language list and the extra votes condition.
    name: "no_glosses_no_examples",
    predicate: (r) =>
      !r.glosses["fr"] &&
      !r.examples["fr"] &&
      !r.votes["fr"] &&
      !r.glosses["es"] &&
      !r.examples["es"] &&
      !r.votes["es"],
  },
  {
    name: POS_SPREAD_BUCKET,
    predicate: () => true, // sampled separately to ensure POS coverage
  },
];

// ── Sampling ──────────────────────────────────────────────────────────────────

/**
 * Picks up to `size` random records that satisfy `predicate`.
 *
 * Neither `records` nor `exclude` is modified; the shuffle runs on a filtered
 * copy.
 *
 * @param records - Pool to sample from.
 * @param predicate - Keeps only the records eligible for this bucket.
 * @param size - Maximum number of records to return.
 * @param exclude - `source_id`s that must not be picked.
 * @returns At most `size` records, in random order.
 */
function sampleBucket(
  records: readonly AnnotatedRecord[],
  predicate: (r: AnnotatedRecord) => boolean,
  size: number,
  exclude: ReadonlySet<string>,
): AnnotatedRecord[] {
  const candidates = records.filter(
    (r) => !exclude.has(r.source_id) && predicate(r),
  );

  // Fisher–Yates shuffle so that the leading `size` entries are a random sample.
  for (let i = candidates.length - 1; i > 0; i--) {
    const j = Math.floor(Math.random() * (i + 1));
    // Both indices are within bounds by construction (0 <= j <= i < length).
    [candidates[i], candidates[j]] = [candidates[j]!, candidates[i]!];
  }

  return candidates.slice(0, size);
}

/**
 * Samples an equal share of records for each part of speech.
 *
 * Each of the four parts of speech gets `floor(size / 4)` slots, so the result
 * may hold fewer than `size` records when `size` is not a multiple of four or
 * when a part of speech has too few eligible records.
 *
 * @param records - Pool to sample from.
 * @param exclude - `source_id`s that must not be picked.
 * @param size - Total budget to split across parts of speech.
 * @returns The sampled records, grouped by part of speech.
 */
function samplePosBucket(
  records: readonly AnnotatedRecord[],
  exclude: ReadonlySet<string>,
  size: number = BUCKET_SIZE,
): AnnotatedRecord[] {
  const posList: SupportedPos[] = ["noun", "verb", "adjective", "adverb"];
  const perPos = Math.floor(size / posList.length);

  // The per-POS predicates are disjoint, so no record can be picked twice
  // even though `exclude` is not updated between iterations.
  return posList.flatMap((pos) =>
    sampleBucket(records, (r) => r.pos === pos, perPos, exclude),
  );
}

// ── Loading ───────────────────────────────────────────────────────────────────

/**
 * Reads and parses `<lang>.json` from the annotated output directory.
 *
 * @param lang - Language code used as the file's base name.
 * @returns The parsed records.
 * @throws If the file cannot be read, is not valid JSON, or does not contain
 *   a top-level array.
 */
async function readAnnotatedFile(lang: string): Promise<AnnotatedRecord[]> {
  const file = path.join(PATHS.annotatedDir, `${lang}.json`);
  const parsed: unknown = JSON.parse(await fs.readFile(file, "utf-8"));

  if (!Array.isArray(parsed)) {
    throw new Error(`Expected a JSON array of annotated records in ${file}`);
  }

  // Only the top-level shape is verified; element shape is trusted.
  return parsed as AnnotatedRecord[];
}

/**
 * Loads every language file and merges them into a single record set.
 *
 * The base-language file supplies the records. For each other language file,
 * in `SUPPORTED_LANGUAGE_CODES` order:
 * - votes are merged key by key into the matching base record, with later
 *   files overwriting earlier ones on identical keys;
 * - examples are copied only for languages the base record has none for.
 *
 * Records whose `source_id` is missing from the base file are ignored.
 * Files are read one at a time so that only one non-base file is held in
 * memory at once.
 *
 * @returns The merged base records.
 * @throws If any language file is missing or malformed.
 */
async function loadAnnotated(): Promise<AnnotatedRecord[]> {
  const baseRecords = await readAnnotatedFile(BASE_LANGUAGE);

  // Index the base records for fast lookup by source_id.
  const byId = new Map<string, AnnotatedRecord>();
  for (const record of baseRecords) {
    byId.set(record.source_id, record);
  }

  for (const lang of SUPPORTED_LANGUAGE_CODES) {
    if (lang === BASE_LANGUAGE) continue;

    const records = await readAnnotatedFile(lang);

    for (const record of records) {
      const target = byId.get(record.source_id);
      if (!target) continue;

      // Merge votes.
      for (const [code, langVotes] of Object.entries(record.votes)) {
        const voteLang = code as SupportedLanguageCode;
        let merged = target.votes[voteLang];
        if (!merged) {
          merged = {};
          target.votes[voteLang] = merged;
        }
        Object.assign(merged, langVotes);
      }

      // Copy examples only for languages the base record lacks.
      for (const [code, examples] of Object.entries(record.examples)) {
        const exampleLang = code as SupportedLanguageCode;
        if (!target.examples[exampleLang]) {
          target.examples[exampleLang] = examples;
        }
      }
    }
  }

  return [...byId.values()];
}

// ── Main ─────────────────────────────────────────────────────────────────────

/**
 * Builds the sample file: loads the merged records, samples every bucket
 * without reusing a record across buckets, and writes the result as
 * pretty-printed JSON to `PATHS.output`.
 */
async function main(): Promise<void> {
  console.log("Loading annotated files...");
  const records = await loadAnnotated();
  console.log(` Loaded ${records.length.toLocaleString()} synsets`);

  const sampled: SampleRecord[] = [];
  const seen = new Set<string>();

  // Tags the picked records with their bucket and marks them as used so that
  // later buckets cannot pick them again.
  const collect = (picked: AnnotatedRecord[], bucketName: string): void => {
    for (const r of picked) {
      seen.add(r.source_id);
      sampled.push({ ...r, _sample_bucket: bucketName });
    }
  };

  // Sample each bucket except pos_spread
  for (const bucket of BUCKETS.filter((b) => b.name !== POS_SPREAD_BUCKET)) {
    const results = sampleBucket(records, bucket.predicate, BUCKET_SIZE, seen);
    collect(results, bucket.name);
    console.log(` ${bucket.name}: ${results.length} records`);
  }

  // Sample pos_spread bucket
  const posResults = samplePosBucket(records, seen);
  collect(posResults, POS_SPREAD_BUCKET);
  console.log(` pos_spread: ${posResults.length} records`);

  console.log(`\nTotal sampled: ${sampled.length} records`);

  // Write output
  await fs.mkdir(path.dirname(PATHS.output), { recursive: true });
  await fs.writeFile(PATHS.output, JSON.stringify(sampled, null, 2), "utf-8");
  console.log(`Wrote sample → ${PATHS.output}`);
}

main().catch((err: unknown) => {
  console.error(err);
  process.exit(1);
});