// lila/data-pipeline/test/scripts/sample.ts

import fs from "node:fs/promises";
import path from "node:path";
import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";

// ── Types ─────────────────────────────────────────────────────────────────────

/**
 * A single usage example attached to a record.
 * `source` tags the origin of the example with one of two literals,
 * "omw" or "cefr".
 */
type Example = { text: string; source: "omw" | "cefr" };

/**
 * One record of an annotated `<lang>.json` file as this script reads it.
 * The shape is asserted after `JSON.parse`; it is not validated at runtime.
 */
type AnnotatedRecord = {
  // Key used to match the same record across language files and to keep a
  // record from being sampled twice.
  source_id: string;
  // Part of speech; drives the per-POS sampling in `samplePosBucket`.
  pos: SupportedPos;
  // Per-language lists. A language key may be absent (hence `Partial`).
  translations: Partial<Record<SupportedLanguageCode, string[]>>;
  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
  examples: Partial<Record<SupportedLanguageCode, Example[]>>;
  // Per language: a map from a string key to the source of its CEFR vote.
  // NOTE(review): the inner string key is presumably the voted word —
  // confirm against the annotate stage.
  votes: Partial<
    Record<SupportedLanguageCode, Record<string, { cefr_source: string }>>
  >;
};

/** An annotated record plus the name of the bucket that selected it. */
type SampleRecord = AnnotatedRecord & { _sample_bucket: string };

// ── Constants ─────────────────────────────────────────────────────────────────

// Input/output locations. Both are relative paths, so they resolve against
// the working directory the script is started from.
const PATHS = {
  // Directory holding one annotated `<lang>.json` file per language.
  annotatedDir: "stage-2-annotate/output",
  // Destination of the generated sample; its parent directory is created
  // on demand before writing.
  output: "test/output/sample.json",
};

// Maximum number of records drawn per bucket. The POS bucket divides this
// evenly (rounded down) across its parts of speech.
const BUCKET_SIZE = 20;

// ── Bucket predicates ─────────────────────────────────────────────────────────

/** A named sampling bucket and the test a record must pass to belong to it. */
type Bucket = {
  name: string;
  predicate: (record: AnnotatedRecord) => boolean;
};

/**
 * Reports whether any language entry in `record.votes` holds at least one
 * key. Missing (nullish) language entries count as empty.
 */
function hasAnyCefrVote(record: AnnotatedRecord): boolean {
  for (const langVotes of Object.values(record.votes)) {
    if (Object.keys(langVotes ?? {}).length > 0) {
      return true;
    }
  }
  return false;
}

// Languages inspected by the "no_glosses_no_examples" bucket.
const EMPTY_CHECK_LANGS = ["fr", "es"] as const;

/**
 * Buckets in the order they are sampled. Records taken by an earlier bucket
 * are excluded from later ones by the caller, so order matters.
 */
const BUCKETS: Bucket[] = [
  // At least one language carries a vote.
  { name: "has_cefr_vote", predicate: hasAnyCefrVote },
  // No language carries a vote (exact complement of the bucket above).
  { name: "no_cefr_vote", predicate: (record) => !hasAnyCefrVote(record) },
  {
    // Both maps have at least one language key; the lists themselves are
    // not inspected, so a key with an empty list still counts.
    name: "has_glosses_and_examples",
    predicate: ({ glosses, examples }) =>
      [glosses, examples].every((field) => Object.keys(field).length > 0),
  },
  {
    // NOTE(review): despite the name, this also requires that there be no
    // votes entry, and it only looks at the two languages listed in
    // EMPTY_CHECK_LANGS — confirm that both are intended.
    name: "no_glosses_no_examples",
    predicate: (record) =>
      EMPTY_CHECK_LANGS.every(
        (code) =>
          !record.glosses[code] &&
          !record.examples[code] &&
          !record.votes[code],
      ),
  },
  // Matches everything; this bucket is sampled separately to ensure POS
  // coverage, so the predicate here is only a placeholder.
  { name: "pos_spread", predicate: () => true },
];

// ── Sampling ──────────────────────────────────────────────────────────────────

/**
 * Draws up to `size` random records that satisfy `predicate` and whose
 * `source_id` is not in `exclude`.
 *
 * @param records   Full record set; it is not modified.
 * @param predicate Membership test for the bucket.
 * @param size      Maximum number of records to return.
 * @param exclude   `source_id`s that must not be picked.
 * @returns A random selection; shorter than `size` when fewer candidates exist.
 */
function sampleBucket(
  records: AnnotatedRecord[],
  predicate: (r: AnnotatedRecord) => boolean,
  size: number,
  exclude: Set<string>,
): AnnotatedRecord[] {
  // Collect eligible records into a fresh array so the shuffle below never
  // touches the caller's array.
  const pool: AnnotatedRecord[] = [];
  for (const record of records) {
    if (exclude.has(record.source_id)) continue;
    if (predicate(record)) pool.push(record);
  }

  // Fisher–Yates shuffle, walking from the last slot down to index 1.
  let last = pool.length - 1;
  while (last > 0) {
    const pick = Math.floor(Math.random() * (last + 1));
    const held = pool[last]!;
    pool[last] = pool[pick]!;
    pool[pick] = held;
    last -= 1;
  }

  return pool.slice(0, size);
}

/**
 * Builds the POS-coverage bucket by drawing an equal share of records for
 * each part of speech in a fixed list.
 *
 * @param records Full record set.
 * @param exclude `source_id`s that must not be picked.
 * @returns The per-POS samples concatenated in list order.
 */
function samplePosBucket(
  records: AnnotatedRecord[],
  exclude: Set<string>,
): AnnotatedRecord[] {
  const posList: SupportedPos[] = ["noun", "verb", "adjective", "adverb"];
  // Each part of speech gets an equal, rounded-down share of the bucket.
  const quota = Math.floor(BUCKET_SIZE / posList.length);

  return posList.flatMap((pos) =>
    sampleBucket(records, (record) => record.pos === pos, quota, exclude),
  );
}

// ── Loading ───────────────────────────────────────────────────────────────────

/**
 * Loads every annotated language file and folds them into one record set.
 *
 * `en.json` supplies the base records. For each other supported language,
 * records whose `source_id` exists in the base contribute:
 *   - votes: merged key by key into the base (later files overwrite equal keys);
 *   - examples: copied only for languages the base has no examples for yet.
 * Records absent from `en.json` are ignored. Files are read one at a time,
 * in `SUPPORTED_LANGUAGE_CODES` order.
 *
 * @returns The base records, mutated in place with the merged data.
 */
async function loadAnnotated(): Promise<AnnotatedRecord[]> {
  // Reads and parses one file from the annotated directory. The parsed JSON
  // is trusted to match AnnotatedRecord[]; nothing is validated.
  const readLanguageFile = async (
    fileName: string,
  ): Promise<AnnotatedRecord[]> => {
    const text = await fs.readFile(
      path.join(PATHS.annotatedDir, fileName),
      "utf-8",
    );
    return JSON.parse(text) as AnnotatedRecord[];
  };

  // Index the English records by source_id for constant-time lookup.
  const byId = new Map<string, AnnotatedRecord>();
  for (const seed of await readLanguageFile("en.json")) {
    byId.set(seed.source_id, seed);
  }

  for (const code of SUPPORTED_LANGUAGE_CODES) {
    if (code === "en") continue;

    for (const incoming of await readLanguageFile(`${code}.json`)) {
      const target = byId.get(incoming.source_id);
      if (!target) continue;

      // Votes: extend the language's existing map, or start a new one.
      for (const [key, langVotes] of Object.entries(incoming.votes)) {
        const voteLang = key as SupportedLanguageCode;
        const existing = target.votes[voteLang];
        if (existing) {
          Object.assign(existing, langVotes);
        } else {
          target.votes[voteLang] = Object.assign({}, langVotes);
        }
      }

      // Examples: first writer wins; never replace what is already there.
      for (const [key, examples] of Object.entries(incoming.examples)) {
        const exampleLang = key as SupportedLanguageCode;
        if (target.examples[exampleLang]) continue;
        target.examples[exampleLang] = examples as Example[];
      }
    }
  }

  return Array.from(byId.values());
}

// ── Main ─────────────────────────────────────────────────────────────────────

/**
 * Script body: loads the merged records, samples every bucket in order
 * (with the POS bucket last), and writes the tagged sample as pretty-printed
 * JSON to `PATHS.output`. A record is sampled at most once across buckets.
 */
async function main(): Promise<void> {
  console.log("Loading annotated files...");
  const records = await loadAnnotated();
  console.log(`  Loaded ${records.length.toLocaleString()} synsets`);

  const sampled: SampleRecord[] = [];
  const seen = new Set<string>();

  // Registers one bucket's picks: marks them as taken, tags each copy with
  // the bucket name, and reports the count.
  const claim = (picked: AnnotatedRecord[], bucketName: string): void => {
    for (const record of picked) {
      seen.add(record.source_id);
      sampled.push({ ...record, _sample_bucket: bucketName });
    }
    console.log(`  ${bucketName}: ${picked.length} records`);
  };

  // Predicate-driven buckets first; pos_spread has its own sampler below.
  for (const { name, predicate } of BUCKETS) {
    if (name === "pos_spread") continue;
    claim(sampleBucket(records, predicate, BUCKET_SIZE, seen), name);
  }

  claim(samplePosBucket(records, seen), "pos_spread");

  console.log(`\nTotal sampled: ${sampled.length} records`);

  // Make sure the output directory exists before writing.
  const outputDir = path.dirname(PATHS.output);
  await fs.mkdir(outputDir, { recursive: true });
  const serialized = JSON.stringify(sampled, null, 2);
  await fs.writeFile(PATHS.output, serialized, "utf-8");
  console.log(`Wrote sample → ${PATHS.output}`);
}

// Entry point: run the script; on any failure print the error and exit
// with a non-zero status.
void main().catch((failure: unknown) => {
  console.error(failure);
  process.exit(1);
});