feat: add db schema, init, and vitest config

2026-05-03 17:56:29 +02:00 · 2026-05-03 17:56:29 +02:00 · 4fa3073412
commit 4fa3073412
parent 74cfc82bdd
13 changed files with 248 additions and 8 deletions
--- a/data-pipeline/sample/output/sample.json
+++ b/data-pipeline/sample/output/sample.json
--- a/data-pipeline/sample/scripts/sample.ts
+++ b/data-pipeline/sample/scripts/sample.ts
@@ -0,0 +1,205 @@
+import fs from "node:fs/promises";
+import path from "node:path";
+import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
+import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
+
+// ── Types ─────────────────────────────────────────────────────────────────────
+
+// One example sentence together with the tag of the source it came from.
+type Example = { text: string; source: "omw" | "cefr" };
+
+// Shape this script assumes for every entry of the annotated JSON files it
+// reads. The files are parsed with a plain type assertion, so nothing here is
+// validated at runtime.
+type AnnotatedRecord = {
+  // Identifier used as the merge key across language files and as the
+  // de-duplication key while sampling.
+  source_id: string;
+  pos: SupportedPos;
+  // The three maps below are keyed by language code; a language may be absent.
+  translations: Partial<Record<SupportedLanguageCode, string[]>>;
+  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
+  examples: Partial<Record<SupportedLanguageCode, Example[]>>;
+  // Per-language vote map. NOTE(review): the inner string key is presumably
+  // the voted word or level — confirm against the stage-2 output format.
+  votes: Partial<
+    Record<SupportedLanguageCode, Record<string, { cefr_source: string }>>
+  >;
+};
+
+// An annotated record tagged with the name of the bucket it was sampled for.
+type SampleRecord = AnnotatedRecord & { _sample_bucket: string };
+
+// ── Constants ─────────────────────────────────────────────────────────────────
+
+// File-system locations used by this script. Both are relative paths, so they
+// resolve against the working directory the script is launched from.
+const PATHS = {
+  // Directory holding one `<lang>.json` file per supported language.
+  annotatedDir: "stage-2-annotate/output",
+  // Destination of the generated sample; parent directories are created on demand.
+  output: "test/output/sample.json",
+};
+
+// Upper bound on the number of records taken per bucket. The pos_spread bucket
+// splits this amount evenly (rounded down) across its parts of speech.
+const BUCKET_SIZE = 20;
+
+// ── Bucket predicates ─────────────────────────────────────────────────────────
+
+// A named sampling category plus the test a record must pass to belong to it.
+type Bucket = { name: string; predicate: (record: AnnotatedRecord) => boolean };
+
+// True when at least one language entry in `votes` contains one or more votes.
+const hasAnyVote = (rec: AnnotatedRecord): boolean => {
+  for (const perLanguage of Object.values(rec.votes)) {
+    if (Object.keys(perLanguage ?? {}).length > 0) return true;
+  }
+  return false;
+};
+
+// Buckets are evaluated in array order by the caller; a record already taken
+// by an earlier bucket is not offered to a later one.
+const BUCKETS: Bucket[] = [
+  { name: "has_cefr_vote", predicate: hasAnyVote },
+  {
+    // Exact complement of has_cefr_vote (also matches an empty `votes` map).
+    name: "no_cefr_vote",
+    predicate: (rec) => !hasAnyVote(rec),
+  },
+  {
+    name: "has_glosses_and_examples",
+    predicate: (rec) => {
+      // Only the presence of language keys is checked, not whether the
+      // arrays behind them are non-empty.
+      if (Object.keys(rec.glosses).length === 0) return false;
+      return Object.keys(rec.examples).length > 0;
+    },
+  },
+  {
+    // NOTE(review): despite the name, this also requires that no votes exist,
+    // and it only inspects "fr" and "es" — confirm that both are intended.
+    name: "no_glosses_no_examples",
+    predicate: (rec) =>
+      (["fr", "es"] as const).every(
+        (code) => !rec.glosses[code] && !rec.examples[code] && !rec.votes[code],
+      ),
+  },
+  {
+    // Placeholder entry: this bucket is filled by samplePosBucket, not by
+    // its predicate, so that every part of speech is represented.
+    name: "pos_spread",
+    predicate: () => true,
+  },
+];
+
+// ── Sampling ──────────────────────────────────────────────────────────────────
+
+/**
+ * Draws a uniformly random selection of records that satisfy a predicate.
+ *
+ * @param records   Full record set to draw from; it is not modified.
+ * @param predicate Test a record must pass to be eligible.
+ * @param size      Maximum number of records to return.
+ * @param exclude   `source_id`s that must not be selected; it is only read here.
+ * @returns Up to `size` eligible records in random order; fewer when not
+ *          enough candidates exist.
+ */
+function sampleBucket(
+  records: AnnotatedRecord[],
+  predicate: (r: AnnotatedRecord) => boolean,
+  size: number,
+  exclude: Set<string>,
+): AnnotatedRecord[] {
+  // Collect eligible records; the predicate only runs for non-excluded ones.
+  const pool: AnnotatedRecord[] = [];
+  for (const rec of records) {
+    if (exclude.has(rec.source_id)) continue;
+    if (predicate(rec)) pool.push(rec);
+  }
+
+  // In-place Fisher–Yates shuffle, walking from the tail towards the head.
+  let unshuffled = pool.length;
+  while (unshuffled > 1) {
+    const pick = Math.floor(Math.random() * unshuffled);
+    unshuffled -= 1;
+    const held = pool[unshuffled]!;
+    pool[unshuffled] = pool[pick]!;
+    pool[pick] = held;
+  }
+
+  return pool.slice(0, size);
+}
+
+/**
+ * Builds the pos_spread bucket by sampling an equal share of records for each
+ * part of speech.
+ *
+ * @param records Full record set to draw from.
+ * @param exclude `source_id`s already taken by other buckets.
+ * @returns The per-POS samples concatenated in noun, verb, adjective, adverb
+ *          order. A part of speech with too few candidates contributes fewer
+ *          records; the shortfall is not redistributed.
+ */
+function samplePosBucket(
+  records: AnnotatedRecord[],
+  exclude: Set<string>,
+): AnnotatedRecord[] {
+  const partsOfSpeech: SupportedPos[] = ["noun", "verb", "adjective", "adverb"];
+  // Equal share per part of speech, rounded down.
+  const quota = Math.floor(BUCKET_SIZE / partsOfSpeech.length);
+
+  return partsOfSpeech.flatMap((wanted) =>
+    sampleBucket(records, (rec) => rec.pos === wanted, quota, exclude),
+  );
+}
+
+// ── Loading ───────────────────────────────────────────────────────────────────
+
+/**
+ * Reads every per-language annotated file and folds them into one record set.
+ *
+ * The records of `en.json` form the base set. For each other supported
+ * language, votes are merged into the matching base record, and examples are
+ * copied over for languages the base record has no examples for. Records
+ * whose `source_id` does not occur in `en.json` are ignored.
+ *
+ * @returns The merged base records, in `en.json` order.
+ * @throws Rejects when a language file cannot be read or is not valid JSON.
+ */
+async function loadAnnotated(): Promise<AnnotatedRecord[]> {
+  // Reads and parses one `<code>.json` file; the parsed shape is trusted, not validated.
+  const readLanguageFile = async (code: string): Promise<AnnotatedRecord[]> => {
+    const text = await fs.readFile(
+      path.join(PATHS.annotatedDir, `${code}.json`),
+      "utf-8",
+    );
+    return JSON.parse(text) as AnnotatedRecord[];
+  };
+
+  // en.json supplies the base structure; index it by source_id for fast lookup.
+  const baseRecords = await readLanguageFile("en");
+  const byId = new Map<string, AnnotatedRecord>(
+    baseRecords.map((rec): [string, AnnotatedRecord] => [rec.source_id, rec]),
+  );
+
+  // Fold the remaining language files into the base records, one file at a time.
+  for (const fileLang of SUPPORTED_LANGUAGE_CODES) {
+    if (fileLang === "en") continue;
+
+    for (const incoming of await readLanguageFile(fileLang)) {
+      const target = byId.get(incoming.source_id);
+      if (!target) continue;
+
+      // Votes: create the per-language map if needed, then overlay the
+      // incoming entries (later files win on key clashes).
+      for (const [key, incomingVotes] of Object.entries(incoming.votes)) {
+        const code = key as SupportedLanguageCode;
+        let existingVotes = target.votes[code];
+        if (!existingVotes) {
+          existingVotes = {};
+          target.votes[code] = existingVotes;
+        }
+        Object.assign(existingVotes, incomingVotes);
+      }
+
+      // Examples: only fill languages the base record does not cover yet.
+      for (const [key, incomingExamples] of Object.entries(incoming.examples)) {
+        const code = key as SupportedLanguageCode;
+        if (target.examples[code]) continue;
+        target.examples[code] = incomingExamples;
+      }
+    }
+  }
+
+  return Array.from(byId.values());
+}
+
+// ── Main ─────────────────────────────────────────────────────────────────────
+
+/**
+ * Script entry point: loads the merged annotated records, samples every
+ * bucket without reusing a record, and writes the tagged sample as
+ * pretty-printed JSON to `PATHS.output`.
+ */
+async function main(): Promise<void> {
+  console.log("Loading annotated files...");
+  const records = await loadAnnotated();
+  console.log(`  Loaded ${records.length.toLocaleString()} synsets`);
+
+  const sampled: SampleRecord[] = [];
+  const seen = new Set<string>();
+
+  // Tags each pick with its bucket, marks it as used so later buckets skip
+  // it, and reports how many records the bucket produced.
+  const collect = (bucketName: string, picks: AnnotatedRecord[]): void => {
+    for (const pick of picks) {
+      seen.add(pick.source_id);
+      sampled.push({ ...pick, _sample_bucket: bucketName });
+    }
+    console.log(`  ${bucketName}: ${picks.length} records`);
+  };
+
+  // Predicate-driven buckets first; pos_spread is handled separately below.
+  for (const bucket of BUCKETS) {
+    if (bucket.name === "pos_spread") continue;
+    collect(
+      bucket.name,
+      sampleBucket(records, bucket.predicate, BUCKET_SIZE, seen),
+    );
+  }
+
+  // pos_spread draws an equal share per part of speech from what is left.
+  collect("pos_spread", samplePosBucket(records, seen));
+
+  console.log(`\nTotal sampled: ${sampled.length} records`);
+
+  // Write output, creating the target directory if it does not exist yet.
+  await fs.mkdir(path.dirname(PATHS.output), { recursive: true });
+  await fs.writeFile(PATHS.output, JSON.stringify(sampled, null, 2), "utf-8");
+  console.log(`Wrote sample → ${PATHS.output}`);
+}
+
+// Any failure is printed and turned into a non-zero exit code.
+main().catch((err) => {
+  console.error(err);
+  process.exit(1);
+});