feat(pipeline): add annotate stage

- write annotate.ts — matches CEFR source files against OMW translations
- match by word text + normalized POS
- add cefr_source vote to matched translations
- extract native example sentences from CEFR source files
- write one annotated JSON per language to stage-2-annotate/output/
- write conflicts.json for words with multiple CEFR levels
- update tsconfig to support all stage directories
- 2 German conflicts found (macht, bleiche)
- match counts: en 47k, fr 44k, de 26k, it 26k, es 26k
2026-04-21 12:01:56 +02:00 · 2026-04-21 12:01:56 +02:00 · 214a597e99
commit 214a597e99
parent 9ea35568e5
1 changed files with 227 additions and 0 deletions
--- a/data-pipeline/stage-2-annotate/scripts/annotate.ts
+++ b/data-pipeline/stage-2-annotate/scripts/annotate.ts
@ -0,0 +1,227 @@
+import fs from "node:fs/promises";
+import path from "node:path";
+import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
+import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
+
+// ── Types ────────────────────────────────────────────────────────────────────
+
+/** Example sentence carried over from the OMW extract. */
+type OmwExample = { text: string; source: "omw" };
+
+/** Example sentence taken from a CEFR source entry (`example_sentence_native`). */
+type CefrExample = { text: string; source: "cefr" };
+
+/** An example sentence tagged with the dataset it came from. */
+type Example = OmwExample | CefrExample;
+
+/** Shape this script expects for each element of the JSON array at `PATHS.omw`. */
+type OmwRecord = {
+  source_id: string;
+  pos: SupportedPos;
+  // Per-language string lists; a language may be absent.
+  translations: Partial<Record<SupportedLanguageCode, string[]>>;
+  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
+  examples: Partial<Record<SupportedLanguageCode, string[]>>;
+};
+
+/**
+ * Output record written to the per-language files: an `OmwRecord` whose
+ * examples are tagged with their source, plus CEFR votes.
+ */
+type AnnotatedRecord = {
+  source_id: string;
+  pos: SupportedPos;
+  translations: Partial<Record<SupportedLanguageCode, string[]>>;
+  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
+  examples: Partial<Record<SupportedLanguageCode, Example[]>>;
+  // language -> translation word (as spelled in `translations`) -> CEFR level
+  // found for it in that language's CEFR source.
+  votes: Partial<
+    Record<SupportedLanguageCode, Record<string, { cefr_source: string }>>
+  >;
+};
+
+/** Shape this script expects for each element of a `<lang>.json` CEFR source file. */
+type CefrSourceEntry = {
+  word: string;
+  // Raw POS label; normalized through `POS_NORMALIZE` before use.
+  pos: string;
+  // Only values present in `CEFR_LEVELS` are accepted.
+  cefr_level: string;
+  example_sentence_native?: string;
+};
+
+/** A word/POS pair that a CEFR source lists with more than one level. */
+type ConflictEntry = {
+  word: string;
+  pos: string;
+  language: SupportedLanguageCode;
+  levels: string[];
+};
+
+// ── Constants ─────────────────────────────────────────────────────────────────
+
+/**
+ * Maps POS labels found in CEFR source files to `SupportedPos`. Lookups use
+ * the lowercased, trimmed label; entries whose label is not listed here are
+ * skipped when the CEFR index is built.
+ */
+const POS_NORMALIZE: Record<string, SupportedPos> = {
+  noun: "noun",
+  n: "noun",
+  nom: "noun", // French
+  verb: "verb",
+  verbs: "verb",
+  v: "verb",
+  v1: "verb",
+  adjective: "adjective",
+  adjektiv: "adjective", // German
+  adj: "adjective",
+  adverb: "adverb",
+  adverbs: "adverb",
+  adv: "adverb",
+};
+
+/** Accepted `cefr_level` values; the check is exact (case-sensitive, untrimmed). */
+const CEFR_LEVELS = new Set(["A1", "A2", "B1", "B2", "C1", "C2"]);
+
+/**
+ * Input and output locations. These are relative paths and no base directory
+ * is applied in this file, so they resolve against the working directory the
+ * script is started from.
+ */
+const PATHS = {
+  omw: "stage-1-extract/output/omw.json",
+  cefrDir: "stage-2-annotate/sources/cefr",
+  outputDir: "stage-2-annotate/output",
+};
+
+// ── CEFR source loading ───────────────────────────────────────────────────────
+
+/**
+ * Lookup table for one language's CEFR source. Keys have the form
+ * `"word|pos"`, with the word lowercased and trimmed and the POS normalized to
+ * a `SupportedPos`; values hold the CEFR level and, when the source provides
+ * one, a native example sentence.
+ */
+type CefrIndex = Map<string, { level: string; example?: string }>;
+
+/**
+ * Loads the CEFR source file for one language and builds a lookup index.
+ *
+ * Entries are grouped by `"word|pos"` (word lowercased and trimmed, POS
+ * normalized through `POS_NORMALIZE`). An entry is skipped when it is not an
+ * object, when `word`, `pos` or `cefr_level` is not a string, when the POS
+ * label is unknown, or when the level is not in `CEFR_LEVELS`.
+ *
+ * A key seen with more than one distinct level is reported as a conflict and
+ * left out of the index. When a key repeats with the same level, the most
+ * recent non-empty example sentence is kept.
+ *
+ * @param lang - Language whose `<lang>.json` file under `PATHS.cefrDir` is read.
+ * @returns The index of unambiguous entries and the list of conflicts.
+ * @throws If the file cannot be read, is not valid JSON, or does not contain a
+ *   top-level array.
+ */
+async function loadCefrSource(
+  lang: SupportedLanguageCode,
+): Promise<{ index: CefrIndex; conflicts: ConflictEntry[] }> {
+  const filepath = path.join(PATHS.cefrDir, `${lang}.json`);
+  const raw = await fs.readFile(filepath, "utf-8");
+  const parsed: unknown = JSON.parse(raw);
+  if (!Array.isArray(parsed)) {
+    throw new Error(`Expected a JSON array of CEFR entries in ${filepath}`);
+  }
+  const entries = parsed as (Partial<CefrSourceEntry> | null)[];
+
+  // Group valid entries by key. The normalized word and POS are stored on the
+  // group itself so they never have to be recovered by splitting the key,
+  // which would break for words that contain "|".
+  const groups = new Map<
+    string,
+    {
+      word: string;
+      pos: SupportedPos;
+      level: string;
+      levels: Set<string>;
+      example?: string;
+    }
+  >();
+
+  for (const entry of entries) {
+    if (entry === null || typeof entry !== "object") continue;
+    const {
+      word,
+      pos: posLabel,
+      cefr_level: level,
+      example_sentence_native: example,
+    } = entry;
+    if (
+      typeof word !== "string" ||
+      typeof posLabel !== "string" ||
+      typeof level !== "string"
+    ) {
+      continue;
+    }
+
+    // Own-property check so labels such as "constructor" do not resolve to
+    // members inherited from Object.prototype.
+    const normalizedLabel = posLabel.toLowerCase().trim();
+    if (!Object.prototype.hasOwnProperty.call(POS_NORMALIZE, normalizedLabel)) {
+      continue;
+    }
+    const pos = POS_NORMALIZE[normalizedLabel];
+    if (!pos || !CEFR_LEVELS.has(level)) continue;
+
+    const normalizedWord = word.toLowerCase().trim();
+    const key = `${normalizedWord}|${pos}`;
+
+    let group = groups.get(key);
+    if (!group) {
+      group = { word: normalizedWord, pos, level, levels: new Set<string>() };
+      groups.set(key, group);
+    }
+    group.levels.add(level);
+    // Do not let a later duplicate without an example erase an earlier one.
+    if (example) group.example = example;
+  }
+
+  const index: CefrIndex = new Map();
+  const conflicts: ConflictEntry[] = [];
+
+  for (const [key, group] of groups) {
+    if (group.levels.size > 1) {
+      conflicts.push({
+        word: group.word,
+        pos: group.pos,
+        language: lang,
+        levels: [...group.levels],
+      });
+      continue;
+    }
+
+    index.set(key, {
+      level: group.level,
+      ...(group.example ? { example: group.example } : {}),
+    });
+  }
+
+  return { index, conflicts };
+}
+
+// ── Annotation ────────────────────────────────────────────────────────────────
+
+/**
+ * Builds the annotated form of one OMW record for a single target language.
+ *
+ * All OMW examples are copied and tagged `source: "omw"`. Each translation in
+ * `lang` whose `"word|pos"` key (word lowercased and trimmed) is present in
+ * `index` receives a `cefr_source` vote, and the entry's native example, if
+ * any, is appended to that language's examples tagged `source: "cefr"`. A
+ * given example text is appended at most once per record, even when several
+ * translations resolve to the same index entry.
+ *
+ * @returns The annotated record and the number of translations that matched.
+ */
+function annotateRecord(
+  record: OmwRecord,
+  lang: SupportedLanguageCode,
+  index: CefrIndex,
+): { annotated: AnnotatedRecord; matched: number } {
+  const annotated: AnnotatedRecord = {
+    source_id: record.source_id,
+    pos: record.pos,
+    translations: record.translations,
+    glosses: record.glosses,
+    examples: {},
+    votes: {},
+  };
+
+  // Convert OMW examples to the tagged format.
+  for (const [code, texts] of Object.entries(record.examples)) {
+    annotated.examples[code as SupportedLanguageCode] = texts.map(
+      (text): Example => ({ text, source: "omw" }),
+    );
+  }
+
+  const votes: Record<string, { cefr_source: string }> = {};
+  const nativeExamples: Example[] = [];
+  const seenExampleTexts = new Set<string>();
+  let matched = 0;
+
+  for (const word of record.translations[lang] ?? []) {
+    const cefrEntry = index.get(`${word.toLowerCase().trim()}|${record.pos}`);
+    if (!cefrEntry) continue;
+
+    matched++;
+    votes[word] = { cefr_source: cefrEntry.level };
+
+    if (cefrEntry.example && !seenExampleTexts.has(cefrEntry.example)) {
+      seenExampleTexts.add(cefrEntry.example);
+      nativeExamples.push({ text: cefrEntry.example, source: "cefr" });
+    }
+  }
+
+  // Only create the per-language entries when there is something to store,
+  // so unmatched records keep `votes` empty.
+  if (matched > 0) annotated.votes[lang] = votes;
+  if (nativeExamples.length > 0) {
+    annotated.examples[lang] = [
+      ...(annotated.examples[lang] ?? []),
+      ...nativeExamples,
+    ];
+  }
+
+  return { annotated, matched };
+}
+
+/**
+ * Runs the annotate stage.
+ *
+ * Reads the OMW extract from `PATHS.omw`, loads the CEFR source of every
+ * language in `SUPPORTED_LANGUAGE_CODES`, writes all detected conflicts to
+ * `conflicts.json`, then writes one `<lang>.json` file per language under
+ * `PATHS.outputDir`. Each per-language file contains every OMW record; votes
+ * and CEFR examples are added only for that file's language.
+ *
+ * @throws If any input cannot be read or parsed, or an output cannot be written.
+ */
+async function annotate(): Promise<void> {
+  // Load OMW records
+  console.log("Reading OMW extract...");
+  const raw = await fs.readFile(PATHS.omw, "utf-8");
+  const omwRecords = JSON.parse(raw) as OmwRecord[];
+  console.log(`  Loaded ${omwRecords.length.toLocaleString()} synsets`);
+
+  // Load CEFR sources for all languages. The loads are independent, so they
+  // run concurrently; results keep the order of SUPPORTED_LANGUAGE_CODES.
+  console.log("\nLoading CEFR source files...");
+  const sources = await Promise.all(
+    [...SUPPORTED_LANGUAGE_CODES].map(async (lang) => ({
+      lang,
+      ...(await loadCefrSource(lang)),
+    })),
+  );
+
+  const allConflicts: ConflictEntry[] = [];
+  for (const { lang, index, conflicts } of sources) {
+    allConflicts.push(...conflicts);
+    console.log(
+      `  ${lang}: ${index.size.toLocaleString()} entries, ${conflicts.length} conflicts`,
+    );
+  }
+
+  // Write conflicts file
+  await fs.mkdir(PATHS.outputDir, { recursive: true });
+  await fs.writeFile(
+    path.join(PATHS.outputDir, "conflicts.json"),
+    JSON.stringify(allConflicts, null, 2),
+    "utf-8",
+  );
+  console.log(
+    `\nWrote ${allConflicts.length} conflicts → ${PATHS.outputDir}/conflicts.json`,
+  );
+
+  // Annotate and write one file per language. Kept sequential so only one
+  // language's annotated record list is held in memory at a time.
+  console.log("\nAnnotating...");
+  for (const { lang, index } of sources) {
+    const records: AnnotatedRecord[] = [];
+    let matched = 0;
+
+    for (const record of omwRecords) {
+      const result = annotateRecord(record, lang, index);
+      matched += result.matched;
+      records.push(result.annotated);
+    }
+
+    const outputFile = path.join(PATHS.outputDir, `${lang}.json`);
+    await fs.writeFile(outputFile, JSON.stringify(records, null, 2), "utf-8");
+    console.log(
+      `  ${lang}: ${matched.toLocaleString()} matches → ${outputFile}`,
+    );
+  }
+}
+
+// ── Main ─────────────────────────────────────────────────────────────────────
+
+// Script entry point: run the stage; if it rejects, print the error and
+// terminate the process with exit status 1.
+const abortWithError = (reason: unknown): never => {
+  console.error(reason);
+  process.exit(1);
+};
+
+annotate().then(undefined, abortWithError);