feat(pipeline): add annotate stage

- write annotate.ts — matches CEFR source files against OMW translations - match by word text + normalized POS - add cefr_source vote to matched translations - extract native example sentences from CEFR source files - write one annotated JSON per language to stage-2-annotate/output/ - write conflicts.json for words with multiple CEFR levels - update tsconfig to support all stage directories - 2 German conflicts found (macht, bleiche) - match rates: en 47k, fr 44k, de 26k, it 26k, es 26k
2026-04-21 12:01:56 +02:00 · 2026-04-21 12:01:56 +02:00 · 214a597e99
commit 214a597e99
parent 9ea35568e5
1 changed files with 227 additions and 0 deletions
--- a/data-pipeline/stage-2-annotate/scripts/annotate.ts
+++ b/data-pipeline/stage-2-annotate/scripts/annotate.ts
@ -0,0 +1,227 @@
 import fs from "node:fs/promises";
 import path from "node:path";
 import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
 import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
 // ── Types ────────────────────────────────────────────────────────────────────
 /** Example sentence carried over from the OMW extract. */
 type OmwExample = { text: string; source: "omw" };
 /** Native example sentence taken from a CEFR source file entry. */
 type CefrExample = { text: string; source: "cefr" };
 /** Any example sentence; the `source` tag tells the two origins apart. */
 type Example = OmwExample | CefrExample;
 /**
  * Shape assumed for each element of the OMW extract read from `PATHS.omw`.
  * The parsed JSON is cast to this type without runtime validation.
  */
 type OmwRecord = {
  source_id: string;
  // Compared against the normalized POS of CEFR entries when matching.
  pos: SupportedPos;
  // Per-language word lists; these are the words looked up in the CEFR index.
  translations: Partial<Record<SupportedLanguageCode, string[]>>;
  // Copied to the annotated output unchanged.
  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
  // Per-language plain example strings; re-tagged with source "omw" on output.
  examples: Partial<Record<SupportedLanguageCode, string[]>>;
 };
 /**
  * Output record written to the per-language annotated files: the OMW record
  * with source-tagged examples plus the CEFR votes found for its translations.
  */
 type AnnotatedRecord = {
  source_id: string;
  pos: SupportedPos;
  translations: Partial<Record<SupportedLanguageCode, string[]>>;
  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
  // OMW examples plus any native examples appended from matched CEFR entries.
  examples: Partial<Record<SupportedLanguageCode, Example[]>>;
  // language -> translation word (as spelled in `translations`) -> CEFR level
  // reported by the CEFR source file.
  votes: Partial<
    Record<SupportedLanguageCode, Record<string, { cefr_source: string }>>
  >;
 };
 /**
  * Shape assumed for each element of a per-language CEFR source file.
  * The parsed JSON is cast to this type without runtime validation.
  */
 type CefrSourceEntry = {
  word: string;
  // Raw part-of-speech label; mapped through POS_NORMALIZE before use.
  pos: string;
  // Only values contained in CEFR_LEVELS are accepted; other entries are skipped.
  cefr_level: string;
  // Optional example sentence; added to the output with source "cefr" on a match.
  example_sentence_native?: string;
 };
 /**
  * One word/POS pair that a CEFR source file lists under more than one level.
  * Such pairs are reported in conflicts.json and left out of the CEFR index.
  */
 type ConflictEntry = {
  // Lowercased, trimmed word text.
  word: string;
  // Normalized part of speech (a POS_NORMALIZE value).
  pos: string;
  language: SupportedLanguageCode;
  // Every distinct CEFR level seen for this word/POS pair.
  levels: string[];
 };
 // ── Constants ─────────────────────────────────────────────────────────────────
 /**
  * Maps the lowercased part-of-speech labels found in CEFR source files to the
  * shared POS vocabulary. Labels that are not listed resolve to `undefined`.
  *
  * The table is built on a null prototype because it is indexed with arbitrary
  * strings from input files: on an ordinary object literal, labels such as
  * "constructor" or "__proto__" would resolve to inherited members (truthy,
  * but not a POS) and slip past `if (!pos)` guards.
  */
 const POS_NORMALIZE: Record<string, SupportedPos> = Object.assign<
  Record<string, SupportedPos>,
  Record<string, SupportedPos>
 >(Object.create(null) as Record<string, SupportedPos>, {
  noun: "noun",
  n: "noun",
  nom: "noun", // French
  verb: "verb",
  verbs: "verb",
  v: "verb",
  v1: "verb",
  adjective: "adjective",
  adjektiv: "adjective", // German
  adj: "adjective",
  adverb: "adverb",
  adverbs: "adverb",
  adv: "adverb",
 });
 /** The CEFR level labels accepted from source files (exact, case-sensitive match). */
 const CEFR_LEVELS = new Set<string>([
  "A1",
  "A2",
  "B1",
  "B2",
  "C1",
  "C2",
 ]);
 /** Relative input and output locations used by this stage. */
 const PATHS = {
  // OMW extract that gets annotated.
  omw: "stage-1-extract/output/omw.json",
  // Directory holding one `<lang>.json` CEFR source file per language.
  cefrDir: "stage-2-annotate/sources/cefr",
  // Destination for the annotated per-language files and conflicts.json.
  outputDir: "stage-2-annotate/output",
 };
 // ── CEFR source loading ───────────────────────────────────────────────────────
 /**
  * Lookup from `"<lowercased, trimmed word>|<normalized pos>"` to the CEFR
  * level of that word and, when the source provides one, a native example.
  */
 type CefrIndex = Map<string, { level: string; example?: string }>;
 /**
  * Reads `<PATHS.cefrDir>/<lang>.json` and builds the CEFR lookup for one
  * language.
  *
  * Entries are grouped by lowercased/trimmed word plus normalized POS. Entries
  * that are malformed, whose POS label is not in POS_NORMALIZE, or whose level
  * is not in CEFR_LEVELS are skipped. A group that carries more than one
  * distinct level is reported as a conflict and left out of the index.
  *
  * @param lang Language whose CEFR source file is loaded.
  * @returns The index of unambiguous entries and the list of conflicts, both
  *   in order of first appearance in the source file.
  * @throws If the file cannot be read, is not valid JSON, or does not contain
  *   a JSON array.
  */
 async function loadCefrSource(
  lang: SupportedLanguageCode,
 ): Promise<{ index: CefrIndex; conflicts: ConflictEntry[] }> {
  const filepath = path.join(PATHS.cefrDir, `${lang}.json`);
  const raw = await fs.readFile(filepath, "utf-8");
  const parsed: unknown = JSON.parse(raw);
  if (!Array.isArray(parsed)) {
    throw new Error(`Expected a JSON array of CEFR entries in ${filepath}`);
  }

  type Group = {
    word: string;
    pos: SupportedPos;
    level: string; // first level seen; the only one when there is no conflict
    levels: Set<string>;
    example?: string;
  };

  // Single pass: collect every usable entry into its word|pos group.
  const groups = new Map<string, Group>();
  for (const item of parsed as unknown[]) {
    if (typeof item !== "object" || item === null) continue;
    const entry = item as Partial<Record<keyof CefrSourceEntry, unknown>>;
    const rawWord = entry.word;
    const rawPos = entry.pos;
    const level = entry.cefr_level;
    // The file is unvalidated JSON: skip malformed entries instead of
    // crashing on `.toLowerCase()` of a missing or non-string field.
    if (
      typeof rawWord !== "string" ||
      typeof rawPos !== "string" ||
      typeof level !== "string"
    ) {
      continue;
    }
    const pos = POS_NORMALIZE[rawPos.toLowerCase().trim()];
    // The typeof check also rejects inherited object members (for example the
    // label "constructor") in case the table is an ordinary object.
    if (typeof pos !== "string") continue;
    if (!CEFR_LEVELS.has(level)) continue;

    const word = rawWord.toLowerCase().trim();
    const key = `${word}|${pos}`;
    let group = groups.get(key);
    if (!group) {
      group = { word, pos, level, levels: new Set<string>() };
      groups.set(key, group);
    }
    group.levels.add(level);
    // Keep the most recent non-empty example. A later duplicate that has no
    // example must not erase one supplied by an earlier duplicate.
    const example = entry.example_sentence_native;
    if (typeof example === "string" && example !== "") {
      group.example = example;
    }
  }

  // Split the groups into conflicts and index entries. Working from the group
  // itself (not from a re-split key) keeps words containing "|" intact.
  const index: CefrIndex = new Map();
  const conflicts: ConflictEntry[] = [];
  for (const [key, group] of groups) {
    if (group.levels.size > 1) {
      conflicts.push({
        word: group.word,
        pos: group.pos,
        language: lang,
        levels: [...group.levels],
      });
      continue;
    }
    index.set(key, {
      level: group.level,
      ...(group.example !== undefined ? { example: group.example } : {}),
    });
  }
  return { index, conflicts };
 }
 // ── Annotation ────────────────────────────────────────────────────────────────
 /**
  * Builds the annotated copy of one OMW record for `lang`.
  *
  * All OMW examples are re-tagged with source "omw". Each translation of the
  * record in `lang` is then looked up in `index` by lowercased/trimmed word plus
  * the record's POS; every hit adds a CEFR vote for that word and, when the
  * index entry has one, a native example tagged with source "cefr".
  *
  * @returns The annotated record and the number of translations that matched.
  */
 function annotateSynset(
  record: OmwRecord,
  lang: SupportedLanguageCode,
  index: CefrIndex,
 ): { annotated: AnnotatedRecord; hits: number } {
  const examples: AnnotatedRecord["examples"] = {};
  const votes: AnnotatedRecord["votes"] = {};
  // Property order here is the key order of the serialized JSON.
  const annotated: AnnotatedRecord = {
    source_id: record.source_id,
    pos: record.pos,
    translations: record.translations,
    glosses: record.glosses,
    examples,
    votes,
  };

  for (const [code, texts] of Object.entries(record.examples)) {
    examples[code as SupportedLanguageCode] = texts.map(
      (text): OmwExample => ({ text, source: "omw" }),
    );
  }

  let hits = 0;
  for (const word of record.translations[lang] ?? []) {
    const hit = index.get(`${word.toLowerCase().trim()}|${record.pos}`);
    if (hit === undefined) continue;
    hits += 1;

    // Votes are keyed by the translation exactly as spelled in the record.
    const langVotes = votes[lang] ?? (votes[lang] = {});
    langVotes[word] = { cefr_source: hit.level };

    if (hit.example) {
      const langExamples = examples[lang] ?? (examples[lang] = []);
      langExamples.push({ text: hit.example, source: "cefr" });
    }
  }
  return { annotated, hits };
 }

 /**
  * Runs the annotate stage end to end:
  *  1. reads the OMW extract from `PATHS.omw`;
  *  2. loads the CEFR source of every supported language, one after another;
  *  3. writes all detected level conflicts to `conflicts.json`;
  *  4. for each language, annotates every OMW record against that language's
  *     CEFR index and writes the result to `<lang>.json` in `PATHS.outputDir`.
  *
  * Progress is reported on stdout. Any read, parse or write failure rejects
  * the returned promise.
  */
 async function annotate(): Promise<void> {
  console.log("Reading OMW extract...");
  const omwJson = await fs.readFile(PATHS.omw, "utf-8");
  const synsets = JSON.parse(omwJson) as OmwRecord[];
  console.log(`  Loaded ${synsets.length.toLocaleString()} synsets`);

  console.log("\nLoading CEFR source files...");
  const indexByLang = new Map<SupportedLanguageCode, CefrIndex>();
  const conflictLog: ConflictEntry[] = [];
  for (const lang of SUPPORTED_LANGUAGE_CODES) {
    const loaded = await loadCefrSource(lang);
    indexByLang.set(lang, loaded.index);
    conflictLog.push(...loaded.conflicts);
    console.log(
      `  ${lang}: ${loaded.index.size.toLocaleString()} entries, ${loaded.conflicts.length} conflicts`,
    );
  }

  const conflictsFile = path.join(PATHS.outputDir, "conflicts.json");
  await fs.mkdir(PATHS.outputDir, { recursive: true });
  await fs.writeFile(
    conflictsFile,
    JSON.stringify(conflictLog, null, 2),
    "utf-8",
  );
  console.log(
    `\nWrote ${conflictLog.length} conflicts → ${PATHS.outputDir}/conflicts.json`,
  );

  console.log("\nAnnotating...");
  for (const lang of SUPPORTED_LANGUAGE_CODES) {
    const index = indexByLang.get(lang)!;
    const output: AnnotatedRecord[] = [];
    let matchTotal = 0;
    for (const synset of synsets) {
      const { annotated, hits } = annotateSynset(synset, lang, index);
      matchTotal += hits;
      output.push(annotated);
    }

    const outputFile = path.join(PATHS.outputDir, `${lang}.json`);
    await fs.writeFile(outputFile, JSON.stringify(output, null, 2), "utf-8");
    console.log(
      `  ${lang}: ${matchTotal.toLocaleString()} matches → ${outputFile}`,
    );
  }
 }
 // ── Main ─────────────────────────────────────────────────────────────────────
 // Script entry point: run the stage; on failure print the error and exit with status 1.
 annotate().then(undefined, (failure: unknown) => {
  console.error(failure);
  process.exit(1);
 });