From 214a597e99da8d445ad633e71ea744120d8c784b Mon Sep 17 00:00:00 2001 From: lila Date: Tue, 21 Apr 2026 12:01:56 +0200 Subject: [PATCH] feat(pipeline): add annotate stage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - write annotate.ts — matches CEFR source files against OMW translations - match by word text + normalized POS - add cefr_source vote to matched translations - extract native example sentences from CEFR source files - write one annotated JSON per language to stage-2-annotate/output/ - write conflicts.json for words with multiple CEFR levels - update tsconfig to support all stage directories - 2 German conflicts found (macht, bleiche) - match rates: en 47k, fr 44k, de 26k, it 26k, es 26k --- .../stage-2-annotate/scripts/annotate.ts | 227 ++++++++++++++++++ 1 file changed, 227 insertions(+) create mode 100644 data-pipeline/stage-2-annotate/scripts/annotate.ts diff --git a/data-pipeline/stage-2-annotate/scripts/annotate.ts b/data-pipeline/stage-2-annotate/scripts/annotate.ts new file mode 100644 index 0000000..bb71f60 --- /dev/null +++ b/data-pipeline/stage-2-annotate/scripts/annotate.ts @@ -0,0 +1,227 @@ +import fs from "node:fs/promises"; +import path from "node:path"; +import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared"; +import type { SupportedLanguageCode, SupportedPos } from "@lila/shared"; + +// ── Types ──────────────────────────────────────────────────────────────────── + +type OmwExample = { text: string; source: "omw" }; + +type CefrExample = { text: string; source: "cefr" }; + +type Example = OmwExample | CefrExample; + +type OmwRecord = { + source_id: string; + pos: SupportedPos; + translations: Partial>; + glosses: Partial>; + examples: Partial>; +}; + +type AnnotatedRecord = { + source_id: string; + pos: SupportedPos; + translations: Partial>; + glosses: Partial>; + examples: Partial>; + votes: Partial< + Record> + >; +}; + +type CefrSourceEntry = { + word: string; + pos: string; + cefr_level: string; + example_sentence_native?: string; +}; + +type ConflictEntry = { + word: string; + pos: string; + language: SupportedLanguageCode; + levels: string[]; +}; + +// ── Constants ───────────────────────────────────────────────────────────────── + +const POS_NORMALIZE: Record = { + noun: "noun", + n: "noun", + nom: "noun", // French + verb: "verb", + verbs: "verb", + v: "verb", + v1: "verb", + adjective: "adjective", + adjektiv: "adjective", // German + adj: "adjective", + adverb: "adverb", + adverbs: "adverb", + adv: "adverb", +}; + +const CEFR_LEVELS = new Set(["A1", "A2", "B1", "B2", "C1", "C2"]); + +const PATHS = { + omw: "stage-1-extract/output/omw.json", + cefrDir: "stage-2-annotate/sources/cefr", + outputDir: "stage-2-annotate/output", +}; + +// ── CEFR source loading ─────────────────────────────────────────────────────── + +type CefrIndex = Map; + +async function loadCefrSource( + lang: SupportedLanguageCode, +): Promise<{ index: CefrIndex; conflicts: ConflictEntry[] }> { + const filepath = path.join(PATHS.cefrDir, `${lang}.json`); + const raw = await fs.readFile(filepath, "utf-8"); + const entries = JSON.parse(raw) as CefrSourceEntry[]; + + // First pass — detect conflicts. + // Structure: "word|pos" -> Set of CEFR levels seen + const seen = new Map>(); + + for (const entry of entries) { + const pos = POS_NORMALIZE[entry.pos.toLowerCase().trim()]; + if (!pos) continue; + if (!CEFR_LEVELS.has(entry.cefr_level)) continue; + + const key = `${entry.word.toLowerCase().trim()}|${pos}`; + if (!seen.has(key)) seen.set(key, new Set()); + seen.get(key)!.add(entry.cefr_level); + } + + const conflicts: ConflictEntry[] = []; + for (const [key, levels] of seen.entries()) { + if (levels.size > 1) { + const [word, pos] = key.split("|") as [string, string]; + conflicts.push({ word, pos, language: lang, levels: [...levels] }); + } + } + + // Second pass — build index, skip conflicting entries. + const conflictKeys = new Set(conflicts.map((c) => `${c.word}|${c.pos}`)); + + const index: CefrIndex = new Map(); + for (const entry of entries) { + const pos = POS_NORMALIZE[entry.pos.toLowerCase().trim()]; + if (!pos) continue; + if (!CEFR_LEVELS.has(entry.cefr_level)) continue; + + const key = `${entry.word.toLowerCase().trim()}|${pos}`; + if (conflictKeys.has(key)) continue; + + index.set(key, { + level: entry.cefr_level, + ...(entry.example_sentence_native + ? { example: entry.example_sentence_native } + : {}), + }); + } + + return { index, conflicts }; +} + +// ── Annotation ──────────────────────────────────────────────────────────────── + +async function annotate(): Promise { + // Load OMW records + console.log("Reading OMW extract..."); + const raw = await fs.readFile(PATHS.omw, "utf-8"); + const omwRecords = JSON.parse(raw) as OmwRecord[]; + console.log(` Loaded ${omwRecords.length.toLocaleString()} synsets`); + + // Load CEFR sources for all languages + console.log("\nLoading CEFR source files..."); + const cefrIndexes = new Map(); + const allConflicts: ConflictEntry[] = []; + + for (const lang of SUPPORTED_LANGUAGE_CODES) { + const { index, conflicts } = await loadCefrSource(lang); + cefrIndexes.set(lang, index); + allConflicts.push(...conflicts); + console.log( + ` ${lang}: ${index.size.toLocaleString()} entries, ${conflicts.length} conflicts`, + ); + } + + // Write conflicts file + await fs.mkdir(PATHS.outputDir, { recursive: true }); + await fs.writeFile( + path.join(PATHS.outputDir, "conflicts.json"), + JSON.stringify(allConflicts, null, 2), + "utf-8", + ); + console.log( + `\nWrote ${allConflicts.length} conflicts → ${PATHS.outputDir}/conflicts.json`, + ); + + // Annotate and write one file per language + console.log("\nAnnotating..."); + for (const lang of SUPPORTED_LANGUAGE_CODES) { + const index = cefrIndexes.get(lang)!; + const records: AnnotatedRecord[] = []; + let matched = 0; + + for (const record of omwRecords) { + const annotated: AnnotatedRecord = { + source_id: record.source_id, + pos: record.pos, + translations: record.translations, + glosses: record.glosses, + examples: {}, + votes: {}, + }; + + // Convert OMW examples to typed format + for (const [l, exList] of Object.entries(record.examples)) { + annotated.examples[l as SupportedLanguageCode] = exList.map((text) => ({ + text, + source: "omw" as const, + })); + } + + // Match translations for this language against CEFR index + const langTranslations = record.translations[lang] ?? []; + for (const word of langTranslations) { + const key = `${word.toLowerCase().trim()}|${record.pos}`; + const cefrEntry = index.get(key); + if (!cefrEntry) continue; + + matched++; + + // Add CEFR vote + if (!annotated.votes[lang]) annotated.votes[lang] = {}; + annotated.votes[lang]![word] = { cefr_source: cefrEntry.level }; + + // Add native example if present + if (cefrEntry.example) { + if (!annotated.examples[lang]) annotated.examples[lang] = []; + annotated.examples[lang]!.push({ + text: cefrEntry.example, + source: "cefr" as const, + }); + } + } + + records.push(annotated); + } + + const outputFile = path.join(PATHS.outputDir, `${lang}.json`); + await fs.writeFile(outputFile, JSON.stringify(records, null, 2), "utf-8"); + console.log( + ` ${lang}: ${matched.toLocaleString()} matches → ${outputFile}`, + ); + } +} + +// ── Main ───────────────────────────────────────────────────────────────────── + +annotate().catch((err) => { + console.error(err); + process.exit(1); +});