feat(pipeline): add annotate stage

- write annotate.ts — matches CEFR source files against OMW translations
- match by word text + normalized POS
- add cefr_source vote to matched translations
- extract native example sentences from CEFR source files
- write one annotated JSON per language to stage-2-annotate/output/
- write conflicts.json for words with multiple CEFR levels
- update tsconfig to support all stage directories
- 2 German conflicts found (macht, bleiche)
- match counts: en 47k, fr 44k, de 26k, it 26k, es 26k
This commit is contained in:
lila 2026-04-21 12:01:56 +02:00
parent 9ea35568e5
commit 214a597e99

View file

@ -0,0 +1,227 @@
import fs from "node:fs/promises";
import path from "node:path";
import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
// ── Types ────────────────────────────────────────────────────────────────────
/** Example sentence carried over from the OMW extract; tagged with its origin. */
type OmwExample = { text: string; source: "omw" };
/** Example sentence taken from a CEFR source entry; tagged with its origin. */
type CefrExample = { text: string; source: "cefr" };
/** Any example sentence in the annotated output, discriminated by `source`. */
type Example = OmwExample | CefrExample;
/**
 * One record (synset) of the stage-1 OMW extract, as read from `PATHS.omw`.
 * Each per-language map may omit languages.
 */
type OmwRecord = {
source_id: string;
pos: SupportedPos;
// Words for this record, keyed by language; these are what get matched against CEFR entries.
translations: Partial<Record<SupportedLanguageCode, string[]>>;
glosses: Partial<Record<SupportedLanguageCode, string[]>>;
// Plain example strings, keyed by language; wrapped as `source: "omw"` examples during annotation.
examples: Partial<Record<SupportedLanguageCode, string[]>>;
};
/**
 * Output record of the annotate stage: the OMW record's fields with typed
 * examples plus CEFR votes.
 */
type AnnotatedRecord = {
source_id: string;
pos: SupportedPos;
translations: Partial<Record<SupportedLanguageCode, string[]>>;
glosses: Partial<Record<SupportedLanguageCode, string[]>>;
// Examples per language, each tagged with the source it came from ("omw" or "cefr").
examples: Partial<Record<SupportedLanguageCode, Example[]>>;
// language -> translation word -> CEFR level reported by the CEFR source file.
votes: Partial<
Record<SupportedLanguageCode, Record<string, { cefr_source: string }>>
>;
};
/**
 * One row of a per-language CEFR source file, before normalization.
 * `pos` and `cefr_level` are free-form strings here; rows whose values are not
 * recognized are skipped when the file is loaded.
 */
type CefrSourceEntry = {
word: string;
pos: string;
cefr_level: string;
// Optional example sentence; when present it is attached to matched records as a "cefr" example.
example_sentence_native?: string;
};
/**
 * A word/part-of-speech pair that one language's CEFR source lists under more
 * than one CEFR level. Written to `conflicts.json`.
 */
type ConflictEntry = {
word: string;
pos: string;
language: SupportedLanguageCode;
// Every distinct level seen for this word/pos pair.
levels: string[];
};
// ── Constants ─────────────────────────────────────────────────────────────────
/**
 * Maps the part-of-speech labels used by the CEFR source files (looked up
 * lower-cased and trimmed) to the shared `SupportedPos` values. A label with
 * no entry here yields `undefined`, which callers treat as "unsupported".
 *
 * The table is built on a prototype-less object: labels come from external
 * data files, and on a plain object literal a label such as "constructor" or
 * "__proto__" would resolve to a truthy inherited `Object.prototype` member
 * instead of `undefined`.
 */
const POS_NORMALIZE: Record<string, SupportedPos> = (() => {
  // Kept as a typed literal so every value is still checked against SupportedPos.
  const aliases: Record<string, SupportedPos> = {
    noun: "noun",
    n: "noun",
    nom: "noun", // French
    verb: "verb",
    verbs: "verb",
    v: "verb",
    v1: "verb",
    adjective: "adjective",
    adjektiv: "adjective", // German
    adj: "adjective",
    adverb: "adverb",
    adverbs: "adverb",
    adv: "adverb",
  };
  const table: Record<string, SupportedPos> = Object.create(null);
  return Object.assign(table, aliases);
})();
/** The CEFR level labels this stage recognizes, compared case-sensitively. */
const CEFR_LEVELS: Set<string> = new Set([
  "A1",
  "A2",
  "B1",
  "B2",
  "C1",
  "C2",
]);

/** Locations of this stage's input file, CEFR source directory and output directory. */
const PATHS: { omw: string; cefrDir: string; outputDir: string } = {
  omw: "stage-1-extract/output/omw.json",
  cefrDir: "stage-2-annotate/sources/cefr",
  outputDir: "stage-2-annotate/output",
};
// ── CEFR source loading ───────────────────────────────────────────────────────
/**
 * Lookup from a normalized `"word|pos"` key (word lower-cased and trimmed, pos
 * normalized) to the CEFR level and, when available, a native example sentence.
 */
type CefrIndex = Map<string, { level: string; example?: string }>;

/**
 * Validates and normalizes one raw row of a CEFR source file.
 * Returns `undefined` for rows that cannot be indexed: non-objects, rows whose
 * `word`/`pos`/`cefr_level` are not strings, unknown POS labels, and unknown
 * CEFR levels.
 */
function normalizeCefrRow(
  row: unknown,
): { word: string; pos: string; level: string; example?: string } | undefined {
  if (typeof row !== "object" || row === null) return undefined;
  const fields = row as Record<string, unknown>;
  const word = fields["word"];
  const rawPos = fields["pos"];
  const level = fields["cefr_level"];
  const example = fields["example_sentence_native"];
  if (
    typeof word !== "string" ||
    typeof rawPos !== "string" ||
    typeof level !== "string"
  ) {
    return undefined;
  }

  const alias = rawPos.toLowerCase().trim();
  // Own-property check: a label like "constructor" must not resolve to an
  // inherited Object.prototype member and pass as a valid POS.
  if (!Object.prototype.hasOwnProperty.call(POS_NORMALIZE, alias)) {
    return undefined;
  }
  const pos = POS_NORMALIZE[alias];
  if (!pos || !CEFR_LEVELS.has(level)) return undefined;

  return {
    word: word.toLowerCase().trim(),
    pos,
    level,
    // Only attach the key when there is a non-empty sentence (never `example: undefined`).
    ...(typeof example === "string" && example !== "" ? { example } : {}),
  };
}

/**
 * Loads the CEFR source file for one language and builds its lookup index.
 *
 * Rows are grouped by normalized `"word|pos"`. A group that appears with more
 * than one CEFR level is reported as a conflict and left out of the index;
 * every other group contributes one index entry. When a group has several
 * rows, the last non-empty example sentence is kept, so a later duplicate
 * without an example does not discard an earlier one.
 *
 * @param lang Language whose `<lang>.json` file is read from `PATHS.cefrDir`.
 * @returns The lookup index plus the conflicts found, in first-seen order.
 * @throws If the file cannot be read, is not valid JSON, or is not a JSON array.
 */
async function loadCefrSource(
  lang: SupportedLanguageCode,
): Promise<{ index: CefrIndex; conflicts: ConflictEntry[] }> {
  const filepath = path.join(PATHS.cefrDir, `${lang}.json`);
  const raw = await fs.readFile(filepath, "utf-8");
  const parsed: unknown = JSON.parse(raw);
  if (!Array.isArray(parsed)) {
    throw new Error(`CEFR source ${filepath} must contain a JSON array`);
  }
  const rows: unknown[] = parsed;

  // Group rows by key. Word and pos are stored with the group so they never
  // have to be recovered by splitting the key (a word may itself contain "|").
  const groups = new Map<
    string,
    {
      word: string;
      pos: string;
      level: string;
      levels: Set<string>;
      example?: string;
    }
  >();
  for (const row of rows) {
    const entry = normalizeCefrRow(row);
    if (!entry) continue;
    const key = `${entry.word}|${entry.pos}`;
    let group = groups.get(key);
    if (!group) {
      group = {
        word: entry.word,
        pos: entry.pos,
        level: entry.level,
        levels: new Set<string>(),
      };
      groups.set(key, group);
    }
    group.levels.add(entry.level);
    if (entry.example !== undefined) group.example = entry.example;
  }

  const index: CefrIndex = new Map();
  const conflicts: ConflictEntry[] = [];
  for (const [key, group] of groups) {
    if (group.levels.size > 1) {
      conflicts.push({
        word: group.word,
        pos: group.pos,
        language: lang,
        levels: [...group.levels],
      });
      continue;
    }
    index.set(key, {
      level: group.level,
      ...(group.example !== undefined ? { example: group.example } : {}),
    });
  }
  return { index, conflicts };
}
// ── Annotation ────────────────────────────────────────────────────────────────
/**
 * Runs the annotate stage end to end:
 *  1. reads the OMW extract,
 *  2. loads the CEFR index for every supported language and writes all
 *     detected conflicts to `conflicts.json`,
 *  3. for each language, writes one JSON file containing every OMW record
 *     with its examples tagged by source, a CEFR vote for each translation
 *     found in that language's index, and the CEFR example when one exists.
 *
 * Rejects if any file read, JSON parse, or write fails.
 */
async function annotate(): Promise<void> {
  console.log("Reading OMW extract...");
  const omwJson = await fs.readFile(PATHS.omw, "utf-8");
  const synsets = JSON.parse(omwJson) as OmwRecord[];
  console.log(` Loaded ${synsets.length.toLocaleString()} synsets`);

  // Load every language's CEFR index up front, one language at a time.
  console.log("\nLoading CEFR source files...");
  const indexByLang = new Map<SupportedLanguageCode, CefrIndex>();
  const conflictLog: ConflictEntry[] = [];
  for (const lang of SUPPORTED_LANGUAGE_CODES) {
    const loaded = await loadCefrSource(lang);
    indexByLang.set(lang, loaded.index);
    for (const conflict of loaded.conflicts) {
      conflictLog.push(conflict);
    }
    console.log(
      ` ${lang}: ${loaded.index.size.toLocaleString()} entries, ${loaded.conflicts.length} conflicts`,
    );
  }

  // The output directory is created here, before the first write.
  await fs.mkdir(PATHS.outputDir, { recursive: true });
  const conflictsJson = JSON.stringify(conflictLog, null, 2);
  await fs.writeFile(
    path.join(PATHS.outputDir, "conflicts.json"),
    conflictsJson,
    "utf-8",
  );
  console.log(
    `\nWrote ${conflictLog.length} conflicts → ${PATHS.outputDir}/conflicts.json`,
  );

  console.log("\nAnnotating...");
  for (const lang of SUPPORTED_LANGUAGE_CODES) {
    const cefr = indexByLang.get(lang)!;
    // Counts matched translation words, not records.
    let matched = 0;

    const records = synsets.map((synset) => {
      const annotated: AnnotatedRecord = {
        source_id: synset.source_id,
        pos: synset.pos,
        translations: synset.translations,
        glosses: synset.glosses,
        examples: {},
        votes: {},
      };

      // Wrap the plain OMW example strings, for every language present.
      for (const [exampleLang, texts] of Object.entries(synset.examples)) {
        annotated.examples[exampleLang as SupportedLanguageCode] = texts.map(
          (text) => ({ text, source: "omw" as const }),
        );
      }

      // Only the current language's translations are checked against its index.
      for (const word of synset.translations[lang] ?? []) {
        const hit = cefr.get(`${word.toLowerCase().trim()}|${synset.pos}`);
        if (hit === undefined) continue;
        matched += 1;

        // Vote container is created lazily so unmatched records keep `votes: {}`.
        const langVotes = annotated.votes[lang] ?? {};
        annotated.votes[lang] = langVotes;
        langVotes[word] = { cefr_source: hit.level };

        if (hit.example) {
          const langExamples = annotated.examples[lang] ?? [];
          annotated.examples[lang] = langExamples;
          langExamples.push({ text: hit.example, source: "cefr" as const });
        }
      }

      return annotated;
    });

    const outputFile = path.join(PATHS.outputDir, `${lang}.json`);
    await fs.writeFile(outputFile, JSON.stringify(records, null, 2), "utf-8");
    console.log(
      ` ${lang}: ${matched.toLocaleString()} matches → ${outputFile}`,
    );
  }
}
// ── Main ─────────────────────────────────────────────────────────────────────
// Script entry point: run the stage; on any failure, print the error and
// terminate the process with exit code 1.
const exitWithError = (err: unknown): never => {
  console.error(err);
  process.exit(1);
};
annotate().catch(exitWithError);