feat(pipeline): add annotate stage
- write annotate.ts — matches CEFR source files against OMW translations
- match by word text + normalized POS
- add cefr_source vote to matched translations
- extract native example sentences from CEFR source files
- write one annotated JSON per language to stage-2-annotate/output/
- write conflicts.json for words with multiple CEFR levels
- update tsconfig to support all stage directories
- 2 German conflicts found (macht, bleiche)
- match counts: en 47k, fr 44k, de 26k, it 26k, es 26k
This commit is contained in:
parent
9ea35568e5
commit
214a597e99
1 changed files with 227 additions and 0 deletions
227
data-pipeline/stage-2-annotate/scripts/annotate.ts
Normal file
227
data-pipeline/stage-2-annotate/scripts/annotate.ts
Normal file
|
|
@ -0,0 +1,227 @@
|
|||
import fs from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
|
||||
import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
|
||||
|
||||
// ── Types ────────────────────────────────────────────────────────────────────
|
||||
|
||||
/** An example sentence tagged with the data set it was taken from. */
type SourcedExample<S extends "omw" | "cefr"> = { text: string; source: S };

/** Example sentence carried over from the OMW extract. */
type OmwExample = SourcedExample<"omw">;

/** Native example sentence taken from a CEFR source file. */
type CefrExample = SourcedExample<"cefr">;

/** Any example sentence an annotated record can hold. */
type Example = OmwExample | CefrExample;
|
||||
|
||||
/**
 * Shape this script assumes for each element of the JSON array read from
 * `PATHS.omw`. The parsed JSON is cast to this type, not validated.
 */
type OmwRecord = {
  /** Identifier copied unchanged into the annotated output. */
  source_id: string;
  /** Part of speech; forms the second half of the CEFR lookup key. */
  pos: SupportedPos;
  /** Per-language word lists; these are the words matched against CEFR data. */
  translations: Partial<Record<SupportedLanguageCode, string[]>>;
  /** Per-language glosses, copied unchanged into the annotated output. */
  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
  /** Per-language example sentences as plain strings. */
  examples: Partial<Record<SupportedLanguageCode, string[]>>;
};
|
||||
|
||||
/** A single vote attached to a translation: the CEFR level from the source file. */
type CefrVote = { cefr_source: string };

/**
 * Output record: the OMW fields plus source-tagged examples and the CEFR
 * votes collected for one language.
 */
type AnnotatedRecord = {
  source_id: string;
  pos: SupportedPos;
  translations: Partial<Record<SupportedLanguageCode, string[]>>;
  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
  /** Example sentences per language, each tagged with its origin. */
  examples: Partial<Record<SupportedLanguageCode, Example[]>>;
  /** language → translation word (as written in OMW) → vote. */
  votes: Partial<Record<SupportedLanguageCode, Record<string, CefrVote>>>;
};
|
||||
|
||||
/**
 * Shape assumed for each element of a CEFR source file
 * (`<PATHS.cefrDir>/<lang>.json`). The parsed JSON is cast, not validated.
 */
type CefrSourceEntry = {
  /** Word as written in the source; lower-cased and trimmed before matching. */
  word: string;
  /** Free-form part-of-speech label; mapped through `POS_NORMALIZE`. */
  pos: string;
  /** CEFR level; entries whose level is not in `CEFR_LEVELS` are ignored. */
  cefr_level: string;
  /** Optional example sentence in the entry's own language. */
  example_sentence_native?: string;
};
|
||||
|
||||
/**
 * One word/POS pair that a CEFR source file lists under more than one level.
 * These are written to `conflicts.json` and left out of the CEFR index.
 */
type ConflictEntry = {
  /** Word in its normalized (lower-cased, trimmed) form. */
  word: string;
  /** Normalized part of speech. */
  pos: string;
  /** Language whose source file contains the conflict. */
  language: SupportedLanguageCode;
  /** Every distinct CEFR level seen for this word/POS pair. */
  levels: string[];
};
|
||||
|
||||
// ── Constants ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Maps the part-of-speech labels used by the CEFR source files onto the
 * shared POS vocabulary. Lookups are done with a lower-cased, trimmed label;
 * labels that are not listed here cause the entry to be skipped.
 */
const POS_NORMALIZE: Record<string, SupportedPos> = {
  // → noun
  noun: "noun",
  n: "noun",
  nom: "noun", // French

  // → verb
  verb: "verb",
  verbs: "verb",
  v: "verb",
  v1: "verb",

  // → adjective
  adjective: "adjective",
  adjektiv: "adjective", // German
  adj: "adjective",

  // → adverb
  adverb: "adverb",
  adverbs: "adverb",
  adv: "adverb",
};
|
||||
|
||||
/** The CEFR levels accepted from source files; membership is case-sensitive. */
const CEFR_LEVELS = new Set<string>([
  "A1",
  "A2",
  "B1",
  "B2",
  "C1",
  "C2",
]);

/**
 * Input and output locations used by this stage. All are relative paths, so
 * Node resolves them against the current working directory of the process.
 */
const PATHS = {
  // OMW extract read as input.
  omw: "stage-1-extract/output/omw.json",
  // Directory holding one `<lang>.json` CEFR source file per language.
  cefrDir: "stage-2-annotate/sources/cefr",
  // Directory receiving `conflicts.json` and one annotated file per language.
  outputDir: "stage-2-annotate/output",
};
|
||||
|
||||
// ── CEFR source loading ───────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Lookup table for one language: normalized `"word|pos"` key → CEFR level,
 * plus a native example sentence when the source file provides one.
 */
type CefrIndex = Map<string, { level: string; example?: string }>;

/**
 * Reduces one raw CEFR source entry to the normalized pieces the index needs.
 *
 * @param entry - Raw entry as parsed from the source file (unvalidated).
 * @returns The normalized entry, or `undefined` when the entry is unusable:
 *   missing/non-string `word` or `pos`, an empty word, a POS label that is
 *   not in `POS_NORMALIZE`, or a level that is not in `CEFR_LEVELS`.
 */
function normalizeCefrEntry(
  entry: CefrSourceEntry,
):
  | { key: string; word: string; pos: string; level: string; example?: string }
  | undefined {
  // The file contents are cast rather than validated, so check field types
  // here instead of crashing on `.toLowerCase()` of a missing field.
  if (
    entry == null ||
    typeof entry.word !== "string" ||
    typeof entry.pos !== "string"
  ) {
    return undefined;
  }
  if (!CEFR_LEVELS.has(entry.cefr_level)) return undefined;

  const rawPos = entry.pos.toLowerCase().trim();
  // Own-property check: a label such as "constructor" must not resolve to a
  // member inherited from Object.prototype.
  if (!Object.prototype.hasOwnProperty.call(POS_NORMALIZE, rawPos)) {
    return undefined;
  }
  const pos: string | undefined = POS_NORMALIZE[rawPos];
  if (!pos) return undefined;

  const word = entry.word.toLowerCase().trim();
  if (word === "") return undefined;

  const example = entry.example_sentence_native;
  return {
    key: `${word}|${pos}`,
    word,
    pos,
    level: entry.cefr_level,
    ...(typeof example === "string" && example !== "" ? { example } : {}),
  };
}

/**
 * Reads the CEFR source file for one language and builds its lookup index.
 *
 * A word/POS pair listed under more than one CEFR level is reported as a
 * conflict and left out of the index. When a pair is listed several times
 * under the same level, the last non-empty example sentence is kept.
 *
 * @param lang - Language whose `<PATHS.cefrDir>/<lang>.json` file is read.
 * @returns The index of unambiguous entries and the list of conflicts, both
 *   in order of first appearance in the source file.
 * @throws If the file cannot be read, is not valid JSON, or does not contain
 *   a JSON array.
 */
async function loadCefrSource(
  lang: SupportedLanguageCode,
): Promise<{ index: CefrIndex; conflicts: ConflictEntry[] }> {
  const filepath = path.join(PATHS.cefrDir, `${lang}.json`);
  const raw = await fs.readFile(filepath, "utf-8");

  const parsed: unknown = JSON.parse(raw);
  if (!Array.isArray(parsed)) {
    throw new Error(`CEFR source ${filepath} must contain a JSON array`);
  }
  const entries = parsed as CefrSourceEntry[];

  // Single pass: group usable entries by key. Word and POS are stored next to
  // the key so they never have to be recovered by splitting the key string
  // (which would break for a word that itself contains "|").
  const groups = new Map<
    string,
    {
      word: string;
      pos: string;
      level: string;
      levels: Set<string>;
      example?: string;
    }
  >();

  for (const entry of entries) {
    const normalized = normalizeCefrEntry(entry);
    if (!normalized) continue;

    let group = groups.get(normalized.key);
    if (!group) {
      group = {
        word: normalized.word,
        pos: normalized.pos,
        level: normalized.level,
        levels: new Set<string>(),
      };
      groups.set(normalized.key, group);
    }
    group.levels.add(normalized.level);
    // A duplicate without an example must not erase one seen earlier.
    if (normalized.example !== undefined) group.example = normalized.example;
  }

  const index: CefrIndex = new Map();
  const conflicts: ConflictEntry[] = [];

  for (const [key, group] of groups) {
    if (group.levels.size > 1) {
      conflicts.push({
        word: group.word,
        pos: group.pos,
        language: lang,
        levels: [...group.levels],
      });
      continue;
    }
    index.set(key, {
      level: group.level,
      ...(group.example !== undefined ? { example: group.example } : {}),
    });
  }

  return { index, conflicts };
}
|
||||
|
||||
// ── Annotation ────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Builds the annotated copy of one OMW record for a single language.
 *
 * OMW example sentences of every language are copied over tagged as "omw".
 * Each translation of `lang` that is found in `index` (matched on lower-cased,
 * trimmed word plus the record's POS) receives a CEFR vote keyed by the word
 * as written in OMW, and contributes its native example sentence, if any.
 *
 * @param record - OMW record to annotate; it is not modified.
 * @param lang - Language whose translations are matched.
 * @param index - CEFR index for `lang`.
 * @returns The annotated record and the number of matched translations.
 */
function annotateRecord(
  record: OmwRecord,
  lang: SupportedLanguageCode,
  index: CefrIndex,
): { annotated: AnnotatedRecord; matched: number } {
  const annotated: AnnotatedRecord = {
    source_id: record.source_id,
    pos: record.pos,
    translations: record.translations,
    glosses: record.glosses,
    examples: {},
    votes: {},
  };

  // Convert OMW examples to typed format. The input is unvalidated JSON, so
  // tolerate a record without an `examples` field.
  for (const [code, texts] of Object.entries(record.examples ?? {})) {
    annotated.examples[code as SupportedLanguageCode] = texts.map((text) => ({
      text,
      source: "omw" as const,
    }));
  }

  let matched = 0;
  // Keys whose example was already added: translations that differ only in
  // case or surrounding whitespace share one CEFR entry, and its example must
  // not be listed twice.
  const exampleKeys = new Set<string>();

  for (const word of record.translations?.[lang] ?? []) {
    const key = `${word.toLowerCase().trim()}|${record.pos}`;
    const cefrEntry = index.get(key);
    if (!cefrEntry) continue;

    matched++;

    // Add CEFR vote
    let langVotes = annotated.votes[lang];
    if (!langVotes) {
      langVotes = {};
      annotated.votes[lang] = langVotes;
    }
    langVotes[word] = { cefr_source: cefrEntry.level };

    // Add native example if present
    if (cefrEntry.example && !exampleKeys.has(key)) {
      exampleKeys.add(key);
      let langExamples = annotated.examples[lang];
      if (!langExamples) {
        langExamples = [];
        annotated.examples[lang] = langExamples;
      }
      langExamples.push({ text: cefrEntry.example, source: "cefr" as const });
    }
  }

  return { annotated, matched };
}

/**
 * Runs the annotate stage.
 *
 * Reads the OMW extract and the CEFR source file of every supported language,
 * writes `conflicts.json`, then writes one `<lang>.json` file per language
 * containing every OMW record annotated with that language's CEFR votes and
 * examples.
 *
 * @throws If an input file cannot be read or parsed, if the OMW extract is
 *   not a JSON array, or if an output file cannot be written.
 */
async function annotate(): Promise<void> {
  // Load OMW records
  console.log("Reading OMW extract...");
  const raw = await fs.readFile(PATHS.omw, "utf-8");
  const parsed: unknown = JSON.parse(raw);
  if (!Array.isArray(parsed)) {
    throw new Error(`OMW extract ${PATHS.omw} must contain a JSON array`);
  }
  const omwRecords = parsed as OmwRecord[];
  console.log(` Loaded ${omwRecords.length.toLocaleString()} synsets`);

  // Load CEFR sources for all languages. The files are independent, so read
  // them concurrently; Promise.all keeps the results in language order.
  console.log("\nLoading CEFR source files...");
  const loaded = await Promise.all(
    SUPPORTED_LANGUAGE_CODES.map(async (lang) => ({
      lang,
      ...(await loadCefrSource(lang)),
    })),
  );

  const cefrIndexes = new Map<SupportedLanguageCode, CefrIndex>();
  const allConflicts: ConflictEntry[] = [];
  for (const { lang, index, conflicts } of loaded) {
    cefrIndexes.set(lang, index);
    allConflicts.push(...conflicts);
    console.log(
      ` ${lang}: ${index.size.toLocaleString()} entries, ${conflicts.length} conflicts`,
    );
  }

  // Write conflicts file
  await fs.mkdir(PATHS.outputDir, { recursive: true });
  await fs.writeFile(
    path.join(PATHS.outputDir, "conflicts.json"),
    JSON.stringify(allConflicts, null, 2),
    "utf-8",
  );
  console.log(
    `\nWrote ${allConflicts.length} conflicts → ${PATHS.outputDir}/conflicts.json`,
  );

  // Annotate and write one file per language. This stays sequential so that
  // only one language's full record list is held in memory at a time.
  console.log("\nAnnotating...");
  for (const lang of SUPPORTED_LANGUAGE_CODES) {
    const index = cefrIndexes.get(lang);
    if (!index) {
      throw new Error(`No CEFR index loaded for language "${lang}"`);
    }

    const records: AnnotatedRecord[] = [];
    let matched = 0;
    for (const record of omwRecords) {
      const result = annotateRecord(record, lang, index);
      records.push(result.annotated);
      matched += result.matched;
    }

    const outputFile = path.join(PATHS.outputDir, `${lang}.json`);
    await fs.writeFile(outputFile, JSON.stringify(records, null, 2), "utf-8");
    console.log(
      ` ${lang}: ${matched.toLocaleString()} matches → ${outputFile}`,
    );
  }
}
|
||||
|
||||
// ── Main ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
/** Prints the error that aborted the stage and ends the process with exit code 1. */
function reportFatalError(err: unknown): never {
  console.error(err);
  process.exit(1);
}

// Script entry point: run the stage; any rejection is fatal.
annotate().catch(reportFatalError);
|
||||
Loading…
Add table
Add a link
Reference in a new issue