feat(pipeline): add annotate stage
- write annotate.ts — matches CEFR source files against OMW translations
- match by word text + normalized POS
- add cefr_source vote to matched translations
- extract native example sentences from CEFR source files
- write one annotated JSON per language to stage-2-annotate/output/
- write conflicts.json for words with multiple CEFR levels
- update tsconfig to support all stage directories
- 2 German conflicts found (macht, bleiche)
- match counts: en 47k, fr 44k, de 26k, it 26k, es 26k
This commit is contained in:
parent
9ea35568e5
commit
214a597e99
1 changed files with 227 additions and 0 deletions
227
data-pipeline/stage-2-annotate/scripts/annotate.ts
Normal file
227
data-pipeline/stage-2-annotate/scripts/annotate.ts
Normal file
|
|
@ -0,0 +1,227 @@
|
||||||
|
import fs from "node:fs/promises";
|
||||||
|
import path from "node:path";
|
||||||
|
import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
|
||||||
|
import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
|
||||||
|
|
||||||
|
// ── Types ────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/** An example sentence carried over from the OMW extract; `source` tags where it came from. */
type OmwExample = { text: string; source: "omw" };
|
||||||
|
|
||||||
|
/** An example sentence taken from a CEFR source entry (its `example_sentence_native` field). */
type CefrExample = { text: string; source: "cefr" };
|
||||||
|
|
||||||
|
/** Any example sentence in the annotated output, discriminated by its `source` tag. */
type Example = OmwExample | CefrExample;
|
||||||
|
|
||||||
|
/**
 * Shape of one record in the OMW extract that this stage reads as input.
 * Every per-language map may omit languages it has no data for.
 */
type OmwRecord = {
  /** Identifier of the record; copied unchanged into the annotated output. */
  source_id: string;
  /** Part of speech of the record; used as the second half of the CEFR lookup key. */
  pos: SupportedPos;
  /** Words per language; these are what gets matched against the CEFR index. */
  translations: { [L in SupportedLanguageCode]?: string[] };
  /** Gloss strings per language; copied unchanged into the annotated output. */
  glosses: { [L in SupportedLanguageCode]?: string[] };
  /** Plain example sentences per language, before they are tagged with a source. */
  examples: { [L in SupportedLanguageCode]?: string[] };
};
|
||||||
|
|
||||||
|
/**
 * Shape of one record in the per-language output files: the OMW record plus
 * source-tagged examples and the CEFR votes collected for its translations.
 */
type AnnotatedRecord = {
  /** Identifier copied from the OMW record. */
  source_id: string;
  /** Part of speech copied from the OMW record. */
  pos: SupportedPos;
  /** Words per language, copied from the OMW record. */
  translations: { [L in SupportedLanguageCode]?: string[] };
  /** Gloss strings per language, copied from the OMW record. */
  glosses: { [L in SupportedLanguageCode]?: string[] };
  /** Example sentences per language, each tagged with the source it came from. */
  examples: { [L in SupportedLanguageCode]?: Example[] };
  /** Per language, a map from translation word to the CEFR level its source assigned. */
  votes: {
    [L in SupportedLanguageCode]?: { [word: string]: { cefr_source: string } };
  };
};
|
||||||
|
|
||||||
|
/** One row of a per-language CEFR source file, as it appears in the JSON. */
type CefrSourceEntry = {
  /** The word as written in the source; lowercased and trimmed before matching. */
  word: string;
  /** Raw part-of-speech label; mapped to a canonical value through `POS_NORMALIZE`. */
  pos: string;
  /** CEFR level label; rows whose label is not in `CEFR_LEVELS` are ignored. */
  cefr_level: string;
  /** Optional example sentence in the entry's own language. */
  example_sentence_native?: string;
};
|
||||||
|
|
||||||
|
/**
 * A word/part-of-speech pair that one language's CEFR source lists under more
 * than one level. These are the rows written to `conflicts.json`.
 */
type ConflictEntry = {
  /** The word in its lookup form (lowercased and trimmed). */
  word: string;
  /** The normalized part of speech of the conflicting entries. */
  pos: string;
  /** Language whose CEFR source contains the conflict. */
  language: SupportedLanguageCode;
  /** Every distinct CEFR level seen for this word and part of speech. */
  levels: string[];
};
|
||||||
|
|
||||||
|
// ── Constants ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Maps the part-of-speech labels used by the CEFR source files to canonical
 * `SupportedPos` values. Keys are lowercase; callers lowercase and trim the
 * raw label before the lookup, and a label with no entry here is unsupported.
 *
 * The table is written as "canonical value -> accepted aliases" and flattened
 * into an alias -> canonical lookup object.
 */
const POS_NORMALIZE: Record<string, SupportedPos> = Object.fromEntries(
  (
    [
      ["noun", ["noun", "n", "nom" /* French */]],
      ["verb", ["verb", "verbs", "v", "v1"]],
      ["adjective", ["adjective", "adjektiv" /* German */, "adj"]],
      ["adverb", ["adverb", "adverbs", "adv"]],
    ] as Array<[SupportedPos, string[]]>
  ).flatMap(([canonical, aliases]) =>
    aliases.map((alias): [string, SupportedPos] => [alias, canonical]),
  ),
);
|
||||||
|
|
||||||
|
/**
 * The CEFR level labels accepted from source files. Membership is checked
 * with an exact string match, so the labels are case-sensitive.
 */
const CEFR_LEVELS: Set<string> = new Set<string>([
  "A1",
  "A2",
  "B1",
  "B2",
  "C1",
  "C2",
]);
|
||||||
|
|
||||||
|
/**
 * Input and output locations used by this stage.
 *
 * All paths are relative, so they resolve against the directory the script is
 * started from. Declared `as const` so the locations cannot be reassigned by
 * accident elsewhere in the module.
 */
const PATHS = {
  /** OMW extract that is read as input. */
  omw: "stage-1-extract/output/omw.json",
  /** Directory holding one `<lang>.json` CEFR source file per language. */
  cefrDir: "stage-2-annotate/sources/cefr",
  /** Directory receiving `conflicts.json` and one annotated file per language. */
  outputDir: "stage-2-annotate/output",
} as const;
|
||||||
|
|
||||||
|
// ── CEFR source loading ───────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Lookup built from one language's CEFR source file. Keys have the form
 * `"word|pos"` (lowercased, trimmed word and normalized part of speech);
 * values carry the CEFR level and, when the source provided one, an example sentence.
 */
type CefrIndex = Map<string, { level: string; example?: string }>;
|
||||||
|
|
||||||
|
/** A CEFR source row after validation and normalization. */
type NormalizedCefrEntry = {
  key: string;
  word: string;
  pos: SupportedPos;
  level: string;
  example?: string;
};

/**
 * Validates and normalizes one raw CEFR source row.
 *
 * Returns `undefined` for rows that cannot be used: non-objects, rows with a
 * missing or non-string `word`, `pos` or `cefr_level`, an empty word, a
 * part-of-speech label that `POS_NORMALIZE` does not know, or a level that is
 * not in `CEFR_LEVELS`.
 */
function normalizeCefrEntry(raw: unknown): NormalizedCefrEntry | undefined {
  // The source JSON is not schema-validated, so check the field types at
  // runtime instead of letting `.toLowerCase()` throw on a malformed row.
  if (typeof raw !== "object" || raw === null) return undefined;
  const { word, pos, cefr_level, example_sentence_native } = raw as Partial<
    Record<keyof CefrSourceEntry, unknown>
  >;
  if (
    typeof word !== "string" ||
    typeof pos !== "string" ||
    typeof cefr_level !== "string"
  ) {
    return undefined;
  }

  const label = pos.toLowerCase().trim();
  // Own-property check: a bare lookup on a plain object would also resolve
  // inherited keys such as "constructor" to a truthy value.
  if (!Object.prototype.hasOwnProperty.call(POS_NORMALIZE, label)) {
    return undefined;
  }
  const normalizedPos = POS_NORMALIZE[label];
  if (!normalizedPos) return undefined;
  if (!CEFR_LEVELS.has(cefr_level)) return undefined;

  const normalizedWord = word.toLowerCase().trim();
  if (normalizedWord === "") return undefined;

  return {
    key: `${normalizedWord}|${normalizedPos}`,
    word: normalizedWord,
    pos: normalizedPos,
    level: cefr_level,
    // Conditional spread keeps `example` absent rather than set to undefined.
    ...(typeof example_sentence_native === "string" &&
    example_sentence_native !== ""
      ? { example: example_sentence_native }
      : {}),
  };
}

/**
 * Loads the CEFR source file for one language and builds its lookup index.
 *
 * Reads `<PATHS.cefrDir>/<lang>.json`, drops unusable rows, and groups the
 * rest by `"word|pos"` key. A key that appears with more than one CEFR level
 * is reported as a conflict and left out of the index entirely. For duplicate
 * rows that agree on the level, the most recent example sentence wins, and an
 * earlier example is kept when a later duplicate has none.
 *
 * @param lang Language whose CEFR source file should be loaded.
 * @returns The lookup index plus the conflicts found in this language's file.
 * @throws If the file cannot be read, is not valid JSON, or is not a JSON array.
 */
async function loadCefrSource(
  lang: SupportedLanguageCode,
): Promise<{ index: CefrIndex; conflicts: ConflictEntry[] }> {
  const filepath = path.join(PATHS.cefrDir, `${lang}.json`);
  const raw = await fs.readFile(filepath, "utf-8");

  const parsed: unknown = JSON.parse(raw);
  if (!Array.isArray(parsed)) {
    throw new Error(
      `CEFR source file ${filepath} must contain a JSON array of entries`,
    );
  }

  // Normalize once so both passes below see exactly the same rows and keys.
  const entries: NormalizedCefrEntry[] = [];
  for (const item of parsed) {
    const normalized = normalizeCefrEntry(item);
    if (normalized) entries.push(normalized);
  }

  // First pass — collect the set of levels seen per key. Word and pos are
  // stored next to the levels so they never have to be re-parsed from the
  // key, which would break for a word that itself contains "|".
  const seen = new Map<
    string,
    { word: string; pos: SupportedPos; levels: Set<string> }
  >();
  for (const entry of entries) {
    let group = seen.get(entry.key);
    if (!group) {
      group = { word: entry.word, pos: entry.pos, levels: new Set() };
      seen.set(entry.key, group);
    }
    group.levels.add(entry.level);
  }

  const conflicts: ConflictEntry[] = [];
  const conflictKeys = new Set<string>();
  for (const [key, group] of seen) {
    if (group.levels.size > 1) {
      conflictKeys.add(key);
      conflicts.push({
        word: group.word,
        pos: group.pos,
        language: lang,
        levels: [...group.levels],
      });
    }
  }

  // Second pass — build the index, skipping every conflicting key.
  const index: CefrIndex = new Map();
  for (const entry of entries) {
    if (conflictKeys.has(entry.key)) continue;

    // Do not let a duplicate row without an example erase one seen earlier.
    const example = entry.example ?? index.get(entry.key)?.example;
    index.set(entry.key, {
      level: entry.level,
      ...(example !== undefined ? { example } : {}),
    });
  }

  return { index, conflicts };
}
|
||||||
|
|
||||||
|
// ── Annotation ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Builds the annotated form of one OMW record for a single target language.
 *
 * All OMW examples (for every language) are wrapped as `source: "omw"`.
 * Each translation in `lang` is then looked up in the CEFR index by
 * `"<lowercased, trimmed word>|<pos>"`; every hit adds a vote keyed by the
 * original word and, when the index entry has one, a `source: "cefr"` example.
 *
 * @returns The annotated record and the number of translations that matched.
 */
function annotateRecord(
  record: OmwRecord,
  lang: SupportedLanguageCode,
  index: CefrIndex,
): { annotated: AnnotatedRecord; hits: number } {
  const examples: AnnotatedRecord["examples"] = {};
  Object.entries(record.examples).forEach(([code, texts]) => {
    examples[code as SupportedLanguageCode] = texts.map(
      (text): OmwExample => ({ text, source: "omw" }),
    );
  });

  const votes: AnnotatedRecord["votes"] = {};
  let hits = 0;

  (record.translations[lang] ?? []).forEach((word) => {
    const hit = index.get(`${word.toLowerCase().trim()}|${record.pos}`);
    if (hit === undefined) return;

    hits += 1;

    // Record the CEFR vote under the word exactly as OMW spells it.
    const langVotes = votes[lang] ?? (votes[lang] = {});
    langVotes[word] = { cefr_source: hit.level };

    // Append the native example sentence when the source supplied one.
    if (hit.example) {
      const langExamples = examples[lang] ?? (examples[lang] = []);
      langExamples.push({ text: hit.example, source: "cefr" });
    }
  });

  // Property order below is the key order of the serialized JSON.
  return {
    annotated: {
      source_id: record.source_id,
      pos: record.pos,
      translations: record.translations,
      glosses: record.glosses,
      examples,
      votes,
    },
    hits,
  };
}

/**
 * Runs the annotate stage end to end:
 *  1. reads the OMW extract,
 *  2. loads the CEFR source of every supported language (one after another),
 *  3. writes all detected conflicts to `conflicts.json`,
 *  4. writes one annotated JSON file per language into the output directory.
 *
 * Progress is reported on stdout. Any read, parse or write failure rejects
 * the returned promise.
 */
async function annotate(): Promise<void> {
  // Step 1: OMW extract.
  console.log("Reading OMW extract...");
  const omwText = await fs.readFile(PATHS.omw, "utf-8");
  const omwRecords = JSON.parse(omwText) as OmwRecord[];
  console.log(` Loaded ${omwRecords.length.toLocaleString()} synsets`);

  // Step 2: CEFR indexes, collecting conflicts across all languages.
  console.log("\nLoading CEFR source files...");
  const allConflicts: ConflictEntry[] = [];
  const cefrIndexes = new Map<SupportedLanguageCode, CefrIndex>();

  for (const lang of SUPPORTED_LANGUAGE_CODES) {
    const loaded = await loadCefrSource(lang);
    cefrIndexes.set(lang, loaded.index);
    allConflicts.push(...loaded.conflicts);
    console.log(
      ` ${lang}: ${loaded.index.size.toLocaleString()} entries, ${loaded.conflicts.length} conflicts`,
    );
  }

  // Step 3: conflicts file (also makes sure the output directory exists).
  await fs.mkdir(PATHS.outputDir, { recursive: true });
  const conflictsFile = path.join(PATHS.outputDir, "conflicts.json");
  await fs.writeFile(
    conflictsFile,
    JSON.stringify(allConflicts, null, 2),
    "utf-8",
  );
  console.log(
    `\nWrote ${allConflicts.length} conflicts → ${PATHS.outputDir}/conflicts.json`,
  );

  // Step 4: one annotated file per language.
  console.log("\nAnnotating...");
  for (const lang of SUPPORTED_LANGUAGE_CODES) {
    const index = cefrIndexes.get(lang)!;
    const records: AnnotatedRecord[] = [];
    let matched = 0;

    for (const record of omwRecords) {
      const result = annotateRecord(record, lang, index);
      matched += result.hits;
      records.push(result.annotated);
    }

    const outputFile = path.join(PATHS.outputDir, `${lang}.json`);
    await fs.writeFile(outputFile, JSON.stringify(records, null, 2), "utf-8");
    console.log(
      ` ${lang}: ${matched.toLocaleString()} matches → ${outputFile}`,
    );
  }
}
|
||||||
|
|
||||||
|
// ── Main ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/** Prints the failure and terminates the process with exit status 1. */
function reportFatal(reason: unknown): never {
  console.error(reason);
  process.exit(1);
}

// Script entry point: run the stage and turn any rejection into a failing exit.
annotate().catch(reportFatal);
|
||||||
Loading…
Add table
Add a link
Reference in a new issue