diff --git a/data-pipeline/package.json b/data-pipeline/package.json index 1d27a98..a4b4523 100644 --- a/data-pipeline/package.json +++ b/data-pipeline/package.json @@ -3,7 +3,13 @@ "version": "1.0.0", "private": true, "type": "module", - "scripts": {}, + "scripts": { + "extract": "tsx scripts/extract.ts", + "annotate": "tsx scripts/annotate.ts", + "enrich": "tsx scripts/enrich.ts", + "merge": "tsx scripts/merge.ts", + "compare": "tsx scripts/compare.ts" + }, "dependencies": { "@lila/shared": "workspace:*", "better-sqlite3": "^12.9.0" diff --git a/data-pipeline/stage-2-annotate/scripts/annotate.ts b/data-pipeline/stage-2-annotate/scripts/annotate.ts deleted file mode 100644 index bb71f60..0000000 --- a/data-pipeline/stage-2-annotate/scripts/annotate.ts +++ /dev/null @@ -1,227 +0,0 @@ -import fs from "node:fs/promises"; -import path from "node:path"; -import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared"; -import type { SupportedLanguageCode, SupportedPos } from "@lila/shared"; - -// ── Types ──────────────────────────────────────────────────────────────────── - -type OmwExample = { text: string; source: "omw" }; - -type CefrExample = { text: string; source: "cefr" }; - -type Example = OmwExample | CefrExample; - -type OmwRecord = { - source_id: string; - pos: SupportedPos; - translations: Partial>; - glosses: Partial>; - examples: Partial>; -}; - -type AnnotatedRecord = { - source_id: string; - pos: SupportedPos; - translations: Partial>; - glosses: Partial>; - examples: Partial>; - votes: Partial< - Record> - >; -}; - -type CefrSourceEntry = { - word: string; - pos: string; - cefr_level: string; - example_sentence_native?: string; -}; - -type ConflictEntry = { - word: string; - pos: string; - language: SupportedLanguageCode; - levels: string[]; -}; - -// ── Constants ───────────────────────────────────────────────────────────────── - -const POS_NORMALIZE: Record = { - noun: "noun", - n: "noun", - nom: "noun", // French - verb: "verb", - verbs: "verb", - v: "verb", - v1: "verb", - adjective: "adjective", - adjektiv: "adjective", // German - adj: "adjective", - adverb: "adverb", - adverbs: "adverb", - adv: "adverb", -}; - -const CEFR_LEVELS = new Set(["A1", "A2", "B1", "B2", "C1", "C2"]); - -const PATHS = { - omw: "stage-1-extract/output/omw.json", - cefrDir: "stage-2-annotate/sources/cefr", - outputDir: "stage-2-annotate/output", -}; - -// ── CEFR source loading ─────────────────────────────────────────────────────── - -type CefrIndex = Map; - -async function loadCefrSource( - lang: SupportedLanguageCode, -): Promise<{ index: CefrIndex; conflicts: ConflictEntry[] }> { - const filepath = path.join(PATHS.cefrDir, `${lang}.json`); - const raw = await fs.readFile(filepath, "utf-8"); - const entries = JSON.parse(raw) as CefrSourceEntry[]; - - // First pass — detect conflicts. - // Structure: "word|pos" -> Set of CEFR levels seen - const seen = new Map>(); - - for (const entry of entries) { - const pos = POS_NORMALIZE[entry.pos.toLowerCase().trim()]; - if (!pos) continue; - if (!CEFR_LEVELS.has(entry.cefr_level)) continue; - - const key = `${entry.word.toLowerCase().trim()}|${pos}`; - if (!seen.has(key)) seen.set(key, new Set()); - seen.get(key)!.add(entry.cefr_level); - } - - const conflicts: ConflictEntry[] = []; - for (const [key, levels] of seen.entries()) { - if (levels.size > 1) { - const [word, pos] = key.split("|") as [string, string]; - conflicts.push({ word, pos, language: lang, levels: [...levels] }); - } - } - - // Second pass — build index, skip conflicting entries. - const conflictKeys = new Set(conflicts.map((c) => `${c.word}|${c.pos}`)); - - const index: CefrIndex = new Map(); - for (const entry of entries) { - const pos = POS_NORMALIZE[entry.pos.toLowerCase().trim()]; - if (!pos) continue; - if (!CEFR_LEVELS.has(entry.cefr_level)) continue; - - const key = `${entry.word.toLowerCase().trim()}|${pos}`; - if (conflictKeys.has(key)) continue; - - index.set(key, { - level: entry.cefr_level, - ...(entry.example_sentence_native - ? { example: entry.example_sentence_native } - : {}), - }); - } - - return { index, conflicts }; -} - -// ── Annotation ──────────────────────────────────────────────────────────────── - -async function annotate(): Promise { - // Load OMW records - console.log("Reading OMW extract..."); - const raw = await fs.readFile(PATHS.omw, "utf-8"); - const omwRecords = JSON.parse(raw) as OmwRecord[]; - console.log(` Loaded ${omwRecords.length.toLocaleString()} synsets`); - - // Load CEFR sources for all languages - console.log("\nLoading CEFR source files..."); - const cefrIndexes = new Map(); - const allConflicts: ConflictEntry[] = []; - - for (const lang of SUPPORTED_LANGUAGE_CODES) { - const { index, conflicts } = await loadCefrSource(lang); - cefrIndexes.set(lang, index); - allConflicts.push(...conflicts); - console.log( - ` ${lang}: ${index.size.toLocaleString()} entries, ${conflicts.length} conflicts`, - ); - } - - // Write conflicts file - await fs.mkdir(PATHS.outputDir, { recursive: true }); - await fs.writeFile( - path.join(PATHS.outputDir, "conflicts.json"), - JSON.stringify(allConflicts, null, 2), - "utf-8", - ); - console.log( - `\nWrote ${allConflicts.length} conflicts → ${PATHS.outputDir}/conflicts.json`, - ); - - // Annotate and write one file per language - console.log("\nAnnotating..."); - for (const lang of SUPPORTED_LANGUAGE_CODES) { - const index = cefrIndexes.get(lang)!; - const records: AnnotatedRecord[] = []; - let matched = 0; - - for (const record of omwRecords) { - const annotated: AnnotatedRecord = { - source_id: record.source_id, - pos: record.pos, - translations: record.translations, - glosses: record.glosses, - examples: {}, - votes: {}, - }; - - // Convert OMW examples to typed format - for (const [l, exList] of Object.entries(record.examples)) { - annotated.examples[l as SupportedLanguageCode] = exList.map((text) => ({ - text, - source: "omw" as const, - })); - } - - // Match translations for this language against CEFR index - const langTranslations = record.translations[lang] ?? []; - for (const word of langTranslations) { - const key = `${word.toLowerCase().trim()}|${record.pos}`; - const cefrEntry = index.get(key); - if (!cefrEntry) continue; - - matched++; - - // Add CEFR vote - if (!annotated.votes[lang]) annotated.votes[lang] = {}; - annotated.votes[lang]![word] = { cefr_source: cefrEntry.level }; - - // Add native example if present - if (cefrEntry.example) { - if (!annotated.examples[lang]) annotated.examples[lang] = []; - annotated.examples[lang]!.push({ - text: cefrEntry.example, - source: "cefr" as const, - }); - } - } - - records.push(annotated); - } - - const outputFile = path.join(PATHS.outputDir, `${lang}.json`); - await fs.writeFile(outputFile, JSON.stringify(records, null, 2), "utf-8"); - console.log( - ` ${lang}: ${matched.toLocaleString()} matches → ${outputFile}`, - ); - } -} - -// ── Main ───────────────────────────────────────────────────────────────────── - -annotate().catch((err) => { - console.error(err); - process.exit(1); -}); diff --git a/data-pipeline/test/output/sample.json b/data-pipeline/test/output/sample.json deleted file mode 100644 index 5dd774f..0000000 --- a/data-pipeline/test/output/sample.json +++ /dev/null @@ -1,4492 +0,0 @@ -[ - { - "source_id": "ili:i90862", - "pos": "noun", - "translations": { - "en": [ - "kinsman" - ], - "es": [ - "pariente" - ], - "de": [ - "Gevatter", - "Anverwandter", - "Familienmitglied", - "Verwandter", - "Familienangehöriger", - "Angehöriger", - "Verwandte" - ], - "fr": [ - "parent" - ] - }, - "glosses": { - "en": [ - "a male relative" - ], - "de": [ - "ein männlicher Verwandter" - ] - }, - "examples": { - "de": [ - { - "text": "Jedes Familienmitglied hat seine Aufgaben.", - "source": "cefr" - }, - { - "text": "Er ist ein entfernter Verwandter von mir.", - "source": "cefr" - }, - { - "text": "Alle Familienangehörigen kamen zum Treffen.", - "source": "cefr" - }, - { - "text": "Er ist ein Angehöriger der Familie.", - "source": "cefr" - } - ], - "fr": [ - { - "text": "Ses parents sont très fiers de lui.", - "source": "cefr" - } - ], - "es": [ - { - "text": "Tengo muchos parientes viviendo en esta ciudad.", - "source": "cefr" - } - ] - }, - "votes": { - "en": { - "kinsman": { - "cefr_source": "C1" - } - }, - "de": { - "Familienmitglied": { - "cefr_source": "A2" - }, - "Verwandter": { - "cefr_source": "B1" - }, - "Familienangehöriger": { - "cefr_source": "B1" - }, - "Angehöriger": { - "cefr_source": "B2" - } - }, - "fr": { - "parent": { - "cefr_source": "A1" - } - }, - "es": { - "pariente": { - "cefr_source": "A2" - } - } - }, - "_sample_bucket": "has_cefr_vote" - }, - { - "source_id": "ili:i23087", - "pos": "verb", - "translations": { - "en": [ - "teach" - ], - "it": [ - "addestrare", - "ammaestrare", - "insegnare" - ], - "es": [ - "enseñar" - ], - "fr": [ - "enseigner", - "apprendre", - "guider" - ] - }, - "glosses": { - "en": [ - "accustom gradually to some action or attitude" - ] - }, - "examples": { - "en": [ - { - "text": "The child is taught to obey her parents", - "source": "omw" - } - ], - "it": [ - { - "text": "Stiamo addestrando il nostro cane.", - "source": "cefr" - }, - { - "text": "Lei insegna italiano ai bambini.", - "source": "cefr" - } - ], - "fr": [ - { - "text": "Elle enseigne le français au lycée.", - "source": "cefr" - }, - { - "text": "J'apprends le français.", - "source": "cefr" - }, - { - "text": "Il va nous guider à travers la forêt.", - "source": "cefr" - } - ], - "es": [ - { - "text": "Ella enseña español en la universidad.", - "source": "cefr" - } - ] - }, - "votes": { - "en": { - "teach": { - "cefr_source": "A1" - } - }, - "it": { - "addestrare": { - "cefr_source": "B1" - }, - "insegnare": { - "cefr_source": "A1" - } - }, - "fr": { - "enseigner": { - "cefr_source": "A2" - }, - "apprendre": { - "cefr_source": "A1" - }, - "guider": { - "cefr_source": "A2" - } - }, - "es": { - "enseñar": { - "cefr_source": "A1" - } - } - }, - "_sample_bucket": "has_cefr_vote" - }, - { - "source_id": "ili:i26718", - "pos": "verb", - "translations": { - "en": [ - "dub", - "nickname" - ], - "it": [ - "battezzare", - "cognominare", - "doppiare", - "soprannominare" - ], - "es": [ - "apodar" - ], - "fr": [ - "surnom", - "baptiser" - ] - }, - "glosses": { - "en": [ - "give a nickname to" - ] - }, - "examples": { - "it": [ - { - "text": "Hanno deciso di battezzare il loro figlio la prossima primavera.", - "source": "cefr" - }, - { - "text": "Lo hanno soprannominato 'il Professore'.", - "source": "cefr" - } - ], - "fr": [ - { - "text": "Ils ont décidé de baptiser leur enfant Marie.", - "source": "cefr" - } - ] - }, - "votes": { - "en": { - "dub": { - "cefr_source": "B2" - } - }, - "it": { - "battezzare": { - "cefr_source": "B1" - }, - "soprannominare": { - "cefr_source": "B2" - } - }, - "fr": { - "baptiser": { - "cefr_source": "B1" - } - } - }, - "_sample_bucket": "has_cefr_vote" - }, - { - "source_id": "ili:i4448", - "pos": "adjective", - "translations": { - "en": [ - "drab", - "dreary" - ], - "es": [ - "igual", - "rutinario" - ], - "fr": [ - "morne", - "maussade", - "sombre" - ] - }, - "glosses": { - "en": [ - "lacking in liveliness or charm or surprise" - ] - }, - "examples": { - "en": [ - { - "text": "her drab personality", - "source": "omw" - }, - { - "text": "life was drab compared with the more exciting life style overseas", - "source": "omw" - }, - { - "text": "a series of dreary dinner parties", - "source": "omw" - } - ], - "fr": [ - { - "text": "Le temps était morne et pluvieux.", - "source": "cefr" - }, - { - "text": "Le temps était maussade toute la journée.", - "source": "cefr" - }, - { - "text": "La pièce était sombre sans lumière.", - "source": "cefr" - } - ], - "es": [ - { - "text": "Todos somos iguales.", - "source": "cefr" - }, - { - "text": "Su trabajo se ha vuelto muy rutinario.", - "source": "cefr" - } - ] - }, - "votes": { - "en": { - "drab": { - "cefr_source": "B2" - }, - "dreary": { - "cefr_source": "B2" - } - }, - "fr": { - "morne": { - "cefr_source": "B2" - }, - "maussade": { - "cefr_source": "B2" - }, - "sombre": { - "cefr_source": "B1" - } - }, - "es": { - "igual": { - "cefr_source": "A2" - }, - "rutinario": { - "cefr_source": "B1" - } - } - }, - "_sample_bucket": "has_cefr_vote" - }, - { - "source_id": "ili:i85845", - "pos": "noun", - "translations": { - "en": [ - "natural depression", - "depression" - ], - "it": [ - "avvallamento" - ], - "es": [ - "depresión", - "depresión natural" - ], - "fr": [ - "dépression" - ] - }, - "glosses": { - "en": [ - "a sunken or depressed geological formation" - ] - }, - "examples": { - "fr": [ - { - "text": "Elle souffre de dépression.", - "source": "cefr" - } - ], - "es": [ - { - "text": "La depresión es una enfermedad grave.", - "source": "cefr" - } - ] - }, - "votes": { - "en": { - "depression": { - "cefr_source": "B2" - } - }, - "fr": { - "dépression": { - "cefr_source": "B2" - } - }, - "es": { - "depresión": { - "cefr_source": "B1" - } - } - }, - "_sample_bucket": "has_cefr_vote" - }, - { - "source_id": "ili:i27202", - "pos": "verb", - "translations": { - "en": [ - "jump" - ], - "fr": [ - "sauter" - ] - }, - "glosses": { - "en": [ - "make a sudden physical attack on" - ] - }, - "examples": { - "en": [ - { - "text": "The muggers jumped the woman in the fur coat", - "source": "omw" - } - ], - "fr": [ - { - "text": "Le chien aime sauter par-dessus la clôture.", - "source": "cefr" - } - ] - }, - "votes": { - "en": { - "jump": { - "cefr_source": "A1" - } - }, - "fr": { - "sauter": { - "cefr_source": "A2" - } - } - }, - "_sample_bucket": "has_cefr_vote" - }, - { - "source_id": "ili:i27830", - "pos": "verb", - "translations": { - "en": [ - "run into", - "bump into", - "jar against", - "butt against", - "knock against" - ], - "it": [ - "urtare" - ], - "es": [ - "chocar", - "colisionar", - "golpearse contra", - "topar" - ], - "de": [ - "anraunzen", - "anfahren", - "anschnauzen", - "ankläffen", - "anschreien", - "anblaffen", - "anblaffen", - "anbelfern", - "anbrüllen", - "anbellen" - ] - }, - "glosses": { - "en": [ - "collide violently with an obstacle" - ], - "de": [ - "heftig mit einem Hindernis zusammenstoßen" - ] - }, - "examples": { - "en": [ - { - "text": "I ran into the telephone pole", - "source": "omw" - } - ], - "it": [ - { - "text": "Ho urtato il tavolo con il gomito.", - "source": "cefr" - } - ], - "de": [ - { - "text": "Der Bus fuhr an die Haltestelle an.", - "source": "cefr" - }, - { - "text": "Er hat mich ohne Grund angeschrien.", - "source": "cefr" - } - ], - "es": [ - { - "text": "El coche chocó contra un árbol.", - "source": "cefr" - }, - { - "text": "Me topé con un viejo amigo en la calle.", - "source": "cefr" - } - ] - }, - "votes": { - "it": { - "urtare": { - "cefr_source": "B1" - } - }, - "de": { - "anfahren": { - "cefr_source": "B1" - }, - "anschreien": { - "cefr_source": "B1" - } - }, - "es": { - "chocar": { - "cefr_source": "A2" - }, - "topar": { - "cefr_source": "B1" - } - } - }, - "_sample_bucket": "has_cefr_vote" - }, - { - "source_id": "ili:i27676", - "pos": "verb", - "translations": { - "en": [ - "fumble" - ] - }, - "glosses": { - "en": [ - "handle clumsily" - ] - }, - "examples": {}, - "votes": { - "en": { - "fumble": { - "cefr_source": "B2" - } - } - }, - "_sample_bucket": "has_cefr_vote" - }, - { - "source_id": "ili:i30768", - "pos": "verb", - "translations": { - "en": [ - "attract", - "appeal" - ], - "it": [ - "allettare", - "attirare", - "attrarre" - ], - "es": [ - "atraer" - ], - "de": [ - "anziehen", - "etwas überziehen", - "einkleiden", - "etwas überstreifen", - "bekleiden", - "hineinschlüpfen", - "überstülpen", - "ankleiden", - "Kleidung anlegen" - ], - "fr": [ - "allécher", - "attirer" - ] - }, - "glosses": { - "en": [ - "be attractive to" - ], - "de": [ - "ein Kleidungsstück in der dafür vorgesehenen Weise auf den Körper bringen" - ] - }, - "examples": { - "en": [ - { - "text": "The idea of a vacation appeals to me", - "source": "omw" - }, - { - "text": "The beautiful garden attracted many people", - "source": "omw" - } - ], - "de": [ - { - "text": "Sie zog sich das Kleid an.", - "source": "omw" - } - ], - "it": [ - { - "text": "Il nuovo negozio attira molti clienti.", - "source": "cefr" - }, - { - "text": "Il magnete attrae il metallo.", - "source": "cefr" - } - ], - "fr": [ - { - "text": "La promesse d'un salaire élevé a alléché de nombreux candidats.", - "source": "cefr" - }, - { - "text": "Cette publicité attire l'attention.", - "source": "cefr" - } - ], - "es": [ - { - "text": "El imán atrae el metal.", - "source": "cefr" - } - ] - }, - "votes": { - "en": { - "attract": { - "cefr_source": "B1" - } - }, - "it": { - "attirare": { - "cefr_source": "B2" - }, - "attrarre": { - "cefr_source": "B1" - } - }, - "de": { - "anziehen": { - "cefr_source": "A2" - }, - "bekleiden": { - "cefr_source": "B2" - } - }, - "fr": { - "allécher": { - "cefr_source": "C1" - }, - "attirer": { - "cefr_source": "B1" - } - }, - "es": { - "atraer": { - "cefr_source": "B2" - } - } - }, - "_sample_bucket": "has_cefr_vote" - }, - { - "source_id": "ili:i112909", - "pos": "noun", - "translations": { - "en": [ - "regulation" - ], - "es": [ - "reglamento" - ], - "fr": [ - "réglementation", - "gouvernement", - "tenue" - ] - }, - "glosses": { - "en": [ - "the state of being controlled or governed" - ] - }, - "examples": { - "fr": [ - { - "text": "La nouvelle réglementation est très stricte.", - "source": "cefr" - }, - { - "text": "Le gouvernement a annoncé de nouvelles mesures.", - "source": "cefr" - }, - { - "text": "Elle a choisi une tenue élégante pour la soirée.", - "source": "cefr" - } - ], - "es": [ - { - "text": "Debemos seguir el reglamento.", - "source": "cefr" - } - ] - }, - "votes": { - "en": { - "regulation": { - "cefr_source": "B2" - } - }, - "fr": { - "réglementation": { - "cefr_source": "B2" - }, - "gouvernement": { - "cefr_source": "B1" - }, - "tenue": { - "cefr_source": "B1" - } - }, - "es": { - "reglamento": { - "cefr_source": "B2" - } - } - }, - "_sample_bucket": "has_cefr_vote" - }, - { - "source_id": "ili:i46846", - "pos": "noun", - "translations": { - "en": [ - "ladybug", - "ladybeetle", - "lady beetle", - "ladybird", - "ladybird beetle" - ], - "it": [ - "coccinella" - ], - "fr": [ - "coccinelle" - ] - }, - "glosses": { - "en": [ - "small round bright-colored and spotted beetle that usually feeds on aphids and other insect pests" - ] - }, - "examples": { - "fr": [ - { - "text": "Une coccinelle s'est posée sur ma main.", - "source": "cefr" - } - ] - }, - "votes": { - "en": { - "ladybug": { - "cefr_source": "A2" - } - }, - "fr": { - "coccinelle": { - "cefr_source": "A2" - } - } - }, - "_sample_bucket": "has_cefr_vote" - }, - { - "source_id": "ili:i15517", - "pos": "adjective", - "translations": { - "en": [ - "judicial" - ], - "it": [ - "giudiziale", - "giudiziario" - ], - "es": [ - "judicial" - ], - "de": [ - "durch einen Richter", - "durch ein Gericht", - "durch den Richter", - "richterlich" - ], - "fr": [ - "judiciaire" - ] - }, - "glosses": { - "en": [ - "belonging or appropriate to the office of a judge" - ], - "de": [ - "zum Amt eines Richters gehörend oder diesem zugehörig" - ] - }, - "examples": { - "en": [ - { - "text": "judicial robes", - "source": "omw" - } - ], - "it": [ - { - "text": "Hanno avviato un'azione giudiziale.", - "source": "cefr" - }, - { - "text": "Il sistema giudiziario italiano è complesso.", - "source": "cefr" - } - ], - "de": [ - { - "text": "Es bedarf einer richterlichen Anordnung.", - "source": "cefr" - } - ], - "fr": [ - { - "text": "L'affaire est en cours de procédure judiciaire.", - "source": "cefr" - } - ], - "es": [ - { - "text": "El proceso judicial fue largo.", - "source": "cefr" - } - ] - }, - "votes": { - "en": { - "judicial": { - "cefr_source": "C1" - } - }, - "it": { - "giudiziale": { - "cefr_source": "C1" - }, - "giudiziario": { - "cefr_source": "C1" - } - }, - "de": { - "richterlich": { - "cefr_source": "C1" - } - }, - "fr": { - "judiciaire": { - "cefr_source": "B2" - } - }, - "es": { - "judicial": { - "cefr_source": "C1" - } - } - }, - "_sample_bucket": "has_cefr_vote" - }, - { - "source_id": "ili:i11095", - "pos": "adjective", - "translations": { - "en": [ - "poor" - ], - "es": [ - "pobre" - ], - "fr": [ - "pauvre" - ] - }, - "glosses": { - "en": [ - "characterized by or indicating poverty" - ] - }, - "examples": { - "en": [ - { - "text": "the country had a poor economy", - "source": "omw" - }, - { - "text": "they lived in the poor section of town", - "source": "omw" - } - ], - "fr": [ - { - "text": "Il est très pauvre.", - "source": "cefr" - } - ], - "es": [ - { - "text": "Es un hombre muy pobre.", - "source": "cefr" - } - ] - }, - "votes": { - "en": { - "poor": { - "cefr_source": "A2" - } - }, - "fr": { - "pauvre": { - "cefr_source": "A1" - } - }, - "es": { - "pobre": { - "cefr_source": "A1" - } - } - }, - "_sample_bucket": "has_cefr_vote" - }, - { - "source_id": "ili:i62321", - "pos": "noun", - "translations": { - "en": [ - "flashiness", - "garishness", - "gaudiness", - "loudness", - "brashness", - "meretriciousness", - "tawdriness", - "glitz" - ], - "it": [ - "pacchianeria", - "vistosità" - ], - "es": [ - "astracanada", - "chabacanería", - "garrulería", - "horterada", - "mal gusto", - "ordinariez", - "zafiedad" - ], - "de": [ - "Aufdringlichkeit", - "Zudringlichkeit", - "Penetranz" - ], - "fr": [ - "culot" - ] - }, - "glosses": { - "en": [ - "tasteless showiness" - ], - "de": [ - "geschmacklose Aufdringlichkeit" - ] - }, - "examples": { - "fr": [ - { - "text": "Il a eu le culot de me demander de l'argent après tout ça.", - "source": "cefr" - } - ] - }, - "votes": { - "en": { - "loudness": { - "cefr_source": "B2" - }, - "glitz": { - "cefr_source": "B2" - } - }, - "fr": { - "culot": { - "cefr_source": "B2" - } - } - }, - "_sample_bucket": "has_cefr_vote" - }, - { - "source_id": "ili:i22613", - "pos": "verb", - "translations": { - "en": [ - "scavenge", - "clean" - ], - "es": [ - "limpiar" - ], - "fr": [ - "nettoyer" - ] - }, - "glosses": { - "en": [ - "remove unwanted substances from" - ] - }, - "examples": { - "fr": [ - { - "text": "Je dois nettoyer ma chambre.", - "source": "cefr" - } - ], - "es": [ - { - "text": "Necesito limpiar mi habitación.", - "source": "cefr" - } - ] - }, - "votes": { - "en": { - "scavenge": { - "cefr_source": "B2" - } - }, - "fr": { - "nettoyer": { - "cefr_source": "A1" - } - }, - "es": { - "limpiar": { - "cefr_source": "A1" - } - } - }, - "_sample_bucket": "has_cefr_vote" - }, - { - "source_id": "ili:i4857", - "pos": "adjective", - "translations": { - "en": [ - "enthusiastic" - ], - "it": [ - "caloroso", - "entusiastico", - "fervido", - "entusiasta" - ], - "fr": [ - "courageux", - "enthousiaste" - ] - }, - "glosses": { - "en": [ - "having or showing great excitement and interest" - ] - }, - "examples": { - "en": [ - { - "text": "enthusiastic crowds filled the streets", - "source": "omw" - }, - { - "text": "an enthusiastic response", - "source": "omw" - }, - { - "text": "was enthusiastic about taking ballet lessons", - "source": "omw" - } - ], - "it": [ - { - "text": "Abbiamo ricevuto un'accoglienza molto calorosa.", - "source": "cefr" - }, - { - "text": "Ha espresso un fervido desiderio di pace.", - "source": "cefr" - }, - { - "text": "Era molto entusiasta del nuovo progetto.", - "source": "cefr" - } - ], - "fr": [ - { - "text": "C'est une personne très courageuse.", - "source": "cefr" - }, - { - "text": "Elle est très enthousiaste à l'idée de ce voyage.", - "source": "cefr" - } - ] - }, - "votes": { - "en": { - "enthusiastic": { - "cefr_source": "B1" - } - }, - "it": { - "caloroso": { - "cefr_source": "B1" - }, - "fervido": { - "cefr_source": "C1" - }, - "entusiasta": { - "cefr_source": "B1" - } - }, - "fr": { - "courageux": { - "cefr_source": "A2" - }, - "enthousiaste": { - "cefr_source": "B1" - } - } - }, - "_sample_bucket": "has_cefr_vote" - }, - { - "source_id": "ili:i104521", - "pos": "noun", - "translations": { - "en": [ - "veronica", - "speedwell" - ], - "it": [ - "veronica" - ], - "de": [ - "Allerweltsheil", - "Grundheil", - "Ehrenpreis", - "Männertreu", - "Köhlerkraut", - "Schlangenkraut" - ], - "fr": [ - "veronica", - "véronique" - ] - }, - "glosses": { - "en": [ - "any plant of the genus Veronica" - ], - "de": [ - "jede Pflanze der Gattung Veronica" - ] - }, - "examples": { - "de": [ - { - "text": "Er erhielt den Ehrenpreis für sein Lebenswerk.", - "source": "cefr" - } - ] - }, - "votes": { - "de": { - "Ehrenpreis": { - "cefr_source": "C1" - } - } - }, - "_sample_bucket": "has_cefr_vote" - }, - { - "source_id": "ili:i958", - "pos": "adjective", - "translations": { - "en": [ - "gracious" - ], - "es": [ - "amable" - ] - }, - "glosses": { - "en": [ - "disposed to bestow favors" - ] - }, - "examples": { - "en": [ - { - "text": "thanks to the gracious gods", - "source": "omw" - } - ], - "es": [ - { - "text": "Siempre es muy amable con todos.", - "source": "cefr" - } - ] - }, - "votes": { - "en": { - "gracious": { - "cefr_source": "B2" - } - }, - "es": { - "amable": { - "cefr_source": "A2" - } - } - }, - "_sample_bucket": "has_cefr_vote" - }, - { - "source_id": "ili:i109447", - "pos": "noun", - "translations": { - "en": [ - "declension" - ], - "it": [ - "declinazione" - ], - "es": [ - "declinación" - ], - "de": [ - "Deklination", - "Ortsmissweisung", - "Missweisung" - ], - "fr": [ - "déclinaison" - ] - }, - "glosses": { - "en": [ - "the inflection of nouns and pronouns and adjectives in Indo-European languages" - ], - "de": [ - "die Beugung von Substantiven, Pronomen und Adjektiven in den indogermanischen Sprachen" - ] - }, - "examples": { - "it": [ - { - "text": "La declinazione dei nomi latini può essere complessa.", - "source": "cefr" - } - ], - "fr": [ - { - "text": "En latin, les noms ont des déclinaisons.", - "source": "cefr" - } - ] - }, - "votes": { - "it": { - "declinazione": { - "cefr_source": "B2" - } - }, - "fr": { - "déclinaison": { - "cefr_source": "C1" - } - } - }, - "_sample_bucket": "has_cefr_vote" - }, - { - "source_id": "ili:i18812", - "pos": "adverb", - "translations": { - "en": [ - "fairly", - "fair", - "evenhandedly" - ], - "es": [ - "con justicia", - "imparcialmente", - "justamente" - ] - }, - "glosses": { - "en": [ - "without favoring one party, in a fair evenhanded manner" - ] - }, - "examples": { - "en": [ - { - "text": "deal fairly with one another", - "source": "omw" - } - ], - "es": [ - { - "text": "Llegó justamente a tiempo para la reunión.", - "source": "cefr" - } - ] - }, - "votes": { - "en": { - "fairly": { - "cefr_source": "B1" - } - }, - "es": { - "justamente": { - "cefr_source": "B2" - } - } - }, - "_sample_bucket": "has_cefr_vote" - }, - { - "source_id": "ili:i44747", - "pos": "noun", - "translations": { - "en": [ - "Centrocercus", - "genus Centrocercus" - ], - "es": [ - "Centrocercus", - "género Centrocercus" - ], - "fr": [ - "centrocercus" - ] - }, - "glosses": { - "en": [ - "sage grouse" - ] - }, - "examples": {}, - "votes": {}, - "_sample_bucket": "no_cefr_vote" - }, - { - "source_id": "ili:i20736", - "pos": "adverb", - "translations": { - "en": [ - "insinuatingly" - ] - }, - "glosses": { - "en": [ - "in an insinuating manner" - ] - }, - "examples": { - "en": [ - { - "text": "the art book has art to sell, insinuatingly, and for a purpose, like the American muse, which has in fact a tradition to sell, and one which doesn't exist, in painting", - "source": "omw" - } - ] - }, - "votes": {}, - "_sample_bucket": "no_cefr_vote" - }, - { - "source_id": "ili:i25017", - "pos": "verb", - "translations": { - "en": [ - "superordinate" - ] - }, - "glosses": { - "en": [ - "place in a superior order or rank" - ] - }, - "examples": { - "en": [ - { - "text": "These two notions are superordinated to a third", - "source": "omw" - } - ] - }, - "votes": {}, - "_sample_bucket": "no_cefr_vote" - }, - { - "source_id": "ili:i46616", - "pos": "noun", - "translations": { - "en": [ - "sand cat" - ], - "fr": [ - "chat de marguerite", - "chat du désert", - "chat du général marguerite", - "chat des sables" - ] - }, - "glosses": { - "en": [ - "a desert wildcat" - ] - }, - "examples": {}, - "votes": {}, - "_sample_bucket": "no_cefr_vote" - }, - { - "source_id": "ili:i83491", - "pos": "noun", - "translations": { - "en": [ - "Bangor" - ] - }, - "glosses": { - "en": [ - "a university town in northwestern Wales on the Menai Strait" - ] - }, - "examples": {}, - "votes": {}, - "_sample_bucket": "no_cefr_vote" - }, - { - "source_id": "ili:i72819", - "pos": "noun", - "translations": { - "en": [ - "Missouri" - ], - "fr": [ - "Saint Peters", - "Joplin", - "Missouri" - ] - }, - "glosses": { - "en": [ - "a dialect of the Chiwere language spoken by the Missouri" - ] - }, - "examples": {}, - "votes": {}, - "_sample_bucket": "no_cefr_vote" - }, - { - "source_id": "ili:i99797", - "pos": "noun", - "translations": { - "en": [ - "prickly poppy", - "argemone", - "white thistle", - "devil's fig" - ], - "es": [ - "argemone" - ], - "fr": [ - "argemone" - ] - }, - "glosses": { - "en": [ - "any plant of the genus Argemone having large white or yellow flowers and prickly leaves and stems and pods; chiefly of tropical America" - ] - }, - "examples": {}, - "votes": {}, - "_sample_bucket": "no_cefr_vote" - }, - { - "source_id": "ili:i90317", - "pos": "noun", - "translations": { - "en": [ - "great-uncle", - "granduncle" - ], - "it": [ - "protio", - "prozio" - ], - "es": [ - "tío abuelo" - ], - "fr": [ - "grand-oncle" - ] - }, - "glosses": { - "en": [ - "an uncle of your father or mother" - ] - }, - "examples": {}, - "votes": {}, - "_sample_bucket": "no_cefr_vote" - }, - { - "source_id": "ili:i53881", - "pos": "noun", - "translations": { - "en": [ - "flour bin" - ], - "es": [ - "frasco de harina", - "tarro de harina" - ] - }, - "glosses": { - "en": [ - "a bin for holding flour" - ] - }, - "examples": {}, - "votes": {}, - "_sample_bucket": "no_cefr_vote" - }, - { - "source_id": "ili:i58210", - "pos": "noun", - "translations": { - "en": [ - "road map" - ], - "fr": [ - "carte routière" - ] - }, - "glosses": { - "en": [ - "a map showing roads (for automobile travel)" - ] - }, - "examples": {}, - "votes": {}, - "_sample_bucket": "no_cefr_vote" - }, - { - "source_id": "ili:i82638", - "pos": "noun", - "translations": { - "en": [ - "South American country", - "South American nation" - ] - }, - "glosses": { - "en": [ - "any one of the countries occupying the South American continent" - ] - }, - "examples": {}, - "votes": {}, - "_sample_bucket": "no_cefr_vote" - }, - { - "source_id": "ili:i71111", - "pos": "noun", - "translations": { - "en": [ - "weekly" - ], - "it": [ - "ebdomadario", - "eddomadario", - "settimanale" - ], - "fr": [ - "hebdomadaire" - ] - }, - "glosses": { - "en": [ - "a periodical that is published every week (or 52 issues per year)" - ] - }, - "examples": {}, - "votes": {}, - "_sample_bucket": "no_cefr_vote" - }, - { - "source_id": "ili:i10131", - "pos": "adjective", - "translations": { - "en": [ - "embattled" - ], - "it": [ - "GAP!", - "in difficoltà" - ] - }, - "glosses": { - "en": [ - "prepared for battle" - ] - }, - "examples": { - "en": [ - { - "text": "an embattled city", - "source": "omw" - } - ] - }, - "votes": {}, - "_sample_bucket": "no_cefr_vote" - }, - { - "source_id": "ili:i108195", - "pos": "noun", - "translations": { - "en": [ - "mass unit" - ], - "es": [ - "unidad de masa" - ] - }, - "glosses": { - "en": [ - "a unit of measurement for mass" - ] - }, - "examples": {}, - "votes": {}, - "_sample_bucket": "no_cefr_vote" - }, - { - "source_id": "ili:i82225", - "pos": "noun", - "translations": { - "en": [ - "Wrangell-St. Elias National Park" - ] - }, - "glosses": { - "en": [ - "the largest national park of the United States; located in Alaska" - ] - }, - "examples": {}, - "votes": {}, - "_sample_bucket": "no_cefr_vote" - }, - { - "source_id": "ili:i47159", - "pos": "noun", - "translations": { - "en": [ - "Fenusa", - "genus-Fenusa" - ], - "es": [ - "Fenusa", - "género Fenusa" - ] - }, - "glosses": { - "en": [ - "birch leaf miner" - ] - }, - "examples": {}, - "votes": {}, - "_sample_bucket": "no_cefr_vote" - }, - { - "source_id": "ili:i106504", - "pos": "noun", - "translations": { - "en": [ - "entail" - ] - }, - "glosses": { - "en": [ - "land received by fee tail" - ] - }, - "examples": {}, - "votes": {}, - "_sample_bucket": "no_cefr_vote" - }, - { - "source_id": "ili:i46047", - "pos": "noun", - "translations": { - "en": [ - "Polynesian tattler", - "Heteroscelus incanus" - ], - "fr": [ - "heteroscelus incanus" - ] - }, - "glosses": { - "en": [ - "tattler of Pacific coastal regions" - ] - }, - "examples": {}, - "votes": {}, - "_sample_bucket": "no_cefr_vote" - }, - { - "source_id": "ili:i71598", - "pos": "noun", - "translations": { - "en": [ - "market letter" - ] - }, - "glosses": { - "en": [ - "a newsletter written by an analyst of the stock market and sold to subscribers" - ] - }, - "examples": {}, - "votes": {}, - "_sample_bucket": "no_cefr_vote" - }, - { - "source_id": "ili:i115719", - "pos": "noun", - "translations": { - "en": [ - "monosaccharide", - "monosaccharose", - "simple sugar" - ], - "it": [ - "manosio", - "monosaccaride", - "monosio", - "monoso" - ], - "es": [ - "monosacárido" - ], - "de": [ - "Monosaccharid", - "Einfachzucker" - ], - "fr": [ - "ose", - "Ose", - "monosaccharide" - ] - }, - "glosses": { - "en": [ - "a sugar (like sucrose or fructose) that does not hydrolyse to give other sugars; the simplest group of carbohydrates" - ], - "de": [ - "ein Zucker (wie Saccharose oder Fruktose), der nicht zu anderen Zuckern hydrolysiert wird; die einfachste Gruppe der Kohlenhydrate" - ] - }, - "examples": {}, - "votes": {}, - "_sample_bucket": "no_cefr_vote" - }, - { - "source_id": "ili:i74228", - "pos": "noun", - "translations": { - "en": [ - "negotiation", - "dialogue", - "talks" - ], - "it": [ - "contrattazione", - "deal", - "dialogo", - "negoziato", - "negoziazione", - "trattativa" - ], - "es": [ - "gestión", - "negociación", - "tramitación" - ], - "de": [ - "Besprechung", - "Verhandlung" - ], - "fr": [ - "dialogue", - "négociation" - ] - }, - "glosses": { - "en": [ - "a discussion intended to produce an agreement" - ], - "de": [ - "Diskussion zur Ausarbeitung eines Abkommens" - ] - }, - "examples": { - "en": [ - { - "text": "the buyout negotiation lasted several days", - "source": "omw" - }, - { - "text": "they disagreed but kept an open dialogue", - "source": "omw" - }, - { - "text": "talks between Israelis and Palestinians", - "source": "omw" - } - ], - "it": [ - { - "text": "La contrattazione collettiva è importante per i lavoratori.", - "source": "cefr" - }, - { - "text": "Abbiamo chiuso un buon deal.", - "source": "cefr" - }, - { - "text": "È importante mantenere un dialogo aperto.", - "source": "cefr" - }, - { - "text": "Il negoziato per la pace è stato lungo e difficile.", - "source": "cefr" - }, - { - "text": "Le negoziazioni per il nuovo contratto sono state lunghe e complesse.", - "source": "cefr" - }, - { - "text": "Le trattative sono in corso.", - "source": "cefr" - } - ], - "de": [ - { - "text": "Wir haben morgen eine wichtige Besprechung.", - "source": "cefr" - }, - { - "text": "Die Verhandlungen dauerten den ganzen Tag.", - "source": "cefr" - } - ], - "fr": [ - { - "text": "Le dialogue est essentiel pour résoudre les conflits.", - "source": "cefr" - }, - { - "text": "Les négociations ont été longues et difficiles.", - "source": "cefr" - } - ], - "es": [ - { - "text": "La gestión del proyecto fue excelente.", - "source": "cefr" - }, - { - "text": "Las negociaciones fueron difíciles.", - "source": "cefr" - }, - { - "text": "La tramitación de los documentos puede llevar tiempo.", - "source": "cefr" - } - ] - }, - "votes": { - "en": { - "negotiation": { - "cefr_source": "B2" - }, - "dialogue": { - "cefr_source": "B2" - } - }, - "it": { - "contrattazione": { - "cefr_source": "B2" - }, - "deal": { - "cefr_source": "B1" - }, - "dialogo": { - "cefr_source": "B1" - }, - "negoziato": { - "cefr_source": "B2" - }, - "negoziazione": { - "cefr_source": "B2" - }, - "trattativa": { - "cefr_source": "B2" - } - }, - "de": { - "Besprechung": { - "cefr_source": "B1" - }, - "Verhandlung": { - "cefr_source": "B2" - } - }, - "fr": { - "dialogue": { - "cefr_source": "B1" - }, - "négociation": { - "cefr_source": "B2" - } - }, - "es": { - "gestión": { - "cefr_source": "B2" - }, - "negociación": { - "cefr_source": "B2" - }, - "tramitación": { - "cefr_source": "B2" - } - } - }, - "_sample_bucket": "has_glosses_and_examples" - }, - { - "source_id": "ili:i408", - "pos": "adjective", - "translations": { - "en": [ - "aground" - ], - "es": [ - "encallado", - "varado" - ], - "de": [ - "aufgrund", - "dank", - "aufgrund von", - "auf Grund von", - "vermöge", - "infolge", - "auf Grund" - ] - }, - "glosses": { - "en": [ - "stuck in a place where a ship can no longer float" - ], - "de": [ - "an einer Stelle feststecken, an der ein Schiff nicht mehr schwimmen kann" - ] - }, - "examples": { - "en": [ - { - "text": "a ship aground offshore", - "source": "omw" - }, - { - "text": "a boat aground on the beach waiting for the tide to lift it", - "source": "omw" - } - ], - "es": [ - { - "text": "El barco quedó varado en la arena.", - "source": "cefr" - } - ] - }, - "votes": { - "es": { - "varado": { - "cefr_source": "B2" - } - } - }, - "_sample_bucket": "has_glosses_and_examples" - }, - { - "source_id": "ili:i41575", - "pos": "noun", - "translations": { - "en": [ - "walkout" - ] - }, - "glosses": { - "en": [ - "the act of walking out (of a meeting or organization) as a sign of protest" - ] - }, - "examples": { - "en": [ - { - "text": "there was a walkout by the Black members as the chairman rose to speak", - "source": "omw" - } - ] - }, - "votes": { - "en": { - "walkout": { - "cefr_source": "B2" - } - } - }, - "_sample_bucket": "has_glosses_and_examples" - }, - { - "source_id": "ili:i67480", - "pos": "noun", - "translations": { - "en": [ - "tasting" - ], - "fr": [ - "dégustation" - ] - }, - "glosses": { - "en": [ - "a small amount (especially of food or wine)" - ] - }, - "examples": { - "fr": [ - { - "text": "Nous avons participé à une dégustation de vins.", - "source": "cefr" - } - ] - }, - "votes": { - "en": { - "tasting": { - "cefr_source": "B1" - } - }, - "fr": { - "dégustation": { - "cefr_source": "B1" - } - } - }, - "_sample_bucket": "has_glosses_and_examples" - }, - { - "source_id": "ili:i11256", - "pos": "adjective", - "translations": { - "en": [ - "hobnailed" - ] - }, - "glosses": { - "en": [ - "marked by the wearing of heavy boots studded with hobnails" - ] - }, - "examples": { - "en": [ - { - "text": "hobnailed laborers", - "source": "omw" - } - ] - }, - "votes": {}, - "_sample_bucket": "has_glosses_and_examples" - }, - { - "source_id": "ili:i86151", - "pos": "noun", - "translations": { - "en": [ - "sediment", - "deposit" - ], - "it": [ - "deposito", - "posatura", - "sedimento" - ], - "es": [ - "depósito", - "sedimento" - ], - "de": [ - "Ablagerung", - "Sedimentation", - "Sedimentierung", - "Sedimentbildung" - ], - "fr": [ - "sédiment", - "dépôt" - ] - }, - "glosses": { - "en": [ - "matter that has been deposited by some natural process" - ], - "de": [ - "Materie, die durch einen natürlichen Prozess abgelagert wurde" - ] - }, - "examples": { - "it": [ - { - "text": "Ho lasciato i bagagli al deposito.", - "source": "cefr" - }, - { - "text": "C'era un sedimento sul fondo della bottiglia.", - "source": "cefr" - } - ], - "de": [ - { - "text": "Es gab Ablagerungen in den Rohren.", - "source": "cefr" - } - ], - "fr": [ - { - "text": "Le sédiment au fond du lac est très fin.", - "source": "cefr" - }, - { - "text": "J'ai fait un dépôt à la banque.", - "source": "cefr" - } - ], - "es": [ - { - "text": "Hice un depósito en el banco.", - "source": "cefr" - } - ] - }, - "votes": { - "en": { - "sediment": { - "cefr_source": "C1" - }, - "deposit": { - "cefr_source": "B1" - } - }, - "it": { - "deposito": { - "cefr_source": "B1" - }, - "sedimento": { - "cefr_source": "B2" - } - }, - "de": { - "Ablagerung": { - "cefr_source": "B2" - } - }, - "fr": { - "sédiment": { - "cefr_source": "B2" - }, - "dépôt": { - "cefr_source": "B1" - } - }, - "es": { - "depósito": { - "cefr_source": "B1" - } - } - }, - "_sample_bucket": "has_glosses_and_examples" - }, - { - "source_id": "ili:i45550", - "pos": "noun", - "translations": { - "en": [ - "conch" - ], - "fr": [ - "conque" - ] - }, - "glosses": { - "en": [ - "any of various edible tropical marine gastropods of the genus Strombus having a brightly-colored spiral shell with large outer lip" - ] - }, - "examples": { - "fr": [ - { - "text": "On entend la mer dans une conque.", - "source": "cefr" - } - ] - }, - "votes": { - "en": { - "conch": { - "cefr_source": "B1" - } - }, - "fr": { - "conque": { - "cefr_source": "B2" - } - } - }, - "_sample_bucket": "has_glosses_and_examples" - }, - { - "source_id": "ili:i117521", - "pos": "noun", - "translations": { - "en": [ - "moratorium" - ], - "it": [ - "moratoria" - ], - "fr": [ - "moratoire" - ] - }, - "glosses": { - "en": [ - "a legally authorized postponement before some obligation must be discharged" - ] - }, - "examples": { - "it": [ - { - "text": "Il governo ha imposto una moratoria sui nuovi progetti edilizi.", - "source": "cefr" - } - ], - "fr": [ - { - "text": "Le gouvernement a décrété un moratoire sur la pêche.", - "source": "cefr" - } - ] - }, - "votes": { - "en": { - "moratorium": { - "cefr_source": "C1" - } - }, - "it": { - "moratoria": { - "cefr_source": "C1" - } - }, - "fr": { - "moratoire": { - "cefr_source": "C1" - } - } - }, - "_sample_bucket": "has_glosses_and_examples" - }, - { - "source_id": "ili:i31764", - "pos": "verb", - "translations": { - "en": [ - "return" - ], - "fr": [ - "rendre", - "retourner", - "revenir" - ] - }, - "glosses": { - "en": [ - "return to a previous position; in mathematics" - ] - }, - "examples": { - "en": [ - { - "text": "The point returned to the interior of the figure", - "source": "omw" - } - ], - "fr": [ - { - "text": "Il doit rendre les livres à la bibliothèque.", - "source": "cefr" - }, - { - "text": "Je dois retourner ce livre à la bibliothèque.", - "source": "cefr" - }, - { - "text": "Je dois revenir demain.", - "source": "cefr" - } - ] - }, - "votes": { - "fr": { - "rendre": { - "cefr_source": "A2" - }, - "retourner": { - "cefr_source": "A2" - }, - "revenir": { - "cefr_source": "A1" - } - } - }, - "_sample_bucket": "has_glosses_and_examples" - }, - { - "source_id": "ili:i48149", - "pos": "noun", - "translations": { - "en": [ - "post horse", - "post-horse", - "poster" - ], - "it": [ - "cavallo di posta" - ], - "fr": [ - "affiche" - ] - }, - "glosses": { - "en": [ - "a horse kept at an inn or post house for use by mail carriers or for rent to travelers" - ] - }, - "examples": { - "fr": [ - { - "text": "L'affiche du concert est très colorée.", - "source": "cefr" - } - ] - }, - "votes": { - "en": { - "poster": { - "cefr_source": "A2" - } - }, - "fr": { - "affiche": { - "cefr_source": "A2" - } - } - }, - "_sample_bucket": "has_glosses_and_examples" - }, - { - "source_id": "ili:i51126", - "pos": "noun", - "translations": { - "en": [ - "brickwork" - ], - "it": [ - "GAP!", - "muratura in mattoni" - ], - "es": [ - "aparejo", - "calicanto", - "enladrillado", - "mampostería" - ], - "fr": [ - "appareil" - ] - }, - "glosses": { - "en": [ - "masonry done with bricks and mortar" - ] - }, - "examples": { - "fr": [ - { - "text": "J'ai acheté un nouvel appareil photo.", - "source": "cefr" - } - ] - }, - "votes": { - "en": { - "brickwork": { - "cefr_source": "B2" - } - }, - "fr": { - "appareil": { - "cefr_source": "B1" - } - } - }, - "_sample_bucket": "has_glosses_and_examples" - }, - { - "source_id": "ili:i17542", - "pos": "adjective", - "translations": { - "en": [ - "interdisciplinary" - ], - "it": [ - "interdisciplinare", - "multidisciplinare" - ], - "de": [ - "multidisziplinär", - "fachübergreifend", - "interdisziplinär", - "fächerübergreifend" - ], - "fr": [ - "interdisciplinaire" - ] - }, - "glosses": { - "en": [ - "drawing from or characterized by participation of two or more fields of study" - ], - "de": [ - "die Zusammenarbeit mehrerer Disziplinen betreffend\">" - ] - }, - "examples": { - "en": [ - { - "text": "interdisciplinary studies", - "source": "omw" - }, - { - "text": "an interdisciplinary conference", - "source": "omw" - } - ], - "it": [ - { - "text": "Il progetto richiede un approccio interdisciplinare.", - "source": "cefr" - } - ], - "de": [ - { - "text": "Das Projekt ist interdisziplinär angelegt.", - "source": "cefr" - } - ], - "fr": [ - { - "text": "Ce projet de recherche est résolument interdisciplinaire.", - "source": "cefr" - } - ] - }, - "votes": { - "en": { - "interdisciplinary": { - "cefr_source": "C1" - } - }, - "it": { - "interdisciplinare": { - "cefr_source": "C1" - } - }, - "de": { - "interdisziplinär": { - "cefr_source": "C1" - } - }, - "fr": { - "interdisciplinaire": { - "cefr_source": "C1" - } - } - }, - "_sample_bucket": "has_glosses_and_examples" - }, - { - "source_id": "ili:i69459", - "pos": "noun", - "translations": { - "en": [ - "new edition" - ], - "it": [ - "riedizione" - ], - "fr": [ - "new edition" - ] - }, - "glosses": { - "en": [ - "a publication (such as a book) that has been modified or updated and offered again for sale" - ] - }, - "examples": { - "it": [ - { - "text": "Il libro è stato pubblicato in una nuova riedizione.", - "source": "cefr" - } - ] - }, - "votes": { - "it": { - "riedizione": { - "cefr_source": "C1" - } - } - }, - "_sample_bucket": "has_glosses_and_examples" - }, - { - "source_id": "ili:i75841", - "pos": "noun", - "translations": { - "en": [ - "stampede" - ], - "de": [ - "Stampede", - "Herdenpanik" - ], - "fr": [ - "débandade" - ] - }, - "glosses": { - "en": [ - "a wild headlong rush of frightened animals (horses or cattle)" - ], - "de": [ - "eine wilde, kopfüber laufende Flucht von verängstigten Tieren (Pferden oder Rindern)" - ] - }, - "examples": { - "fr": [ - { - "text": "Après l'explosion, ce fut la débandade générale.", - "source": "cefr" - } - ] - }, - "votes": { - "en": { - "stampede": { - "cefr_source": "B2" - } - }, - "fr": { - "débandade": { - "cefr_source": "C1" - } - } - }, - "_sample_bucket": "has_glosses_and_examples" - }, - { - "source_id": "ili:i67108", - "pos": "noun", - "translations": { - "en": [ - "stocktaking", - "stock-taking" - ], - "it": [ - "inventario" - ], - "es": [ - "balance" - ] - }, - "glosses": { - "en": [ - "reappraisal of a situation or position or outlook" - ] - }, - "examples": { - "it": [ - { - "text": "Dobbiamo fare l'inventario del magazzino.", - "source": "cefr" - } - ], - "es": [ - { - "text": "Es importante mantener un balance entre trabajo y vida personal.", - "source": "cefr" - } - ] - }, - "votes": { - "it": { - "inventario": { - "cefr_source": "B2" - } - }, - "es": { - "balance": { - "cefr_source": "B1" - } - } - }, - "_sample_bucket": "has_glosses_and_examples" - }, - { - "source_id": "ili:i14270", - "pos": "adjective", - "translations": { - "en": [ - "cockamamie", - "cockamamy", - "goofy", - "sappy", - "silly", - "wacky", - "whacky", - "zany" - ], - "es": [ - "tonto" - ], - "de": [ - "albern", - "naiv", - "unreif", - "kindsköpfig", - "kindlich", - "kindisch", - "unentwickelt", - "kindhaft", - "pueril", - "infantil", - "puerilistisch" - ], - "fr": [ - "déraisonnable", - "fou", - "drôle", - "aberrant" - ] - }, - "glosses": { - "en": [ - "ludicrous, foolish" - ], - "de": [ - "lächerlich, töricht" - ] - }, - "examples": { - "en": [ - { - "text": "gave me a cockamamie reason for not going", - "source": "omw" - }, - { - "text": "wore a goofy hat", - "source": "omw" - }, - { - "text": "a silly idea", - "source": "omw" - }, - { - "text": "some wacky plan for selling more books", - "source": "omw" - } - ], - "de": [ - { - "text": "Hör auf, so albern zu sein!", - "source": "cefr" - }, - { - "text": "Sie ist manchmal etwas naiv.", - "source": "cefr" - }, - { - "text": "Die Früchte sind noch unreif.", - "source": "cefr" - }, - { - "text": "Sie hat eine sehr kindliche Freude.", - "source": "cefr" - }, - { - "text": "Sein Verhalten war ziemlich kindisch.", - "source": "cefr" - } - ], - "fr": [ - { - "text": "Ses exigences sont déraisonnables.", - "source": "cefr" - }, - { - "text": "C'est une idée folle.", - "source": "cefr" - }, - { - "text": "C'est une histoire drôle.", - "source": "cefr" - }, - { - "text": "Son comportement était aberrant et choquant.", - "source": "cefr" - } - ], - "es": [ - { - "text": "No seas tonto, eso no es verdad.", - "source": "cefr" - } - ] - }, - "votes": { - "en": { - "goofy": { - "cefr_source": "B1" - }, - "sappy": { - "cefr_source": "B2" - }, - "silly": { - "cefr_source": "A2" - }, - "wacky": { - "cefr_source": "B2" - }, - "zany": { - "cefr_source": "B2" - } - }, - "de": { - "albern": { - "cefr_source": "B1" - }, - "naiv": { - "cefr_source": "B1" - }, - "unreif": { - "cefr_source": "B1" - }, - "kindlich": { - "cefr_source": "B1" - }, - "kindisch": { - "cefr_source": "B1" - } - }, - "fr": { - "déraisonnable": { - "cefr_source": "B2" - }, - "fou": { - "cefr_source": "B1" - }, - "drôle": { - "cefr_source": "A2" - }, - "aberrant": { - "cefr_source": "C1" - } - }, - "es": { - "tonto": { - "cefr_source": "A2" - } - } - }, - "_sample_bucket": "has_glosses_and_examples" - }, - { - "source_id": "ili:i1291", - "pos": "adjective", - "translations": { - "en": [ - "unifacial" - ] - }, - "glosses": { - "en": [ - "having but one principal or specialized surface" - ] - }, - "examples": { - "en": [ - { - "text": "a primitive unifacial flint tool", - "source": "omw" - } - ] - }, - "votes": {}, - "_sample_bucket": "has_glosses_and_examples" - }, - { - "source_id": "ili:i73668", - "pos": "noun", - "translations": { - "en": [ - "cantata", - "oratorio" - ], - "it": [ - "cantata", - "oratorio" - ], - "es": [ - "oratorio" - ], - "de": [ - "Andachtsraum", - "Oratorium", - "Gebetsraum" - ], - "fr": [ - "oratorio", - "cantate" - ] - }, - "glosses": { - "en": [ - "a musical composition for voices and orchestra based on a religious text" - ], - "de": [ - "eine musikalische Komposition für Stimmen und Orchester auf der Grundlage eines religiösen Textes" - ] - }, - "examples": { - "it": [ - { - "text": "I bambini giocano nell'oratorio della chiesa.", - "source": "cefr" - } - ], - "de": [ - { - "text": "Händels \"Messiah\" ist ein berühmtes Oratorium.", - "source": "cefr" - } - ], - "es": [ - { - "text": "El oratorio de la iglesia es un lugar de paz y reflexión.", - "source": "cefr" - } - ] - }, - "votes": { - "it": { - "oratorio": { - "cefr_source": "B1" - } - }, - "de": { - "Oratorium": { - "cefr_source": "C1" - } - }, - "es": { - "oratorio": { - "cefr_source": "C1" - } - } - }, - "_sample_bucket": "has_glosses_and_examples" - }, - { - "source_id": "ili:i39774", - "pos": "noun", - "translations": { - "en": [ - "respiration" - ], - "es": [ - "respiración" - ] - }, - "glosses": { - "en": [ - "a single complete act of breathing in and out" - ] - }, - "examples": { - "en": [ - { - "text": "thirty respirations per minute", - "source": "omw" - } - ], - "es": [ - { - "text": "Su respiración era lenta y profunda.", - "source": "cefr" - } - ] - }, - "votes": { - "en": { - "respiration": { - "cefr_source": "B2" - } - }, - "es": { - "respiración": { - "cefr_source": "B1" - } - } - }, - "_sample_bucket": "has_glosses_and_examples" - }, - { - "source_id": "ili:i28838", - "pos": "verb", - "translations": { - "en": [ - "unplug", - "disconnect" - ], - "fr": [ - "débrancher" - ] - }, - "glosses": { - "en": [ - "pull the plug of (electrical appliances) and render inoperable" - ] - }, - "examples": { - "en": [ - { - "text": "unplug the hair dryer after using it", - "source": "omw" - } - ], - "fr": [ - { - "text": "N'oubliez pas de débrancher l'appareil après utilisation.", - "source": "cefr" - } - ] - }, - "votes": { - "en": { - "unplug": { - "cefr_source": "A2" - } - }, - "fr": { - "débrancher": { - "cefr_source": "A2" - } - } - }, - "_sample_bucket": "has_glosses_and_examples" - }, - { - "source_id": "ili:i85884", - "pos": "noun", - "translations": { - "en": [ - "North Sea" - ], - "es": [ - "Mar del Norte" - ], - "de": [ - "Nordsee", - "Deutsches Meer" - ], - "fr": [ - "mer du Nord", - "Mer du Nord" - ] - }, - "glosses": { - "en": [ - "an arm of the North Atlantic between the British Isles and Scandinavia; oil was discovered under the North Sea in 1970" - ], - "de": [ - "ein Arm des Nordatlantiks zwischen den Britischen Inseln und Skandinavien; 1970 wurde unter der Nordsee Öl entdeckt" - ] - }, - "examples": { - "de": [ - { - "text": "Wir fahren im Sommer an die Nordsee.", - "source": "cefr" - } - ] - }, - "votes": { - "de": { - "Nordsee": { - "cefr_source": "A2" - } - } - }, - "_sample_bucket": "no_glosses_no_examples" - }, - { - "source_id": "ili:i57058", - "pos": "noun", - "translations": { - "en": [ - "patriarchal cross" - ], - "es": [ - "cruz patriarcal" - ], - "de": [ - "Erzbischofskreuz", - "Spanisches Kreuz", - "Ungarisches Kreuz", - "Patriarchenkreuz", - "Patriarchenhochkreuz" - ] - }, - "glosses": { - "en": [ - "a cross with two crossbars" - ], - "de": [ - "ein Kreuz mit zwei Querbalken" - ] - }, - "examples": {}, - "votes": {}, - "_sample_bucket": "no_glosses_no_examples" - }, - { - "source_id": "ili:i14067", - "pos": "adjective", - "translations": { - "en": [ - "maximizing", - "maximising" - ], - "fr": [ - "maximaliste" - ] - }, - "glosses": { - "en": [ - "making as great as possible" - ] - }, - "examples": {}, - "votes": {}, - "_sample_bucket": "no_glosses_no_examples" - }, - { - "source_id": "ili:i57206", - "pos": "noun", - "translations": { - "en": [ - "photocathode" - ], - "es": [ - "fotocátodo" - ], - "de": [ - "Photokathode", - "Fotokathode" - ], - "fr": [ - "photocathode" - ] - }, - "glosses": { - "en": [ - "a cathode that emits electrons when illuminated" - ], - "de": [ - "eine Kathode, die bei Beleuchtung Elektronen abgibt" - ] - }, - "examples": {}, - "votes": {}, - "_sample_bucket": "no_glosses_no_examples" - }, - { - "source_id": "ili:i97025", - "pos": "noun", - "translations": { - "en": [ - "Stockton", - "Frank Stockton", - "Francis Richard Stockton" - ], - "es": [ - "Francis Richard Stockton", - "Frank Stockton", - "Stockton" - ], - "fr": [ - "Stockton" - ] - }, - "glosses": { - "en": [ - "United States writer (1834-1902)" - ] - }, - "examples": {}, - "votes": {}, - "_sample_bucket": "no_glosses_no_examples" - }, - { - "source_id": "ili:i101248", - "pos": "noun", - "translations": { - "en": [ - "obeche" - ] - }, - "glosses": { - "en": [ - "the wood of an African obeche tree; used especially for veneering" - ] - }, - "examples": {}, - "votes": {}, - "_sample_bucket": "no_glosses_no_examples" - }, - { - "source_id": "ili:i94985", - "pos": "noun", - "translations": { - "en": [ - "Eames", - "Charles Eames" - ], - "es": [ - "Charles Eames" - ] - }, - "glosses": { - "en": [ - "United States designer noted for an innovative series of chairs (1907-1978)" - ] - }, - "examples": {}, - "votes": {}, - "_sample_bucket": "no_glosses_no_examples" - }, - { - "source_id": "ili:i16699", - "pos": "adjective", - "translations": { - "en": [ - "mensural", - "measured", - "mensurable" - ], - "es": [ - "mensural" - ] - }, - "glosses": { - "en": [ - "having notes of fixed rhythmic value" - ] - }, - "examples": {}, - "votes": {}, - "_sample_bucket": "no_glosses_no_examples" - }, - { - "source_id": "ili:i99999", - "pos": "noun", - "translations": { - "en": [ - "China aster", - "Callistephus chinensis" - ], - "fr": [ - "callistephus chinensis" - ] - }, - "glosses": { - "en": [ - "valued for their beautiful flowers in a wide range of clear bright colors; grown primarily for cutting" - ] - }, - "examples": {}, - "votes": {}, - "_sample_bucket": "no_glosses_no_examples" - }, - { - "source_id": "ili:i75135", - "pos": "noun", - "translations": { - "en": [ - "kiss of death" - ], - "fr": [ - "baiser de la mort" - ] - }, - "glosses": { - "en": [ - "something that is ruinous" - ] - }, - "examples": { - "en": [ - { - "text": "if this were known it would be the kiss of death for my political career", - "source": "omw" - } - ] - }, - "votes": {}, - "_sample_bucket": "no_glosses_no_examples" - }, - { - "source_id": "ili:i36428", - "pos": "noun", - "translations": { - "en": [ - "dark adaptation" - ] - }, - "glosses": { - "en": [ - "the process of adjusting the eyes to low levels of illumination; cones adapt first; rods continue to adapt for up to four hours" - ] - }, - "examples": {}, - "votes": {}, - "_sample_bucket": "no_glosses_no_examples" - }, - { - "source_id": "ili:i103092", - "pos": "noun", - "translations": { - "en": [ - "saw palmetto", - "scrub palmetto", - "Serenoa repens" - ] - }, - "glosses": { - "en": [ - "small hardy clump-forming spiny palm of southern United States" - ] - }, - "examples": {}, - "votes": {}, - "_sample_bucket": "no_glosses_no_examples" - }, - { - "source_id": "ili:i14834", - "pos": "adjective", - "translations": { - "en": [ - "zoic" - ] - }, - "glosses": { - "en": [ - "pertaining to animals or animal life or action" - ] - }, - "examples": {}, - "votes": {}, - "_sample_bucket": "no_glosses_no_examples" - }, - { - "source_id": "ili:i25953", - "pos": "verb", - "translations": { - "en": [ - "blog" - ], - "es": [ - "blogear" - ] - }, - "glosses": { - "en": [ - "read, write, or edit a shared on-line journal" - ] - }, - "examples": {}, - "votes": {}, - "_sample_bucket": "no_glosses_no_examples" - }, - { - "source_id": "ili:i24441", - "pos": "verb", - "translations": { - "en": [ - "ream" - ], - "es": [ - "taladrar" - ] - }, - "glosses": { - "en": [ - "enlarge with a reamer" - ] - }, - "examples": { - "en": [ - { - "text": "ream a hole", - "source": "omw" - } - ] - }, - "votes": {}, - "_sample_bucket": "no_glosses_no_examples" - }, - { - "source_id": "ili:i60874", - "pos": "noun", - "translations": { - "en": [ - "virtual memory", - "virtual storage" - ], - "it": [ - "memoria virtuale" - ], - "es": [ - "memoria virtual" - ], - "fr": [ - "mémoire virtuelle" - ] - }, - "glosses": { - "en": [ - "(computer science) memory created by using the hard disk to simulate additional random-access memory; the addressable storage space available to the user of a computer system in which virtual addresses are mapped into real addresses" - ] - }, - "examples": {}, - "votes": {}, - "_sample_bucket": "no_glosses_no_examples" - }, - { - "source_id": "ili:i105979", - "pos": "noun", - "translations": { - "en": [ - "Dryopteris", - "genus Dryopteris" - ], - "fr": [ - "Dryopteris", - "dryopteris" - ] - }, - "glosses": { - "en": [ - "large widespread genus of medium-sized terrestrial ferns; in some classification systems placed in Polypodiaceae" - ] - }, - "examples": {}, - "votes": {}, - "_sample_bucket": "no_glosses_no_examples" - }, - { - "source_id": "ili:i44411", - "pos": "noun", - "translations": { - "en": [ - "blue racer", - "Coluber constrictor flaviventris" - ], - "fr": [ - "coluber constrictor" - ] - }, - "glosses": { - "en": [ - "bluish-green blacksnake found from Ohio to Texas" - ] - }, - "examples": {}, - "votes": {}, - "_sample_bucket": "no_glosses_no_examples" - }, - { - "source_id": "ili:i14592", - "pos": "adjective", - "translations": { - "en": [ - "anagrammatic", - "anagrammatical" - ], - "it": [ - "anagrammatico" - ] - }, - "glosses": { - "en": [ - "related to anagrams or containing or making an anagram" - ] - }, - "examples": {}, - "votes": {}, - "_sample_bucket": "no_glosses_no_examples" - }, - { - "source_id": "ili:i5174", - "pos": "adjective", - "translations": { - "en": [ - "protrusile", - "protrusible" - ], - "fr": [ - "protrusible" - ] - }, - "glosses": { - "en": [ - "capable of being thrust forward, as the tongue" - ] - }, - "examples": {}, - "votes": {}, - "_sample_bucket": "no_glosses_no_examples" - }, - { - "source_id": "ili:i99278", - "pos": "noun", - "translations": { - "en": [ - "pink calla", - "Zantedeschia rehmanii" - ] - }, - "glosses": { - "en": [ - "calla having a rose-colored spathe" - ] - }, - "examples": {}, - "votes": {}, - "_sample_bucket": "pos_spread" - }, - { - "source_id": "ili:i97983", - "pos": "noun", - "translations": { - "en": [ - "phosphorescence" - ], - "it": [ - "fosforescenza", - "fotoluminescenza" - ], - "fr": [ - "phosphorescence" - ] - }, - "glosses": { - "en": [ - "a fluorescence that persists after the bombarding radiation has ceased" - ] - }, - "examples": {}, - "votes": {}, - "_sample_bucket": "pos_spread" - }, - { - "source_id": "ili:i54194", - "pos": "noun", - "translations": { - "en": [ - "garrison cap", - "overseas cap" - ] - }, - "glosses": { - "en": [ - "a wedge-shaped wool or cotton cap; worn as part of a uniform" - ] - }, - "examples": {}, - "votes": {}, - "_sample_bucket": "pos_spread" - }, - { - "source_id": "ili:i102972", - "pos": "noun", - "translations": { - "en": [ - "Tipuana", - "genus Tipuana" - ], - "fr": [ - "tipuana" - ] - }, - "glosses": { - "en": [ - "one species: South American tree: tipu tree" - ] - }, - "examples": {}, - "votes": {}, - "_sample_bucket": "pos_spread" - }, - { - "source_id": "ili:i55386", - "pos": "noun", - "translations": { - "en": [ - "king" - ], - "fr": [ - "roi" - ] - }, - "glosses": { - "en": [ - "a checker that has been moved to the opponent's first row where it is promoted to a piece that is free to move either forward or backward" - ] - }, - "examples": { - "fr": [ - { - "text": "Le roi a visité la ville.", - "source": "cefr" - } - ] - }, - "votes": { - "en": { - "king": { - "cefr_source": "A2" - } - }, - "fr": { - "roi": { - "cefr_source": "B1" - } - } - }, - "_sample_bucket": "pos_spread" - }, - { - "source_id": "ili:i26482", - "pos": "verb", - "translations": { - "en": [ - "articulate", - "enunciate", - "vocalize", - "vocalise" - ], - "it": [ - "articolare", - "enunciare", - "enunziare", - "scandire" - ], - "es": [ - "articular" - ], - "de": [ - "ausdrücken", - "artikulieren" - ], - "fr": [ - "articuler", - "exprimer", - "énoncer", - "formuler", - "vocaliser" - ] - }, - "glosses": { - "en": [ - "express or state clearly" - ], - "de": [ - "klar ausdrücken oder erklären" - ] - }, - "examples": { - "it": [ - { - "text": "È importante articolare bene le parole.", - "source": "cefr" - } - ], - "de": [ - { - "text": "Er konnte seine Gefühle nicht ausdrücken.", - "source": "cefr" - }, - { - "text": "Er konnte seine Gedanken nicht klar artikulieren.", - "source": "cefr" - } - ], - "fr": [ - { - "text": "Il faut bien articuler pour être compris.", - "source": "cefr" - }, - { - "text": "Il est difficile d'exprimer ses sentiments.", - "source": "cefr" - }, - { - "text": "Le professeur a énoncé les règles clairement.", - "source": "cefr" - }, - { - "text": "Il a formulé une question très pertinente.", - "source": "cefr" - } - ], - "es": [ - { - "text": "Es importante articular bien las palabras al hablar en público.", - "source": "cefr" - } - ] - }, - "votes": { - "en": { - "articulate": { - "cefr_source": "B2" - } - }, - "it": { - "articolare": { - "cefr_source": "B2" - } - }, - "de": { - "ausdrücken": { - "cefr_source": "B1" - }, - "artikulieren": { - "cefr_source": "B2" - } - }, - "fr": { - "articuler": { - "cefr_source": "B1" - }, - "exprimer": { - "cefr_source": "B1" - }, - "énoncer": { - "cefr_source": "B2" - }, - "formuler": { - "cefr_source": "B2" - } - }, - "es": { - "articular": { - "cefr_source": "B2" - } - } - }, - "_sample_bucket": "pos_spread" - }, - { - "source_id": "ili:i22492", - "pos": "verb", - "translations": { - "en": [ - "spike" - ] - }, - "glosses": { - "en": [ - "manifest a sharp increase" - ] - }, - "examples": { - "en": [ - { - "text": "the voltage spiked", - "source": "omw" - } - ] - }, - "votes": {}, - "_sample_bucket": "pos_spread" - }, - { - "source_id": "ili:i26383", - "pos": "verb", - "translations": { - "en": [ - "redefine" - ], - "fr": [ - "redéfinir" - ] - }, - "glosses": { - "en": [ - "give a new or different definition of (a word)" - ] - }, - "examples": { - "fr": [ - { - "text": "Il est temps de redéfinir nos objectifs.", - "source": "cefr" - } - ] - }, - "votes": { - "en": { - "redefine": { - "cefr_source": "B2" - } - }, - "fr": { - "redéfinir": { - "cefr_source": "B2" - } - } - }, - "_sample_bucket": "pos_spread" - }, - { - "source_id": "ili:i22943", - "pos": "verb", - "translations": { - "en": [ - "slake", - "abate", - "slack" - ], - "es": [ - "aflojar", - "reducir" - ], - "fr": [ - "descendre", - "cesser", - "réduire", - "ralentir", - "amoindrir", - "diminuer", - "supprimer" - ] - }, - "glosses": { - "en": [ - "make less active or intense" - ] - }, - "examples": { - "fr": [ - { - "text": "Nous allons descendre au rez-de-chaussée.", - "source": "cefr" - }, - { - "text": "La pluie a cessé de tomber.", - "source": "cefr" - }, - { - "text": "Nous devons réduire nos dépenses.", - "source": "cefr" - }, - { - "text": "Il faut ralentir avant le virage.", - "source": "cefr" - }, - { - "text": "Ces mesures visent à amoindrir l'impact de la crise.", - "source": "cefr" - }, - { - "text": "Les prix ont commencé à diminuer.", - "source": "cefr" - }, - { - "text": "Il faut supprimer les fichiers inutiles.", - "source": "cefr" - } - ], - "es": [ - { - "text": "Tienes que aflojar el nudo.", - "source": "cefr" - }, - { - "text": "Necesitamos reducir el consumo de energía.", - "source": "cefr" - } - ] - }, - "votes": { - "en": { - "abate": { - "cefr_source": "C1" - } - }, - "fr": { - "descendre": { - "cefr_source": "A2" - }, - "cesser": { - "cefr_source": "B1" - }, - "réduire": { - "cefr_source": "B1" - }, - "ralentir": { - "cefr_source": "B1" - }, - "amoindrir": { - "cefr_source": "C1" - }, - "diminuer": { - "cefr_source": "B1" - }, - "supprimer": { - "cefr_source": "B2" - } - }, - "es": { - "aflojar": { - "cefr_source": "B1" - }, - "reducir": { - "cefr_source": "B1" - } - } - }, - "_sample_bucket": "pos_spread" - }, - { - "source_id": "ili:i31348", - "pos": "verb", - "translations": { - "en": [ - "romp" - ] - }, - "glosses": { - "en": [ - "run easily and fairly fast" - ] - }, - "examples": {}, - "votes": { - "en": { - "romp": { - "cefr_source": "B2" - } - } - }, - "_sample_bucket": "pos_spread" - }, - { - "source_id": "ili:i10413", - "pos": "adjective", - "translations": { - "en": [ - "imprudent" - ], - "it": [ - "imprudente", - "incauto" - ], - "es": [ - "imprudente", - "insensato" - ], - "fr": [ - "imprudent" - ] - }, - "glosses": { - "en": [ - "not prudent or wise" - ] - }, - "examples": { - "en": [ - { - "text": "very imprudent of her mother to encourage her in such silly romantic ideas", - "source": "omw" - }, - { - "text": "\"would be imprudent for a noneconomist to talk about the details of economic policy\"- A.M.Schlesinger", - "source": "omw" - } - ], - "it": [ - { - "text": "È stato imprudente guidare così velocemente.", - "source": "cefr" - } - ], - "fr": [ - { - "text": "C'était imprudent de traverser sans regarder.", - "source": "cefr" - } - ], - "es": [ - { - "text": "Fue una decisión imprudente conducir tan rápido.", - "source": "cefr" - }, - { - "text": "Fue una decisión insensata.", - "source": "cefr" - } - ] - }, - "votes": { - "it": { - "imprudente": { - "cefr_source": "B2" - } - }, - "fr": { - "imprudent": { - "cefr_source": "B2" - } - }, - "es": { - "imprudente": { - "cefr_source": "B2" - }, - "insensato": { - "cefr_source": "B2" - } - } - }, - "_sample_bucket": "pos_spread" - }, - { - "source_id": "ili:i8645", - "pos": "adjective", - "translations": { - "en": [ - "metaphysical" - ], - "es": [ - "metafísico" - ], - "fr": [ - "métaphysique" - ] - }, - "glosses": { - "en": [ - "without material form or substance" - ] - }, - "examples": { - "en": [ - { - "text": "metaphysical forces", - "source": "omw" - } - ] - }, - "votes": { - "en": { - "metaphysical": { - "cefr_source": "C1" - } - } - }, - "_sample_bucket": "pos_spread" - }, - { - "source_id": "ili:i6969", - "pos": "adjective", - "translations": { - "en": [ - "all-important", - "all important", - "crucial", - "essential", - "of the essence" - ], - "it": [ - "essenziale" - ], - "es": [ - "crucial", - "esencial" - ], - "de": [ - "bedeutsam", - "wesentlich", - "wichtig", - "prägnant", - "hauptsächlich", - "gehaltvoll", - "aussagekräftig", - "signifikant" - ], - "fr": [ - "essentiel" - ] - }, - "glosses": { - "en": [ - "of the greatest importance" - ], - "de": [ - "von allergrößter Bedeutung" - ] - }, - "examples": { - "en": [ - { - "text": "the all-important subject of disarmament", - "source": "omw" - }, - { - "text": "crucial information", - "source": "omw" - }, - { - "text": "in chess cool nerves are of the essence", - "source": "omw" - } - ], - "it": [ - { - "text": "L'acqua è essenziale per la vita.", - "source": "cefr" - } - ], - "de": [ - { - "text": "Das war ein bedeutsamer Moment in der Geschichte.", - "source": "cefr" - }, - { - "text": "Das ist ein wesentlicher Unterschied.", - "source": "cefr" - }, - { - "text": "Das ist eine wichtige Information.", - "source": "cefr" - }, - { - "text": "Er formulierte seine Gedanken sehr prägnant.", - "source": "cefr" - }, - { - "text": "Die Studie lieferte aussagekräftige Ergebnisse.", - "source": "cefr" - }, - { - "text": "Es gab eine signifikante Veränderung.", - "source": "cefr" - } - ], - "fr": [ - { - "text": "C'est essentiel de bien manger pour rester en forme.", - "source": "cefr" - } - ], - "es": [ - { - "text": "Es crucial que lleguemos a tiempo.", - "source": "cefr" - }, - { - "text": "El agua es esencial para la vida.", - "source": "cefr" - } - ] - }, - "votes": { - "en": { - "crucial": { - "cefr_source": "B2" - }, - "essential": { - "cefr_source": "B1" - } - }, - "it": { - "essenziale": { - "cefr_source": "B1" - } - }, - "de": { - "bedeutsam": { - "cefr_source": "B2" - }, - "wesentlich": { - "cefr_source": "B1" - }, - "wichtig": { - "cefr_source": "A1" - }, - "prägnant": { - "cefr_source": "B2" - }, - "aussagekräftig": { - "cefr_source": "B2" - }, - "signifikant": { - "cefr_source": "C1" - } - }, - "fr": { - "essentiel": { - "cefr_source": "B1" - } - }, - "es": { - "crucial": { - "cefr_source": "B2" - }, - "esencial": { - "cefr_source": "B1" - } - } - }, - "_sample_bucket": "pos_spread" - }, - { - "source_id": "ili:i13690", - "pos": "adjective", - "translations": { - "en": [ - "round-arm" - ] - }, - "glosses": { - "en": [ - "with the arm swung round at shoulder height" - ] - }, - "examples": { - "en": [ - { - "text": "round-arm bowling", - "source": "omw" - } - ] - }, - "votes": {}, - "_sample_bucket": "pos_spread" - }, - { - "source_id": "ili:i16993", - "pos": "adjective", - "translations": { - "en": [ - "Monacan", - "Monegasque" - ], - "it": [ - "monegasco" - ], - "fr": [ - "monégasque" - ] - }, - "glosses": { - "en": [ - "of or relating to or characteristic of Monaco or its people" - ] - }, - "examples": { - "fr": [ - { - "text": "Il est de nationalité monégasque.", - "source": "cefr" - } - ] - }, - "votes": { - "fr": { - "monégasque": { - "cefr_source": "B1" - } - } - }, - "_sample_bucket": "pos_spread" - }, - { - "source_id": "ili:i18824", - "pos": "adverb", - "translations": { - "en": [ - "here", - "hither" - ], - "it": [ - "qua", - "qui" - ], - "fr": [ - "ici", - "çà", - "par ici" - ] - }, - "glosses": { - "en": [ - "to this place (especially toward the speaker)" - ] - }, - "examples": { - "en": [ - { - "text": "come here, please", - "source": "omw" - } - ], - "it": [ - { - "text": "Vieni qua, per favore.", - "source": "cefr" - }, - { - "text": "Vieni qui!", - "source": "cefr" - } - ], - "fr": [ - { - "text": "Venez ici !", - "source": "cefr" - } - ] - }, - "votes": { - "en": { - "here": { - "cefr_source": "A1" - }, - "hither": { - "cefr_source": "C2" - } - }, - "it": { - "qua": { - "cefr_source": "A1" - }, - "qui": { - "cefr_source": "A1" - } - }, - "fr": { - "ici": { - "cefr_source": "A1" - } - } - }, - "_sample_bucket": "pos_spread" - }, - { - "source_id": "ili:i19641", - "pos": "adverb", - "translations": { - "en": [ - "head-on" - ], - "es": [ - "de frente" - ] - }, - "glosses": { - "en": [ - "with the front foremost" - ] - }, - "examples": { - "en": [ - { - "text": "the cars collided head-on", - "source": "omw" - } - ] - }, - "votes": {}, - "_sample_bucket": "pos_spread" - }, - { - "source_id": "ili:i21417", - "pos": "adverb", - "translations": { - "en": [ - "sweepingly" - ] - }, - "glosses": { - "en": [ - "in a sweeping manner" - ] - }, - "examples": { - "en": [ - { - "text": "he sweepingly condemned the entire population of the country for the war crimes", - "source": "omw" - } - ] - }, - "votes": {}, - "_sample_bucket": "pos_spread" - }, - { - "source_id": "ili:i20131", - "pos": "adverb", - "translations": { - "en": [ - "gallantly", - "chivalrously" - ], - "it": [ - "galantemente" - ], - "fr": [ - "chevaleresquement" - ] - }, - "glosses": { - "en": [ - "in a gallant manner" - ] - }, - "examples": { - "en": [ - { - "text": "he gallantly offered to take her home", - "source": "omw" - } - ] - }, - "votes": {}, - "_sample_bucket": "pos_spread" - }, - { - "source_id": "ili:i20516", - "pos": "adverb", - "translations": { - "en": [ - "fractiously" - ] - }, - "glosses": { - "en": [ - "in a fractious manner" - ] - }, - "examples": { - "en": [ - { - "text": "the horse was behaving fractiously and refused to jump", - "source": "omw" - } - ] - }, - "votes": {}, - "_sample_bucket": "pos_spread" - } -] \ No newline at end of file diff --git a/data-pipeline/test/scripts/sample.ts b/data-pipeline/test/scripts/sample.ts deleted file mode 100644 index 63ead71..0000000 --- a/data-pipeline/test/scripts/sample.ts +++ /dev/null @@ -1,205 +0,0 @@ -import fs from "node:fs/promises"; -import path from "node:path"; -import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared"; -import type { SupportedLanguageCode, SupportedPos } from "@lila/shared"; - -// ── Types ───────────────────────────────────────────────────────────────────── - -type Example = { text: string; source: "omw" | "cefr" }; - -type AnnotatedRecord = { - source_id: string; - pos: SupportedPos; - translations: Partial>; - glosses: Partial>; - examples: Partial>; - votes: Partial< - Record> - >; -}; - -type SampleRecord = AnnotatedRecord & { _sample_bucket: string }; - -// ── Constants ───────────────────────────────────────────────────────────────── - -const PATHS = { - annotatedDir: "stage-2-annotate/output", - output: "test/output/sample.json", -}; - -const BUCKET_SIZE = 20; - -// ── Bucket predicates ───────────────────────────────────────────────────────── - -type Bucket = { name: string; predicate: (record: AnnotatedRecord) => boolean }; - -const BUCKETS: Bucket[] = [ - { - name: "has_cefr_vote", - predicate: (r) => - Object.values(r.votes).some( - (langVotes) => Object.keys(langVotes ?? {}).length > 0, - ), - }, - { - name: "no_cefr_vote", - predicate: (r) => - Object.values(r.votes).every( - (langVotes) => Object.keys(langVotes ?? {}).length === 0, - ), - }, - { - name: "has_glosses_and_examples", - predicate: (r) => - Object.keys(r.glosses).length > 0 && Object.keys(r.examples).length > 0, - }, - { - name: "no_glosses_no_examples", - predicate: (r) => - !r.glosses["fr"] && - !r.examples["fr"] && - !r.votes["fr"] && - !r.glosses["es"] && - !r.examples["es"] && - !r.votes["es"], - }, - { - name: "pos_spread", - predicate: () => true, // sampled separately to ensure POS coverage - }, -]; - -// ── Sampling ────────────────────────────────────────────────────────────────── - -function sampleBucket( - records: AnnotatedRecord[], - predicate: (r: AnnotatedRecord) => boolean, - size: number, - exclude: Set, -): AnnotatedRecord[] { - const candidates = records.filter( - (r) => !exclude.has(r.source_id) && predicate(r), - ); - - // Shuffle for random sampling - for (let i = candidates.length - 1; i > 0; i--) { - const j = Math.floor(Math.random() * (i + 1)); - [candidates[i], candidates[j]] = [candidates[j]!, candidates[i]!]; - } - - return candidates.slice(0, size); -} - -function samplePosBucket( - records: AnnotatedRecord[], - exclude: Set, -): AnnotatedRecord[] { - const posList: SupportedPos[] = ["noun", "verb", "adjective", "adverb"]; - const perPos = Math.floor(BUCKET_SIZE / posList.length); - const result: AnnotatedRecord[] = []; - - for (const pos of posList) { - const sampled = sampleBucket( - records, - (r) => r.pos === pos, - perPos, - exclude, - ); - result.push(...sampled); - } - - return result; -} - -// ── Loading ─────────────────────────────────────────────────────────────────── - -async function loadAnnotated(): Promise { - // Load all language files and merge votes into a single record set. - // Use en.json as the base record structure since it has the most complete - // glosses and examples. Votes from all other languages are merged in. - const baseRaw = await fs.readFile( - path.join(PATHS.annotatedDir, "en.json"), - "utf-8", - ); - const base = JSON.parse(baseRaw) as AnnotatedRecord[]; - - // Build a map for fast lookup by source_id - const byId = new Map(); - for (const record of base) { - byId.set(record.source_id, record); - } - - // Merge votes from remaining language files - for (const lang of SUPPORTED_LANGUAGE_CODES) { - if (lang === "en") continue; - const raw = await fs.readFile( - path.join(PATHS.annotatedDir, `${lang}.json`), - "utf-8", - ); - const records = JSON.parse(raw) as AnnotatedRecord[]; - - for (const record of records) { - const base = byId.get(record.source_id); - if (!base) continue; - - // Merge votes - for (const [l, langVotes] of Object.entries(record.votes)) { - if (!base.votes[l as SupportedLanguageCode]) { - base.votes[l as SupportedLanguageCode] = {}; - } - Object.assign(base.votes[l as SupportedLanguageCode]!, langVotes); - } - - // Merge examples from CEFR source files not in base - for (const [l, examples] of Object.entries(record.examples)) { - const lang = l as SupportedLanguageCode; - if (!base.examples[lang]) { - base.examples[lang] = examples as Example[]; - } - } - } - } - - return [...byId.values()]; -} - -// ── Main ───────────────────────────────────────────────────────────────────── - -async function main(): Promise { - console.log("Loading annotated files..."); - const records = await loadAnnotated(); - console.log(` Loaded ${records.length.toLocaleString()} synsets`); - - const sampled: SampleRecord[] = []; - const seen = new Set(); - - // Sample each bucket except pos_spread - for (const bucket of BUCKETS.filter((b) => b.name !== "pos_spread")) { - const results = sampleBucket(records, bucket.predicate, BUCKET_SIZE, seen); - for (const r of results) { - seen.add(r.source_id); - sampled.push({ ...r, _sample_bucket: bucket.name }); - } - console.log(` ${bucket.name}: ${results.length} records`); - } - - // Sample pos_spread bucket - const posResults = samplePosBucket(records, seen); - for (const r of posResults) { - seen.add(r.source_id); - sampled.push({ ...r, _sample_bucket: "pos_spread" }); - } - console.log(` pos_spread: ${posResults.length} records`); - - console.log(`\nTotal sampled: ${sampled.length} records`); - - // Write output - await fs.mkdir(path.dirname(PATHS.output), { recursive: true }); - await fs.writeFile(PATHS.output, JSON.stringify(sampled, null, 2), "utf-8"); - console.log(`Wrote sample → ${PATHS.output}`); -} - -main().catch((err) => { - console.error(err); - process.exit(1); -}); diff --git a/data-pipeline/tsconfig.json b/data-pipeline/tsconfig.json index 83c3053..19bf9bb 100644 --- a/data-pipeline/tsconfig.json +++ b/data-pipeline/tsconfig.json @@ -4,9 +4,8 @@ "module": "NodeNext", "moduleResolution": "NodeNext", "outDir": "dist", - "rootDir": ".", - "types": ["node"], + "rootDir": "scripts", }, "references": [{ "path": "../packages/shared" }], - "include": ["./**/*"], + "include": ["scripts/**/*"], } diff --git a/documentation/llm-setup.md b/documentation/llm-setup.md deleted file mode 100644 index 6cc1f91..0000000 --- a/documentation/llm-setup.md +++ /dev/null @@ -1,295 +0,0 @@ -# LLM Setup — lila pipeline - -This document covers the LLM infrastructure for stage 3 (enrich) of the lila -data pipeline. It documents the hardware constraints, supported providers, -model recommendations, and how to configure and swap providers in the test -and production scripts. - ---- - -## Hardware (dev machine) - -| Component | Spec | -|---|---| -| CPU | Intel Core i7-6500U (2 cores / 4 threads @ 3.10 GHz) | -| RAM | 8 GB | -| GPU | NVIDIA GeForce GTX 950M — 4 GB VRAM (Maxwell, CUDA compute 5.0) | -| OS | Debian GNU/Linux 13 (trixie) x86_64 | - -**Local inference verdict:** viable for small/quantized models, not for -production runs. See the [Local inference](#local-inference-llamacpp) section -for details. - ---- - -## Provider overview - -The enrich script uses a single, swappable provider config. All providers -except Anthropic expose an OpenAI-compatible API, so the same client code -works across all of them — only `baseURL`, `apiKey`, and `model` change. - -| Provider | Use case | Cost | Rate limits | -|---|---|---|---| -| llama.cpp (local) | Quality testing, overnight dev runs | Free (electricity) | None | -| OpenRouter (free tier) | Quality comparison, multi-model evaluation | Free | 50 req/day, 20 req/min | -| OpenRouter (paid) | Production runs if local quality insufficient | Pay-per-token | None | -| Anthropic API | Quality baseline / reference | Pay-per-token | Standard | - ---- - -## Local inference (llama.cpp) - -### Why local inference is worth testing - -Time is not a constraint — the pipeline scripts are fully resumable. The -laptop can run overnight for multiple nights. The only question is output -quality, which the test script evaluates empirically. - -### Hardware constraints - -The GTX 950M has 4 GB VRAM and Maxwell architecture (CUDA compute 5.0). -llama.cpp supports Maxwell via CUDA backend but newer builds may require -the `--cuda-no-kv-offload` flag depending on the version. - -llama.cpp splits model layers between GPU and CPU automatically via -`--n-gpu-layers`. You set how many layers go on the GPU; the rest run on -CPU/RAM. This means a model larger than VRAM is not a dead end — it runs -in hybrid mode, slower than full-GPU but much faster than pure CPU. - -Practical estimates for this hardware (~3.5 GB VRAM usable after drivers): - -| Model size | Q4 VRAM | Mode | Est. speed | -|---|---|---|---| -| 3B | ~2.0 GB | Full GPU | ~15–20 tok/s | -| 4B | ~2.5 GB | Full GPU | ~12–18 tok/s | -| 7B | ~4.5 GB | Hybrid (~26/32 layers on GPU) | ~8–12 tok/s | -| 13B+ | ~8 GB+ | CPU-heavy hybrid | too slow | - -### Recommended local models - -Two candidates worth testing, covering different points on the size/quality -tradeoff: - -**Gemma 4 E4B Instruct (Q4 / UD-Q4_K_XL)** -- GGUF file: `gemma-4-E4B-it-UD-Q4_K_XL.gguf` (~2.5 GB) -- Source: https://huggingface.co/unsloth/gemma-4-E4B-it-GGUF -- Runs fully on GPU. Brand new (April 2025), built for edge hardware, 140+ - language support including all five pipeline languages. First candidate - to test. - -**Qwen2.5 7B Instruct (Q4_K_M)** -- GGUF file: `Qwen2.5-7B-Instruct-Q4_K_M.gguf` (~4.5 GB) -- Source: https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GGUF -- Runs in hybrid mode (~26 of 32 layers on GPU, rest on CPU), ~8–12 tok/s. - Stronger multilingual generation than any 3–4B model. Second candidate, - for comparison against the smaller Gemma 4 E4B. - -### Installation - -```bash -# Install build dependencies -sudo apt install build-essential cmake git - -# Clone llama.cpp -git clone https://github.com/ggerganov/llama.cpp -cd llama.cpp - -# Build with CUDA support (GTX 950M — compute 5.0) -cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=50 -cmake --build build --config Release -j$(nproc) - -# Download model (example — adjust path as needed) -mkdir -p models -wget -O models/qwen2.5-3b-instruct-q4_k_m.gguf \ - https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-GGUF/resolve/main/qwen2.5-3b-instruct-q4_k_m.gguf -``` - -### Starting the server - -**Gemma 4 E4B** (full GPU): -```bash -./build/bin/llama-server \ - --model models/gemma-4-e4b-it-ud-q4_k_xl.gguf \ - --port 8080 \ - --ctx-size 4096 \ - --n-gpu-layers 999 \ - --host 127.0.0.1 -``` - -**Qwen2.5 7B** (hybrid — tune `--n-gpu-layers` to fit your VRAM): -```bash -./build/bin/llama-server \ - --model models/qwen2.5-7b-instruct-q4_k_m.gguf \ - --port 8080 \ - --ctx-size 4096 \ - --n-gpu-layers 28 \ - --host 127.0.0.1 -``` - -`--n-gpu-layers 999` means "put everything on GPU" — llama.cpp caps at the -actual layer count automatically, so 999 is safe as a "full offload" value. -For the 7B hybrid, start with `28` and reduce by 2 if the server reports -out-of-memory at startup. - -### Verify the server is running - -```bash -curl http://127.0.0.1:8080/health -# Expected: {"status":"ok"} -``` - ---- - -## OpenRouter (free tier) - -OpenRouter exposes all models via an OpenAI-compatible API. No code changes -are needed to switch from local llama.cpp to OpenRouter — only the config -object changes. - -### Rate limits (free tier) - -- **50 requests per day** (account total, not per model) -- 20 requests per minute - -> **Implication for testing:** with a 10-record test set you have headroom -> to test 4–5 models per day. With a 100-record test set, plan one model per -> day. - -> **Implication for production:** the free tier is not viable for 117k -> records. If local quality is insufficient, use paid OpenRouter credits or -> a dedicated provider. - -### Free models recommended for this pipeline - -Ranked by expected multilingual generation quality for en/it/de/fr/es: - -| Model ID | Params | Notes | -|---|---|---| -| `qwen/qwen3-coder:free` | 480B MoE (35B active) | Best free option. Strong multilingual despite "coder" label. Use as quality ceiling. | -| `qwen/qwen3-next-80b-a3b-instruct:free` | 80B MoE (3B active) | Smaller Qwen, useful comparison point. | -| `nvidia/nemotron-3-super-120b-a12b:free` | 120B MoE (12B active) | 262K context, supports structured output. | -| `google/gemma-4-31b-it:free` | 31B | 140+ language support, good European language coverage. | -| `zhipuai/glm-4.5-air:free` | MoE | Multilingual-focused. | - -**Skip for this pipeline:** -- Llama models — weaker European language generation than Qwen/Gemma -- Mistral free tier — requests may be used for model training - -### API endpoint - -``` -https://openrouter.ai/api/v1/chat/completions -``` - -Set `Authorization: Bearer ` in the request headers. - ---- - -## Provider configuration in the test script - -The enrich test script reads a single config object. To switch providers, -change this object and re-run. - -```typescript -// config.ts - -export type ProviderConfig = { - name: string; // used for output folder naming - baseURL: string; - apiKey: string; - model: string; - maxTokens: number; -}; - -// Local llama.cpp -export const LOCAL_QWEN3B: ProviderConfig = { - name: "local-qwen2.5-3b", - baseURL: "http://127.0.0.1:8080/v1", - apiKey: "none", // llama.cpp ignores this - model: "qwen2.5-3b", // llama.cpp ignores model name, uses loaded model - maxTokens: 512, -}; - -// OpenRouter — Qwen3 480B (free) -export const OR_QWEN3_480B: ProviderConfig = { - name: "or-qwen3-480b", - baseURL: "https://openrouter.ai/api/v1", - apiKey: process.env.OPENROUTER_API_KEY!, - model: "qwen/qwen3-coder:free", - maxTokens: 512, -}; - -// OpenRouter — Gemma 4 31B (free) -export const OR_GEMMA4_31B: ProviderConfig = { - name: "or-gemma4-31b", - baseURL: "https://openrouter.ai/api/v1", - apiKey: process.env.OPENROUTER_API_KEY!, - model: "google/gemma-4-31b-it:free", - maxTokens: 512, -}; - -// Anthropic (reference baseline — different adapter required) -export const ANTHROPIC_SONNET: ProviderConfig = { - name: "anthropic-sonnet", - baseURL: "https://api.anthropic.com/v1", // adapter handles format difference - apiKey: process.env.ANTHROPIC_API_KEY!, - model: "claude-sonnet-4-6", - maxTokens: 512, -}; -``` - -Output from each run lands in: -``` -stage-3-enrich/test/output/{provider.name}/results.json -stage-3-enrich/test/output/{provider.name}/metrics.json -``` - -The evaluate script compares all `metrics.json` files side by side. - ---- - -## Evaluation metrics - -The test script measures the following per provider run: - -| Metric | What it measures | -|---|---| -| **JSON parse rate** | % of responses that are valid, schema-compliant JSON. Critical — a failed parse is a wasted call. Target: >97% | -| **Field coverage** | % of records where all required fields are present (cefr votes for all translations, descriptions for all languages, glosses/examples for fr/es) | -| **CEFR agreement** | For records that have a `cefr_source` vote, % where the model agrees. Measures calibration. | -| **Language correctness** | Manual spot-check only — automated detection not reliable enough | -| **Tokens/second** | Local only. Indicates overnight run feasibility | - -### Decision thresholds - -| Metric | Threshold | Action if below | -|---|---|---| -| JSON parse rate | < 97% | Do not use this model for production | -| Field coverage | < 95% | Prompt needs revision before production | -| CEFR agreement | < 70% | Model lacks vocabulary knowledge for this task | - ---- - -## Recommended test sequence - -1. **Start local, minimal dataset (5–10 records)** - Install llama.cpp, run Qwen2.5 3B against 5–10 hand-picked records. - Verify the server works, the output parses, and the model produces - something reasonable. This is purely a smoke test. - -2. **Expand local to full 100-record sample** - Once the pipeline is confirmed working, run all 100 records locally. - Collect metrics. This is your local quality baseline. - -3. **Run the same 100 records through OpenRouter free models** - One model per day (50 req/day limit). Start with `qwen/qwen3-coder:free` - as the quality ceiling. - -4. **Compare metrics side by side** - If local 3B is within acceptable range of the cloud models on CEFR - agreement and field coverage, proceed with local overnight runs for - production. If not, use the cloud model that passed. - -5. **Production run** - Full 117k records. Resume-safe — the script checkpoints after each - record so overnight runs can be stopped and continued.