feat(pipeline): add data pipeline workspace and extraction stage
- rename scripts/ to data-pipeline/, archive existing scripts
- add @lila/pipeline as pnpm workspace package
- add stage-1-extract through stage-5-compare folder structure
- update SUPPORTED_LANGUAGE_CODES (add es, de, fr)
- update SUPPORTED_POS (add adjective, adverb)
- add description field to term_glosses
- add term_examples table
- run and verify db migration
- write and verify extract.py (117,659 synsets across 5 languages)
- write PIPELINE.md
This commit is contained in:
parent
e993aac711
commit
c9cddf68de
7 changed files with 1054164 additions and 33 deletions
204
data-pipeline/stage-1-extract/scripts/extract.py
Normal file
204
data-pipeline/stage-1-extract/scripts/extract.py
Normal file
|
|
@ -0,0 +1,204 @@
|
|||
"""
|
||||
data-pipeline/stage-1-extract/scripts/extract.py
|
||||
|
||||
Extract all synsets from the Open Multilingual Wordnet (OMW) for all
|
||||
supported languages and parts of speech.
|
||||
|
||||
Output: one JSON file per language, written to stage-1-extract/output/
|
||||
en.json, it.json, es.json, de.json, fr.json
|
||||
|
||||
Each file is a JSON array of synset records:
|
||||
{
|
||||
"source_id": "ili:i12345",
|
||||
"pos": "noun",
|
||||
"translations": { "en": ["dog", "canine"], "it": ["cane"] },
|
||||
"glosses": { "en": ["a domesticated animal..."] },
|
||||
"examples": { "en": ["the dog barked at the stranger"] }
|
||||
}
|
||||
|
||||
Usage:
|
||||
python stage-1-extract/scripts/extract.py
|
||||
python stage-1-extract/scripts/extract.py --sample
|
||||
|
||||
Prerequisites:
|
||||
pip install wn
|
||||
python -m wn download omw-en:1.4
|
||||
python -m wn download omw-it:1.4
|
||||
python -m wn download omw-de:1.4
|
||||
python -m wn download omw-es:1.4
|
||||
python -m wn download omw-fr:1.4
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import wn
|
||||
|
||||
# Language codes to extract. Each one is passed to ``wn.Wordnet(lang=...)``
# and is also used to build the ``omw-<lang>:1.4`` download hint.
SUPPORTED_LANGUAGE_CODES: list[str] = ["en", "it", "es", "de", "fr"]

# Maps the POS tag passed to ``Wordnet.synsets(pos=...)`` to the label stored
# in each output record's "pos" field.
POS_MAP: dict[str, str] = {
    "n": "noun",
    "v": "verb",
    "a": "adjective",
    "s": "adjective",  # adjective satellite — collapsed into adjective
    "r": "adverb",
}
|
||||
|
||||
|
||||
def extract_all(
    output_dir: str = "stage-1-extract/output", sample: bool = False
) -> None:
    """Extract synsets for every supported language into one JSON file.

    Loads one wordnet per entry in ``SUPPORTED_LANGUAGE_CODES``, groups the
    lemmas, definitions and examples of every synset that has an ILI by that
    ILI, and writes the resulting records as a JSON array to
    ``<output_dir>/omw.json``. Progress and a coverage summary are printed
    to stdout.

    Args:
        output_dir: Directory that receives ``omw.json``; created (with
            parents) if it does not exist.
        sample: When true, only the first 100 ILIs (in sorted order) are
            written.

    Raises:
        SystemExit: With status 1 if a wordnet cannot be loaded
            (``wn.Error``).
    """
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    # Load one Wordnet object per language up front.
    print("Loading wordnets...")
    wordnets = _load_wordnets()

    # Collect per-ILI data across all languages and POS.
    print("\nExtracting synsets...")
    by_ili = _collect_by_ili(wordnets)

    # Build records and write single combined output file.
    print("\nBuilding records...")
    ilis = sorted(by_ili)
    if sample:
        ilis = ilis[:100]
    records: list[dict] = [_build_record(ili, by_ili[ili]) for ili in ilis]

    output_file = out / "omw.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)

    print(f"\nWrote {len(records):,} synsets → {output_file}")
    _print_coverage(records)


def _load_wordnets() -> dict[str, "wn.Wordnet"]:
    """Open one Wordnet per supported language; exit(1) if any fails to load."""
    wordnets: dict[str, "wn.Wordnet"] = {}
    for lang in SUPPORTED_LANGUAGE_CODES:
        try:
            wordnets[lang] = wn.Wordnet(lang=lang)
            synset_count = len(wordnets[lang].synsets())
        except wn.Error as e:
            # Diagnostics go to stderr so they are not mixed into the
            # progress output on stdout.
            print(f" ERROR loading {lang}: {e}", file=sys.stderr)
            print(f" Run: python -m wn download omw-{lang}:1.4", file=sys.stderr)
            sys.exit(1)
        else:
            print(f" {lang}: {synset_count:,} total synsets")
    return wordnets


def _collect_by_ili(wordnets: dict[str, "wn.Wordnet"]) -> dict[str, dict]:
    """Group lemmas, glosses and examples by ILI, then by language.

    Each value is a dict holding a ``"pos"`` label (taken from the first
    synset seen for that ILI) plus one entry per language code with
    ``"lemmas"``, ``"glosses"`` and ``"examples"`` lists.
    """
    by_ili: dict[str, dict] = {}

    for lang, wnet in wordnets.items():
        for omw_pos, pos_label in POS_MAP.items():
            covered = 0
            for synset in wnet.synsets(pos=omw_pos):
                ili = synset.ili
                if not ili:
                    # Synsets without an ILI cannot be aligned across languages.
                    continue
                covered += 1

                lemmas = [str(lemma) for lemma in synset.lemmas()]
                defns = [d for d in synset.definitions() if d]
                examples = [e for e in synset.examples() if e]

                entry = by_ili.setdefault(ili, {"pos": pos_label})
                existing = entry.get(lang)
                if existing is None:
                    entry[lang] = {
                        "lemmas": lemmas,
                        "glosses": defns,
                        "examples": examples,
                    }
                else:
                    # ILI already exists for this language — merge data.
                    # Happens when 'a' and 's' both map to adjective for the
                    # same ILI. dict.fromkeys deduplicates while keeping
                    # first-seen order.
                    existing["lemmas"] = list(
                        dict.fromkeys(existing["lemmas"] + lemmas)
                    )
                    existing["glosses"] = list(
                        dict.fromkeys(existing["glosses"] + defns)
                    )
                    existing["examples"] = list(
                        dict.fromkeys(existing["examples"] + examples)
                    )

            print(f" {lang} {pos_label}: {covered:,} synsets with ILI")

    return by_ili


def _build_record(ili: str, data: dict) -> dict:
    """Turn one per-ILI entry into the output record, omitting empty lists."""
    record: dict = {
        "source_id": f"ili:{ili}",
        "pos": data["pos"],
        "translations": {},
        "glosses": {},
        "examples": {},
    }

    for key, value in data.items():
        if key == "pos":
            # Every other key is a language code.
            continue
        lang = key
        if value["lemmas"]:
            record["translations"][lang] = value["lemmas"]
        if value["glosses"]:
            record["glosses"][lang] = value["glosses"]
        if value["examples"]:
            record["examples"][lang] = value["examples"]

    return record
|
||||
|
||||
|
||||
def _print_coverage(records: list[dict]) -> None:
    """Print per-language translation, gloss, and example counts.

    Also prints how many records fall under each part of speech.

    Args:
        records: Synset records, each with a ``"pos"`` string and
            ``"translations"``, ``"glosses"`` and ``"examples"`` dicts that
            map a language code to a list of strings. May be empty.
    """
    fields = ("translations", "glosses", "examples")
    lang_stats: dict[str, dict[str, int]] = {
        lang: dict.fromkeys(fields, 0) for lang in SUPPORTED_LANGUAGE_CODES
    }
    pos_stats: dict[str, int] = {}

    for r in records:
        pos = r["pos"]
        pos_stats[pos] = pos_stats.get(pos, 0) + 1

        for field in fields:
            for lang, items in r[field].items():
                # Languages outside SUPPORTED_LANGUAGE_CODES are not counted.
                if lang in lang_stats:
                    lang_stats[lang][field] += len(items)

    print("\nPOS breakdown:")
    for pos, count in sorted(pos_stats.items()):
        print(f" {pos}: {count:,}")

    print("\nCoverage per language:")
    total = len(records)
    for lang, counts in lang_stats.items():
        t = counts["translations"]
        g = counts["glosses"]
        e = counts["examples"]
        # With no records the average is reported as 0.0 instead of raising
        # ZeroDivisionError.
        avg = t / total if total else 0.0
        print(
            f" {lang}: {t:,} translations, {g:,} glosses, {e:,} examples (avg {avg:.1f} translations/synset)"
        )
|
||||
|
||||
|
||||
# Command-line entry point: parse the options and run the extraction.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Extract OMW data to JSON")
    parser.add_argument(
        "--output-dir",
        default="stage-1-extract/output",
        # extract_all writes one combined file, not one file per language.
        help="Output directory for the combined omw.json file",
    )
    parser.add_argument(
        "--sample",
        action="store_true",
        # The limit applies to the combined record list, not per language.
        help="Write only the first 100 synsets (sorted by ILI) for inspection",
    )
    args = parser.parse_args()

    extract_all(output_dir=args.output_dir, sample=args.sample)
|
||||
324482
data-pipeline/stage-2-annotate/sources/cefr/de.json
Normal file
324482
data-pipeline/stage-2-annotate/sources/cefr/de.json
Normal file
File diff suppressed because it is too large
Load diff
186374
data-pipeline/stage-2-annotate/sources/cefr/en.json
Normal file
186374
data-pipeline/stage-2-annotate/sources/cefr/en.json
Normal file
File diff suppressed because it is too large
Load diff
163922
data-pipeline/stage-2-annotate/sources/cefr/es.json
Normal file
163922
data-pipeline/stage-2-annotate/sources/cefr/es.json
Normal file
File diff suppressed because it is too large
Load diff
193382
data-pipeline/stage-2-annotate/sources/cefr/fr.json
Normal file
193382
data-pipeline/stage-2-annotate/sources/cefr/fr.json
Normal file
File diff suppressed because it is too large
Load diff
185759
data-pipeline/stage-2-annotate/sources/cefr/it.json
Normal file
185759
data-pipeline/stage-2-annotate/sources/cefr/it.json
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue