feat(pipeline): add data pipeline workspace and extraction stage

- rename scripts/ to data-pipeline/, archive existing scripts
- add @lila/pipeline as pnpm workspace package
- add stage-1-extract through stage-5-compare folder structure
- update SUPPORTED_LANGUAGE_CODES (add es, de, fr)
- update SUPPORTED_POS (add adjective, adverb)
- add description field to term_glosses
- add term_examples table
- run and verify db migration
- write and verify extract.py (117,659 synsets across 5 languages)
- write PIPELINE.md
This commit is contained in:
lila 2026-04-21 09:39:36 +02:00
parent e993aac711
commit c9cddf68de
7 changed files with 1054164 additions and 33 deletions

View file

@@ -0,0 +1,204 @@
"""
data-pipeline/stage-1-extract/scripts/extract.py
Extract all synsets from the Open Multilingual Wordnet (OMW) for all
supported languages and parts of speech.
Output: a single combined JSON file, written to stage-1-extract/output/omw.json
(or to the directory given with --output-dir).
The file is a JSON array of synset records, each merging all languages:
{
"source_id": "ili:i12345",
"pos": "noun",
"translations": { "en": ["dog", "canine"], "it": ["cane"] },
"glosses": { "en": ["a domesticated animal..."] },
"examples": { "en": ["the dog barked at the stranger"] }
}
Usage:
python stage-1-extract/scripts/extract.py
python stage-1-extract/scripts/extract.py --sample
Prerequisites:
pip install wn
python -m wn download omw-en:1.4
python -m wn download omw-it:1.4
python -m wn download omw-de:1.4
python -m wn download omw-es:1.4
python -m wn download omw-fr:1.4
"""
import json
import sys
from pathlib import Path
import wn
# Two-letter codes of the languages whose wordnets are loaded and merged.
# The order here is the order in which languages are processed and reported.
SUPPORTED_LANGUAGE_CODES: list[str] = [
    "en",
    "it",
    "es",
    "de",
    "fr",
]

# Single-letter part-of-speech tags (as passed to ``synsets(pos=...)``) mapped
# to the part-of-speech labels written into the output records.
POS_MAP: dict[str, str] = {
    "n": "noun",
    "v": "verb",
    "a": "adjective",
    "s": "adjective",  # adjective satellite — collapsed into adjective
    "r": "adverb",
}
def _merge_unique(first: list[str], second: list[str]) -> list[str]:
    """Concatenate two lists, dropping duplicates while keeping first-seen order."""
    return list(dict.fromkeys(first + second))


def _load_wordnets() -> "dict[str, wn.Wordnet]":
    """Open one Wordnet per supported language; exit with status 1 on failure."""
    print("Loading wordnets...")
    wordnets: "dict[str, wn.Wordnet]" = {}
    for lang in SUPPORTED_LANGUAGE_CODES:
        try:
            wordnets[lang] = wn.Wordnet(lang=lang)
            synset_count = len(wordnets[lang].synsets())
            print(f" {lang}: {synset_count:,} total synsets")
        except wn.Error as e:
            print(f" ERROR loading {lang}: {e}")
            print(f" Run: python -m wn download omw-{lang}:1.4")
            sys.exit(1)
    return wordnets


def _collect_by_ili(wordnets: "dict[str, wn.Wordnet]") -> dict[str, dict]:
    """Group lemmas, glosses and examples by ILI across all languages and POS.

    The result maps each ILI to a dict holding a ``"pos"`` label plus one
    entry per language code with ``"lemmas"``, ``"glosses"`` and
    ``"examples"`` lists. The ``"pos"`` label is the one seen first for
    that ILI.
    """
    by_ili: dict[str, dict] = {}
    for lang, wnet in wordnets.items():
        for omw_pos, pos_label in POS_MAP.items():
            covered = 0
            for synset in wnet.synsets(pos=omw_pos):
                ili = synset.ili
                if not ili:
                    # Synsets without an ILI cannot be aligned across languages.
                    continue
                covered += 1
                lemmas = [str(lemma) for lemma in synset.lemmas()]
                defns = [d for d in synset.definitions() if d]
                examples = [e for e in synset.examples() if e]

                entry = by_ili.setdefault(ili, {"pos": pos_label})
                if lang not in entry:
                    entry[lang] = {
                        "lemmas": lemmas,
                        "glosses": defns,
                        "examples": examples,
                    }
                else:
                    # ILI already exists for this language — merge data.
                    # Happens when 'a' and 's' both map to adjective for the
                    # same ILI. Deduplicate to avoid repeated entries.
                    existing = entry[lang]
                    existing["lemmas"] = _merge_unique(existing["lemmas"], lemmas)
                    existing["glosses"] = _merge_unique(existing["glosses"], defns)
                    existing["examples"] = _merge_unique(
                        existing["examples"], examples
                    )
            print(f" {lang} {pos_label}: {covered:,} synsets with ILI")
    return by_ili


def _build_records(
    by_ili: dict[str, dict], sample_size: "int | None" = None
) -> list[dict]:
    """Turn the per-ILI aggregate into output records, sorted by ILI.

    When ``sample_size`` is truthy only the first ``sample_size`` ILIs (in
    sorted order) are kept. Languages with no lemmas/glosses/examples for a
    synset are omitted from the corresponding mapping.
    """
    ilis = sorted(by_ili)
    if sample_size:
        ilis = ilis[:sample_size]

    records: list[dict] = []
    for ili in ilis:
        data = by_ili[ili]
        record: dict = {
            "source_id": f"ili:{ili}",
            "pos": data["pos"],
            "translations": {},
            "glosses": {},
            "examples": {},
        }
        for key, value in data.items():
            if key == "pos":
                # "pos" shares the dict with the language entries; skip it.
                continue
            if value["lemmas"]:
                record["translations"][key] = value["lemmas"]
            if value["glosses"]:
                record["glosses"][key] = value["glosses"]
            if value["examples"]:
                record["examples"][key] = value["examples"]
        records.append(record)
    return records


def extract_all(
    output_dir: str = "stage-1-extract/output", sample: bool = False
) -> None:
    """Extract all ILI-linked synsets and write them to ``omw.json``.

    Loads one wordnet per entry in ``SUPPORTED_LANGUAGE_CODES``, merges the
    synsets of every part of speech in ``POS_MAP`` by their ILI, writes the
    resulting records as a JSON array to ``<output_dir>/omw.json`` and prints
    a coverage summary.

    Args:
        output_dir: Directory for the output file; created if missing.
        sample: When true, keep only the first 100 records (sorted by ILI).

    Raises:
        SystemExit: With status 1 if a wordnet fails to load (``wn.Error``).
    """
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)
    sample_size = 100 if sample else None

    # Load one Wordnet object per language up front.
    wordnets = _load_wordnets()

    # Collect per-ILI data across all languages and POS.
    print("\nExtracting synsets...")
    by_ili = _collect_by_ili(wordnets)

    # Build records and write single combined output file.
    print("\nBuilding records...")
    records = _build_records(by_ili, sample_size)

    output_file = out / "omw.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)
    print(f"\nWrote {len(records):,} synsets → {output_file}")
    _print_coverage(records)
def _print_coverage(records: list[dict]) -> None:
    """Print per-language translation, gloss, and example counts.

    Also prints how many records exist per part of speech. Only languages
    listed in ``SUPPORTED_LANGUAGE_CODES`` are tallied; any other language
    key found in a record is ignored.

    Args:
        records: Synset records, each with a ``"pos"`` label and
            ``"translations"``, ``"glosses"`` and ``"examples"`` mappings
            from language code to a list of strings.
    """
    fields = ("translations", "glosses", "examples")
    lang_stats: dict[str, dict[str, int]] = {
        lang: dict.fromkeys(fields, 0) for lang in SUPPORTED_LANGUAGE_CODES
    }
    pos_stats: dict[str, int] = {}

    for r in records:
        pos = r["pos"]
        pos_stats[pos] = pos_stats.get(pos, 0) + 1
        for field in fields:
            for lang, items in r[field].items():
                if lang in lang_stats:
                    lang_stats[lang][field] += len(items)

    print("\nPOS breakdown:")
    for pos, count in sorted(pos_stats.items()):
        print(f" {pos}: {count:,}")

    print("\nCoverage per language:")
    total = len(records)
    for lang, counts in lang_stats.items():
        t = counts["translations"]
        g = counts["glosses"]
        e = counts["examples"]
        # Guard against an empty record list, which would otherwise raise
        # ZeroDivisionError when computing the average.
        avg = t / total if total else 0.0
        print(
            f" {lang}: {t:,} translations, {g:,} glosses, {e:,} examples (avg {avg:.1f} translations/synset)"
        )
# Command-line entry point: parse the options and run the extraction.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Extract OMW data to JSON")
    parser.add_argument(
        "--output-dir",
        default="stage-1-extract/output",
        # extract_all writes exactly one file, omw.json, into this directory.
        help="Output directory for the combined omw.json file",
    )
    parser.add_argument(
        "--sample",
        action="store_true",
        # The limit applies to the merged record list, not to each language.
        help="Write only the first 100 synsets (in sorted ILI order) for inspection",
    )
    args = parser.parse_args()
    extract_all(output_dir=args.output_dir, sample=args.sample)

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff