diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b3c98c9..dd9f34f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -41,6 +41,33 @@ jobs: - name: Test (pytest) run: uv run pytest -q + dist: + name: Build & validate dist artifacts + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v5 + with: + enable-cache: true + + # Build the sdist and wheel exactly as a release would. This is the only + # CI path that exercises packaging, so cruft or a missing-asset build + # surfaces here, in PR, instead of at the manual `twine upload`. + - name: Build sdist + wheel + run: uv build + + # Long-description / metadata validity (the PyPI render gate). + - name: twine check + run: uvx twine check dist/* + + # Content + size asserts: vendored runtime assets present, no tests/cache/ + # cruft in the wheel, no node_modules/cargo-target/junk in the sdist, both + # under their size caps. Same script that runs locally after `uv build`. + - name: Validate dist contents + run: uv run python scripts/validate-dist.py + js-tests: name: JS tests (Jest) runs-on: ubuntu-latest diff --git a/CHANGELOG.md b/CHANGELOG.md index a264c71..3f48f3b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,35 @@ All notable changes to scolta-python are documented here. ## [Unreleased] +### Added +- **CI now builds and validates the PyPI artifacts (`dist` job in + `ci.yml`).** Publishing is manual and nothing in CI built the sdist/wheel, so + packaging breakage or cruft was only found at `twine upload` time. The job + runs `uv build`, `twine check dist/*` (metadata/long-description validity), + and a new `scripts/validate-dist.py` content gate (runs locally too, after + `uv build`). 
The wheel gate asserts every vendored browser runtime asset is + present (`assets/css/scolta.css`, `assets/js/scolta.js`, + `assets/pagefind/{pagefind-worker.js,pagefind.js,wasm.en.pagefind,wasm.unknown.pagefind}`, + `assets/wasm/{scolta_core.js,scolta_core_bg.wasm}`) — the failure mode that + matters most, a wheel that imports but ships no search UI because + `vendor_assets.py` was not run — and that nothing lives outside the `scolta` + package and dist-info (no `tests/`, `__pycache__`, `*.pyc`, or + `.sha256`/`.d.ts`/`.map`/`.log` sidecars). The sdist gate asserts a buildable source + set with no local build-dir junk. Size caps (~2x the measured good artifacts: + wheel 1.5 MB cap vs ~0.73 MB, sdist 4.7 MB cap vs ~2.35 MB) catch a bloat + regression. Mirrors the dist-cruft precedent from the scolta-wp 13 MB zip + incident and the WP.org compliance flags. + ### Fixed +- **The sdist no longer ships local-only build directories.** Hatchling's + source distribution defaulted to "everything on disk except VCS-ignored", + which pulled `tests/js/node_modules` (61 MB of vendored npm packages, + including `.idea` IDE files) and `tools/stemmer-golden/target` (Rust build + artifacts) into the tarball — a 7.9 MB sdist. A new + `[tool.hatch.build.targets.sdist]` `exclude` list (enumerated, fail-closed) + prunes those plus caches/IDE/`.pyc` junk, dropping the sdist to ~2.35 MB + while keeping the full ported test corpus and stemmer fixtures. The wheel was + already clean (`packages = ["src/scolta"]`). 
- **Re-vendored the browser bundle (`scolta.js`/`scolta.css`) from scolta-php `main`, picking up three client-side fixes that had not yet reached the Python binding.** scolta-php #217 stops the sub-word frequency guard from diff --git a/pyproject.toml b/pyproject.toml index 2cb51c2..0c685c4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,6 +51,29 @@ path = "src/scolta/__init__.py" [tool.hatch.build.targets.wheel] packages = ["src/scolta"] +# The sdist must carry the buildable source set (src/, tests/, docs/, the parity +# harnesses and the stemmer-golden tool source) but NOT the local-only build +# directories that live untracked inside them. Hatchling's sdist defaults to +# "everything on disk except VCS-ignored", but it does NOT prune nested, +# separately-managed trees like tests/js/node_modules (61 MB of vendored npm +# packages, including .idea IDE files) or tools/stemmer-golden/target (1.5 MB of +# Rust build artifacts) reliably across environments — so we enumerate the +# excludes explicitly here, fail-closed. If the sdist ever bloats again, this +# is the filter to fix. scripts/validate-dist.py asserts the result. +[tool.hatch.build.targets.sdist] +exclude = [ + "**/node_modules", + "tools/**/target", + "**/__pycache__", + "**/*.pyc", + ".pytest_cache", + ".ruff_cache", + "**/.idea", + "**/.vscode", + ".venv", + "**/.DS_Store", +] + [tool.pytest.ini_options] testpaths = ["tests"] markers = [ diff --git a/scripts/validate-dist.py b/scripts/validate-dist.py new file mode 100755 index 0000000..cbe683e --- /dev/null +++ b/scripts/validate-dist.py @@ -0,0 +1,207 @@ +#!/usr/bin/env python3 +"""Validate the built PyPI artifacts (wheel + sdist) before they ship. + +Publishing scolta to PyPI is manual, and nothing else in CI builds the +distribution artifacts, so packaging breakage or cruft used to surface only at +`twine upload` time. Two specific failure modes motivate this gate: + + 1. 
The wheel silently shipping WITHOUT the vendored browser runtime assets + (e.g. ``scripts/vendor_assets.py`` was never run, or hatch quietly dropped + the data files) — a wheel that imports fine but renders no search UI. + 2. The sdist ballooning with local-only build dirs. The Python sdist defaults + to "everything on disk minus VCS-ignored", which pulled in + ``tests/js/node_modules`` (61 MB) and ``tools/stemmer-golden/target`` + (Rust artifacts) — the same class of bloat as the scolta-wp 13 MB plugin + zip and the WP.org dist-cruft flags. + +This runs the asserts against a REAL build so they hold locally and in CI: + uv build && uv run python scripts/validate-dist.py + +The fail-closed lists below are OURS to enumerate. If an assert fires, the +message says WHAT leaked/is missing and WHERE the filter that controls it lives +(pyproject.toml's hatch targets, or scripts/vendor_assets.py). +""" + +from __future__ import annotations + +import sys +import tarfile +import zipfile +from pathlib import Path + +# --- size caps (shared pattern: ~2x the measured good artifact) -------------- +# Measured 2026-06-14 against a clean `uv build` on main + this PR: +# wheel = 729_525 bytes (~712 KiB; dominated by scolta_core_bg.wasm ~1.2 MB +# uncompressed, the vendored js, and pagefind .pagefind blobs) +# sdist = 2_349_924 bytes (~2.24 MiB; src + the full ported test corpus and +# stemmer fixtures, with node_modules/target excluded) +# Caps are ~2x those measured values, leaving headroom for asset growth while +# still catching a node_modules/target/cruft regression an order of magnitude +# bigger. +WHEEL_MAX_BYTES = 1_500_000 # ~2x of 729_525 +SDIST_MAX_BYTES = 4_700_000 # ~2x of 2_349_924 + +# --- vendored runtime assets that MUST be in the wheel ----------------------- +# Enumerated from `scripts/vendor_assets.py` (_SUBDIRS x allowed extensions) and +# the assets/ tree. These are the browser-side runtime: missing any of them +# yields an importable-but-non-functional package. 
The filter that produces +# them lives in scripts/vendor_assets.py (re-vendoring) and +# [tool.hatch.build.targets.wheel] packages = ["src/scolta"] (inclusion). +REQUIRED_WHEEL_ASSETS = ( + "scolta/assets/css/scolta.css", + "scolta/assets/js/scolta.js", + "scolta/assets/pagefind/pagefind-worker.js", + "scolta/assets/pagefind/pagefind.js", + "scolta/assets/pagefind/wasm.en.pagefind", + "scolta/assets/pagefind/wasm.unknown.pagefind", + "scolta/assets/wasm/scolta_core.js", + "scolta/assets/wasm/scolta_core_bg.wasm", +) + +# Sidecar/checksum files vendor_assets.py is fail-closed against — they must +# never reach the wheel even if they appear in the source asset tree. +FORBIDDEN_ASSET_SUFFIXES = (".sha256", ".d.ts", ".map", ".log") + + +def _fail(msg: str) -> None: + print(f" FAIL: {msg}", file=sys.stderr) + + +def validate_wheel(path: Path) -> list[str]: + errors: list[str] = [] + size = path.stat().st_size + print(f"wheel: {path.name} ({size:,} bytes)") + if size > WHEEL_MAX_BYTES: + errors.append( + f"wheel is {size:,} bytes, over the {WHEEL_MAX_BYTES:,} cap " + "(raise it in scripts/validate-dist.py only if the asset growth is " + "intentional)" + ) + + with zipfile.ZipFile(path) as zf: + names = zf.namelist() + + dist_info = next((n.split("/", 1)[0] for n in names if n.endswith(".dist-info/RECORD")), None) + + # (a) every vendored runtime asset is present + present = set(names) + for asset in REQUIRED_WHEEL_ASSETS: + if asset not in present: + errors.append( + f"vendored asset MISSING from wheel: {asset} -- run " + "`python scripts/vendor_assets.py` and confirm " + "[tool.hatch.build.targets.wheel] in pyproject.toml ships it" + ) + + # (b) nothing outside the scolta package or the dist-info lives in the wheel + for name in names: + if name.endswith("/"): + continue + top = name.split("/", 1)[0] + if top == "scolta" or (dist_info and top == dist_info): + continue + errors.append( + f"unexpected file in wheel outside the scolta package/dist-info: {name} " + "-- 
tighten [tool.hatch.build.targets.wheel] in pyproject.toml" + ) + + # (c) no tests, caches, pyc, or sidecar/checksum cruft + for name in names: + low = name.lower() + if name.startswith("scolta/tests/") or "/tests/" in name: + errors.append(f"tests/ leaked into wheel: {name} -- exclude in hatch wheel target") + if "__pycache__" in name or low.endswith(".pyc"): + errors.append(f"bytecode/pycache leaked into wheel: {name}") + if ".pytest_cache" in name or ".ruff_cache" in name: + errors.append(f"tool cache leaked into wheel: {name}") + if any(low.endswith(suffix) for suffix in FORBIDDEN_ASSET_SUFFIXES): + errors.append( + f"sidecar/checksum file leaked into wheel: {name} -- " + "scripts/vendor_assets.py allowlist should have blocked it" + ) + return errors + + +def validate_sdist(path: Path) -> list[str]: + errors: list[str] = [] + size = path.stat().st_size + print(f"sdist: {path.name} ({size:,} bytes)") + if size > SDIST_MAX_BYTES: + errors.append( + f"sdist is {size:,} bytes, over the {SDIST_MAX_BYTES:,} cap -- the " + "usual cause is a local build dir (node_modules / cargo target) " + "leaking; tighten [tool.hatch.build.targets.sdist].exclude in " + "pyproject.toml" + ) + + with tarfile.open(path, "r:gz") as tf: + names = tf.getnames() + + # strip the leading "scolta-/" prefix for clearer matching + rel = [] + for name in names: + parts = name.split("/", 1) + rel.append(parts[1] if len(parts) == 2 else parts[0]) + + # buildable source set must be present + must_have = ("pyproject.toml", "src/scolta/__init__.py", "PKG-INFO") + for needed in must_have: + if needed not in rel: + errors.append(f"sdist is not buildable: missing {needed}") + if not any(r.startswith("src/scolta/assets/") for r in rel): + errors.append("sdist carries no src/scolta/assets/ — vendored assets missing from source") + + # no junk: local build dirs, caches, IDE files + for name, r in zip(names, rel, strict=True): + low = r.lower() + if "node_modules" in r: + errors.append( + f"node_modules 
leaked into sdist: {name} -- exclude in " + "[tool.hatch.build.targets.sdist] in pyproject.toml" + ) + if r.startswith("tools/") and "/target/" in r: + errors.append( + f"cargo build target leaked into sdist: {name} -- exclude " + "tools/**/target in the hatch sdist target" + ) + if "__pycache__" in r or low.endswith(".pyc"): + errors.append(f"bytecode/pycache leaked into sdist: {name}") + if ".pytest_cache" in r or ".ruff_cache" in r: + errors.append(f"tool cache leaked into sdist: {name}") + if "/.idea/" in f"/{r}" or "/.vscode/" in f"/{r}": + errors.append(f"IDE config leaked into sdist: {name}") + if low.endswith(".ds_store"): + errors.append(f".DS_Store leaked into sdist: {name}") + return errors + + +def main() -> int: + dist = Path(__file__).resolve().parent.parent / "dist" + wheels = sorted(dist.glob("*.whl")) + sdists = sorted(dist.glob("*.tar.gz")) + + if not wheels: + print(f"no wheel found in {dist} -- run `uv build` first", file=sys.stderr) + return 1 + if not sdists: + print(f"no sdist found in {dist} -- run `uv build` first", file=sys.stderr) + return 1 + + errors: list[str] = [] + for wheel in wheels: + errors.extend(validate_wheel(wheel)) + for sdist in sdists: + errors.extend(validate_sdist(sdist)) + + if errors: + print(f"\nDistribution validation FAILED ({len(errors)} problem(s)):", file=sys.stderr) + for err in errors: + _fail(err) + return 1 + + print("\nDistribution validation passed.") + return 0 + + +if __name__ == "__main__": + sys.exit(main())