diff --git a/.gitea/workflows/docker-publish.yml b/.gitea/workflows/docker-publish.yml index 8ef5f68..7ca3adc 100644 --- a/.gitea/workflows/docker-publish.yml +++ b/.gitea/workflows/docker-publish.yml @@ -5,6 +5,15 @@ on: tags: - 'v*' +# Serialize all runs of this workflow, whatever tag triggered them, so one +# run's `docker system prune` (smoke gates) can't nuke another run's in-flight +# buildx cache. The group deliberately omits github.ref: every tag push has +# its own ref, so a per-ref group would not serialize two different tags. +# cancel-in-progress: false — never cancel a release run that is in progress. +concurrency: + group: ${{ github.workflow }} + cancel-in-progress: false + # Runner disk pressure notes: # Gitea Actions runners use `catthehacker/ubuntu:act-latest` on a shared host # with limited overlay space (~40 GB, often 70%+ used at start). Building both @@ -29,6 +38,34 @@ jobs: - name: Force IPv4 for Docker Hub run: echo 'precedence ::ffff:0:0/96 100' >> /etc/gai.conf + # See the "Runner disk pressure notes" preamble at the top of this file. + # `load: true` peak disk = tarball + unpacked image + buildx cache, which + # now exceeds the free space the ~40 GB runner overlay starts with. Strip + # catthehacker-resident toolchains and any stale docker state up front.
+ - name: Reclaim runner disk + run: | + set -x + df -h / || true + rm -rf \ + /opt/hostedtoolcache \ + /opt/microsoft \ + /opt/az \ + /opt/ghc \ + /usr/local/.ghcup \ + /usr/share/dotnet \ + /usr/share/swift \ + /usr/local/lib/android \ + /usr/local/share/powershell \ + /usr/local/share/chromium \ + /usr/local/share/boost \ + /usr/lib/jvm 2>/dev/null || true + apt-get clean || true + rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* || true + docker system df || true + docker system prune -af --volumes || true + docker builder prune -af || true + df -h / || true + - name: Set up Docker Buildx uses: docker/setup-buildx-action@v4 with: @@ -57,6 +94,30 @@ jobs: - name: Force IPv4 for Docker Hub run: echo 'precedence ::ffff:0:0/96 100' >> /etc/gai.conf + - name: Reclaim runner disk  # same cleanup as the earlier Reclaim step in this file; keep the copies in sync + run: | + set -x + df -h / || true + rm -rf \ + /opt/hostedtoolcache \ + /opt/microsoft \ + /opt/az \ + /opt/ghc \ + /usr/local/.ghcup \ + /usr/share/dotnet \ + /usr/share/swift \ + /usr/local/lib/android \ + /usr/local/share/powershell \ + /usr/local/share/chromium \ + /usr/local/share/boost \ + /usr/lib/jvm 2>/dev/null || true + apt-get clean || true + rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* || true + docker system df || true + docker system prune -af --volumes || true + docker builder prune -af || true + df -h / || true + - name: Set up Docker Buildx uses: docker/setup-buildx-action@v4 with: diff --git a/.gitea/workflows/validate.yml b/.gitea/workflows/validate.yml index 9d0eb06..aeb626c 100644 --- a/.gitea/workflows/validate.yml +++ b/.gitea/workflows/validate.yml @@ -46,6 +46,34 @@ jobs: run: | echo 'precedence ::ffff:0:0/96 100' >> /etc/gai.conf + # The runner's overlay disk starts ~70% full. With `load: true`, peak disk + # use is tarball + unpacked image + buildx cache, which overflows the disk + # once the image crosses ~3 GB. Strip catthehacker-resident toolchains we + # never use and prune any stale docker state up front.
+ - name: Reclaim runner disk + run: | + set -x + df -h / || true + rm -rf \ + /opt/hostedtoolcache \ + /opt/microsoft \ + /opt/az \ + /opt/ghc \ + /usr/local/.ghcup \ + /usr/share/dotnet \ + /usr/share/swift \ + /usr/local/lib/android \ + /usr/local/share/powershell \ + /usr/local/share/chromium \ + /usr/local/share/boost \ + /usr/lib/jvm 2>/dev/null || true + apt-get clean || true + rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* || true + docker system df || true + docker system prune -af --volumes || true + docker builder prune -af || true + df -h / || true + - name: Set up Docker Buildx uses: docker/setup-buildx-action@v4 with: @@ -76,6 +104,30 @@ jobs: run: | echo 'precedence ::ffff:0:0/96 100' >> /etc/gai.conf + - name: Reclaim runner disk  # same cleanup as the earlier Reclaim step in this file; keep the copies in sync + run: | + set -x + df -h / || true + rm -rf \ + /opt/hostedtoolcache \ + /opt/microsoft \ + /opt/az \ + /opt/ghc \ + /usr/local/.ghcup \ + /usr/share/dotnet \ + /usr/share/swift \ + /usr/local/lib/android \ + /usr/local/share/powershell \ + /usr/local/share/chromium \ + /usr/local/share/boost \ + /usr/lib/jvm 2>/dev/null || true + apt-get clean || true + rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* || true + docker system df || true + docker system prune -af --volumes || true + docker builder prune -af || true + df -h / || true + - name: Set up Docker Buildx uses: docker/setup-buildx-action@v4 with: diff --git a/CHANGELOG.md b/CHANGELOG.md index 3b32694..3da9343 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,17 @@ Tags follow `v{opencode_version}[letter]` — bare tag for the first build on a --- +## v1.14.31b — 2026-05-01 + +**CI: reclaim runner disk before `load: true` smoke builds.** + +- **Fix:** v1.14.31's publish workflow and the `validate` workflow both hit `No space left on device` on the single-arch amd64 smoke/validate builds (`/opt/uv-tools/mempalace/lib/python3.13/site-packages/hf_xet/hf_xet.abi3.so`, `/usr/local/bin/git-lfs`).
Root cause is not the build itself but the `load: true` step: peak disk during export equals tarball + unpacked image + buildx cache, and the image has crossed the ~3 GB threshold where that peak no longer fits in the ~12 GB of free space the runner container starts with. The v1.14.30c refactor split multi-arch into per-arch push-by-digest jobs (which don't `load`), but the smoke gates still do and still hit the wall. + - Added a `Reclaim runner disk` step to all four `load: true` jobs (`validate-base`, `validate-omos`, `smoke-base`, `smoke-omos`). The step strips `catthehacker/ubuntu:act-latest`-resident toolchains we never use (hostedtoolcache, dotnet, android, powershell, swift, ghc/ghcup, jvm, microsoft, az, chromium, boost), clears the apt caches plus `/tmp` and `/var/tmp`, and runs `docker system prune -af --volumes` and `docker builder prune -af` against the runner's dockerd before `setup-buildx-action`. Expected reclaim is 6–12 GB depending on what's resident. + - Added workflow-level `concurrency: { group: ..., cancel-in-progress: false }` on `docker-publish.yml` so publish runs that share a concurrency group queue behind each other instead of overlapping, which keeps `docker system prune` in one run from racing an in-flight buildx cache in another. + - Pruning is deliberately kept out of the per-arch matrix push-by-digest jobs (`build-base`/`build-omos`) — those don't need it (no `load: true`), and pruning in parallel jobs risks one job nuking another's cache. +- **Follow-up** (not in this release): image-size reduction via a dedicated `uv tool install mempalace` build stage (strips uv's cache from the final image), pinning `mempalace-toolkit` to a commit SHA with `--depth=1 --filter=blob:none`, and auditing whether `hf_xet` is actually required by mempalace at runtime. These will ship in the next release that rebases on a new opencode version. +- No image changes. Rebuild of v1.14.31 content only. + ## v1.14.31 — 2026-05-01 Bump opencode to 1.14.31.