From ed2c899915aa1c2e29bf9efe5e4f744ea15b03b7 Mon Sep 17 00:00:00 2001
From: Petru Paler
Date: Mon, 5 Jan 2026 07:47:01 +0000
Subject: [PATCH] Add reusable CI/CD workflow and documentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- .gitea/workflows/deploy-nomad.yaml: Shared workflow for build/push/deploy
- docs/CICD_SETUP.md: Guide for adding CI/CD to new services
- nix-runner/README.md: Document the custom Nix runner image

Services can now use a 10-line workflow that calls the shared one:
  uses: ppetru/alo-cluster/.gitea/workflows/deploy-nomad.yaml@master

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5
---
 .gitea/workflows/deploy-nomad.yaml | 118 ++++++++++++++++
 docs/CICD_SETUP.md                 | 206 +++++++++++++++++++++++++++++
 nix-runner/README.md               | 100 ++++++++++++++
 3 files changed, 424 insertions(+)
 create mode 100644 .gitea/workflows/deploy-nomad.yaml
 create mode 100644 docs/CICD_SETUP.md
 create mode 100644 nix-runner/README.md

diff --git a/.gitea/workflows/deploy-nomad.yaml b/.gitea/workflows/deploy-nomad.yaml
new file mode 100644
index 0000000..f50199c
--- /dev/null
+++ b/.gitea/workflows/deploy-nomad.yaml
@@ -0,0 +1,118 @@
+# ABOUTME: Reusable workflow for building Nix Docker images and deploying to Nomad.
+# ABOUTME: Called by service repos with: uses: ppetru/alo-cluster/.gitea/workflows/deploy-nomad.yaml@master
+
+name: Deploy to Nomad
+
+on:
+  workflow_call:
+    inputs:
+      service_name:
+        required: true
+        type: string
+        description: "Nomad job name (must match job ID in services/*.hcl)"
+      flake_output:
+        required: false
+        type: string
+        default: "dockerImage"
+        description: "Flake output to build (default: dockerImage)"
+      registry:
+        required: false
+        type: string
+        default: "gitea.v.paler.net"
+        description: "Container registry hostname"
+    secrets:
+      REGISTRY_USERNAME:
+        required: true
+      REGISTRY_PASSWORD:
+        required: true
+      NOMAD_ADDR:
+        required: true
+
+jobs:
+  build-and-deploy:
+    runs-on: nix
+    steps:
+      - uses: actions/checkout@v4
+
+      # Inputs and secrets reach the scripts through env instead of being
+      # interpolated into the script text, so values are never parsed as shell.
+      - name: Build Docker image
+        env:
+          FLAKE_OUTPUT: ${{ inputs.flake_output }}
+        run: |
+          echo "Building .#${FLAKE_OUTPUT}..."
+          nix build ".#${FLAKE_OUTPUT}" --out-link result
+
+      - name: Push to registry
+        env:
+          IMAGE: "${{ inputs.registry }}/ppetru/${{ inputs.service_name }}:latest"
+          REGISTRY_USERNAME: ${{ secrets.REGISTRY_USERNAME }}
+          REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }}
+        run: |
+          echo "Pushing to ${IMAGE}..."
+          skopeo copy \
+            --dest-creds "${REGISTRY_USERNAME}:${REGISTRY_PASSWORD}" \
+            --insecure-policy \
+            docker-archive:result \
+            "docker://${IMAGE}"
+
+      - name: Deploy to Nomad
+        shell: bash
+        env:
+          NOMAD_ADDR: ${{ secrets.NOMAD_ADDR }}
+          SERVICE: ${{ inputs.service_name }}
+        run: |
+          set -euo pipefail
+          echo "Deploying $SERVICE to Nomad..."
+
+          # Remember the latest deployment (if any) so the one created by this
+          # run can be told apart from an older, already-finished deployment.
+          PREV_DEPLOY_ID=$(curl -fsS "$NOMAD_ADDR/v1/job/$SERVICE/deployment" | jq -r '.ID // empty')
+
+          # Fetch current job, update UUID to force deployment
+          JOB=$(curl -fsS "$NOMAD_ADDR/v1/job/$SERVICE")
+          NEW_UUID=$(cat /proc/sys/kernel/random/uuid)
+          echo "New deployment UUID: $NEW_UUID"
+          UPDATED_JOB=$(printf '%s' "$JOB" | jq --arg uuid "$NEW_UUID" '.Meta.uuid = $uuid')
+
+          # Submit updated job; -f makes curl exit non-zero on HTTP errors
+          RESULT=$(printf '%s' "{\"Job\": $UPDATED_JOB}" | curl -fsS -X POST "$NOMAD_ADDR/v1/jobs" \
+            -H "Content-Type: application/json" -d @-)
+          echo "Submit result: $RESULT"
+
+          # Wait (up to ~30s) for Nomad to create a deployment for this submission
+          DEPLOY_ID=""
+          for i in $(seq 1 10); do
+            sleep 3
+            LATEST_ID=$(curl -fsS "$NOMAD_ADDR/v1/job/$SERVICE/deployment" | jq -r '.ID // empty') || LATEST_ID=""
+            if [ -n "$LATEST_ID" ] && [ "$LATEST_ID" != "$PREV_DEPLOY_ID" ]; then
+              DEPLOY_ID="$LATEST_ID"
+              break
+            fi
+          done
+          echo "Deployment ID: $DEPLOY_ID"
+
+          if [ -z "$DEPLOY_ID" ]; then
+            echo "ERROR: No deployment created. Ensure job has 'update' stanza with 'auto_revert = true'"
+            exit 1
+          fi
+
+          echo "Monitoring deployment..."
+          for i in $(seq 1 30); do
+            # A transient API error should not abort monitoring; retry next round
+            STATUS=$(curl -fsS "$NOMAD_ADDR/v1/deployment/$DEPLOY_ID" | jq -r '.Status') || STATUS="unknown"
+            echo "[$i/30] Deployment status: $STATUS"
+            case "$STATUS" in
+              successful)
+                echo "Deployment successful!"
+                exit 0
+                ;;
+              failed|cancelled)
+                echo "Deployment failed or cancelled"
+                exit 1
+                ;;
+            esac
+            sleep 10
+          done
+          echo "Timeout waiting for deployment"
+          exit 1
diff --git a/docs/CICD_SETUP.md b/docs/CICD_SETUP.md
new file mode 100644
index 0000000..076f2e3
--- /dev/null
+++ b/docs/CICD_SETUP.md
@@ -0,0 +1,206 @@
+# CI/CD Setup for Nomad Services
+
+Guide for adding automated builds and deployments to a service.
+
+## Prerequisites
+
+### 1. Service Repository
+
+Your service needs a `flake.nix` that exports a Docker image:
+
+```nix
+{
+  outputs = { self, nixpkgs, ... }: {
+    # The workflow looks for this output by default
+    dockerImage = pkgs.dockerTools.buildImage {
+      name = "gitea.v.paler.net/ppetru/<service-name>";
+      tag = "latest";
+      # ... image config
+    };
+  };
+}
+```
+
+**Important**: Use `extraCommands` instead of `runAsRoot` in your Docker build - the CI runner doesn't have KVM.
+
+### 2.
Nomad Job + +Your job in `services/<service-name>.hcl` needs: + +```hcl +job "<service-name>" { + # Required: UUID changes trigger deployments + meta { + uuid = uuidv4() + } + + # Required: enables deployment tracking and auto-rollback + update { + max_parallel = 1 + health_check = "checks" + min_healthy_time = "30s" + healthy_deadline = "5m" + auto_revert = true + } + + # Required: pulls new image on each deployment + task "app" { + config { + force_pull = true + } + + # Recommended: health check for deployment validation + service { + check { + type = "http" + path = "/healthz" + interval = "10s" + timeout = "5s" + } + } + } +} +``` + +## Quick Start + +### 1. Create Workflow + +Add `.gitea/workflows/deploy.yaml` to your service repo: + +```yaml +name: Deploy + +on: + push: + branches: [master] + workflow_dispatch: + +jobs: + deploy: + uses: ppetru/alo-cluster/.gitea/workflows/deploy-nomad.yaml@master + with: + service_name: <service-name> # Must match Nomad job ID + secrets: inherit +``` + +### 2. Add Secrets + +In Gitea → Your Repo → Settings → Actions → Secrets, add: + +| Secret | Value | +|--------|-------| +| `REGISTRY_USERNAME` | `ppetru` | +| `REGISTRY_PASSWORD` | Gitea access token with `packages:write` | +| `NOMAD_ADDR` | `http://nomad.service.consul:4646` | + +### 3. Push + +Push to `master` branch. The workflow will: +1. Build your Docker image with Nix +2. Push to Gitea registry +3. Update the Nomad job to trigger deployment +4. 
Monitor until deployment succeeds or fails + +## Workflow Options + +The shared workflow accepts these inputs: + +| Input | Default | Description | +|-------|---------|-------------| +| `service_name` | (required) | Nomad job ID | +| `flake_output` | `dockerImage` | Flake output to build | +| `registry` | `gitea.v.paler.net` | Container registry | + +Example with custom flake output: + +```yaml +jobs: + deploy: + uses: ppetru/alo-cluster/.gitea/workflows/deploy-nomad.yaml@master + with: + service_name: myservice + flake_output: packages.x86_64-linux.docker + secrets: inherit +``` + +## How It Works + +``` +Push to master + ↓ +Build: nix build .#dockerImage + ↓ +Push: skopeo → gitea.v.paler.net/ppetru/<service-name>:latest + ↓ +Deploy: Update job meta.uuid → Nomad creates deployment + ↓ +Monitor: Poll deployment status for up to 5 minutes + ↓ +Success: Deployment healthy + OR +Failure: Nomad auto-reverts to previous version +``` + +## Troubleshooting + +### Build fails with KVM error + +``` +Required system: 'x86_64-linux' with features {kvm} +``` + +Use `extraCommands` instead of `runAsRoot` in your `docker.nix`: + +```nix +# Bad - requires KVM +runAsRoot = '' + mkdir -p /tmp +''; + +# Good - no KVM needed +extraCommands = '' + mkdir -p tmp + chmod 1777 tmp +''; +``` + +### No deployment created + +Ensure your Nomad job has the `update` stanza with `auto_revert = true`. + +### Image not updating + +Check that `force_pull = true` is set in the Nomad job's Docker config. + +### Deployment fails health checks + +- Check your `/healthz` endpoint works +- Increase `healthy_deadline` if startup is slow +- Check `nomad alloc logs <alloc-id>` for errors + +### Workflow can't access alo-cluster + +If Gitea can't pull the reusable workflow, you may need to make alo-cluster public or use a token. As a fallback, copy the workflow content directly. 
+ +## Manual Deployment + +If CI fails, you can deploy manually: + +```bash +cd <service-repo> +nix build .#dockerImage +skopeo copy --dest-authfile ~/.docker/config.json \ + docker-archive:result \ + docker://gitea.v.paler.net/ppetru/<service-name>:latest +nomad run /path/to/alo-cluster/services/<service-name>.hcl +``` + +## Rollback + +Nomad auto-reverts on health check failure. For manual rollback: + +```bash +nomad job history <service-name> # List versions +nomad job revert <service-name> <version> # Revert to specific version +``` diff --git a/nix-runner/README.md b/nix-runner/README.md new file mode 100644 index 0000000..d029aec --- /dev/null +++ b/nix-runner/README.md @@ -0,0 +1,100 @@ +# Nix Runner for Gitea Actions + +Custom Docker image for running Nix builds in CI. + +## What's Included + +- **Nix** with flakes enabled (`experimental-features = nix-command flakes`) +- **Node.js 20** for JavaScript-based GitHub Actions +- **Tools**: git, curl, jq, skopeo, bash, coreutils +- **Binary caches**: + - `c3.mule-stork.ts.net:8501` (local cache proxy) + - `cache.nixos.org` (official) + +## Usage + +In your workflow: + +```yaml +jobs: + build: + runs-on: nix + steps: + - uses: actions/checkout@v4 + - run: nix build .#myPackage +``` + +The `nix` label is configured in `services/act-runner.hcl`. + +## Current Version + +**Tag**: `v4` +**Image**: `gitea.v.paler.net/ppetru/nix-runner:v4` + +## Updating the Runner + +### 1. Edit `flake.nix` + +Make your changes, then bump the tag: + +```nix +tag = "v5"; # was v4 +``` + +### 2. Build + +```bash +cd nix-runner +nix build +``` + +### 3. Push to Registry + +```bash +skopeo copy --dest-authfile ~/.docker/config.json \ + docker-archive:result \ + docker://gitea.v.paler.net/ppetru/nix-runner:v5 +``` + +### 4. Update act-runner + +Edit `services/act-runner.hcl`: + +```hcl +GITEA_RUNNER_LABELS = "ubuntu-latest:docker://node:20-bookworm,nix:docker://gitea.v.paler.net/ppetru/nix-runner:v5" +``` + +### 5. 
Re-register Runner + +```bash +sudo rm /data/services/act-runner/.runner +nomad run services/act-runner.hcl +``` + +The runner will re-register with the new labels. + +## Configuration + +The image uses `NIX_CONFIG` environment variable for Nix settings: + +``` +experimental-features = nix-command flakes +sandbox = false +build-users-group = +substituters = http://c3.mule-stork.ts.net:8501 https://cache.nixos.org +trusted-public-keys = cache.nixos.org-1:... c3:... +``` + +## Troubleshooting + +### Build fails with `build-users-group` error + +The image runs as root without the nixbld group. This is handled by `build-users-group =` in NIX_CONFIG. + +### Can't fetch from cache + +Check that the runner container can reach `c3.mule-stork.ts.net:8501` (Tailscale network). + +### Missing tool + +Add it to `paths` in `flake.nix` and rebuild/push a new version.