Add reusable CI/CD workflow and documentation
- .gitea/workflows/deploy-nomad.yaml: Shared workflow for build/push/deploy - docs/CICD_SETUP.md: Guide for adding CI/CD to new services - nix-runner/README.md: Document the custom Nix runner image Services can now use a 10-line workflow that calls the shared one: uses: ppetru/alo-cluster/.gitea/workflows/deploy-nomad.yaml@master 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
96
.gitea/workflows/deploy-nomad.yaml
Normal file
96
.gitea/workflows/deploy-nomad.yaml
Normal file
@@ -0,0 +1,96 @@
|
|||||||
|
# ABOUTME: Reusable workflow for building Nix Docker images and deploying to Nomad.
# ABOUTME: Called by service repos with: uses: ppetru/alo-cluster/.gitea/workflows/deploy-nomad.yaml@master
#
# Pipeline: nix build -> skopeo push (:latest) -> bump job Meta.uuid via the
# Nomad HTTP API -> poll the resulting deployment until it succeeds or fails.
#
# NOTE: inputs and secrets are handed to the scripts through `env:` rather than
# being expanded inline inside `run:`. Inline expression expansion happens
# before the shell parses the script, so a value containing quotes or shell
# metacharacters would be executed as code.

name: Deploy to Nomad

on:
  workflow_call:
    inputs:
      service_name:
        required: true
        type: string
        description: "Nomad job name (must match job ID in services/*.hcl)"
      flake_output:
        required: false
        type: string
        default: "dockerImage"
        description: "Flake output to build (default: dockerImage)"
      registry:
        required: false
        type: string
        default: "gitea.v.paler.net"
        description: "Container registry hostname"
    secrets:
      REGISTRY_USERNAME:
        required: true
      REGISTRY_PASSWORD:
        required: true
      NOMAD_ADDR:
        required: true

jobs:
  build-and-deploy:
    runs-on: nix
    steps:
      - uses: actions/checkout@v4

      - name: Build Docker image
        env:
          FLAKE_OUTPUT: ${{ inputs.flake_output }}
        run: |
          echo "Building .#$FLAKE_OUTPUT..."
          # The image tarball is exposed as ./result for the push step.
          nix build ".#$FLAKE_OUTPUT" --out-link result

      - name: Push to registry
        env:
          IMAGE: ${{ inputs.registry }}/ppetru/${{ inputs.service_name }}:latest
          REGISTRY_USERNAME: ${{ secrets.REGISTRY_USERNAME }}
          REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }}
        run: |
          echo "Pushing to $IMAGE..."
          skopeo copy \
            --dest-creds "$REGISTRY_USERNAME:$REGISTRY_PASSWORD" \
            --insecure-policy \
            docker-archive:result \
            "docker://$IMAGE"

      - name: Deploy to Nomad
        env:
          NOMAD_ADDR: ${{ secrets.NOMAD_ADDR }}
          SERVICE: ${{ inputs.service_name }}
        run: |
          set -euo pipefail
          echo "Deploying $SERVICE to Nomad..."

          # Fetch current job. -f makes curl fail on HTTP errors (e.g. 404 for
          # an unknown job) instead of handing an error body to jq.
          JOB=$(curl -fsS "$NOMAD_ADDR/v1/job/$SERVICE")

          # Remember the deployment that exists *before* we submit, so we can
          # tell a genuinely new deployment apart from the previous one.
          PREV_DEPLOY_ID=$(curl -fsS "$NOMAD_ADDR/v1/job/$SERVICE/deployment" | jq -r '.ID // empty')

          # Update UUID to force a new job version (and thus a deployment)
          NEW_UUID=$(cat /proc/sys/kernel/random/uuid)
          echo "New deployment UUID: $NEW_UUID"
          PAYLOAD=$(printf '%s' "$JOB" | jq --arg uuid "$NEW_UUID" '{Job: (.Meta.uuid = $uuid)}')

          # Submit updated job; a rejected submit now fails the step.
          RESULT=$(printf '%s' "$PAYLOAD" | curl -fsS -X POST "$NOMAD_ADDR/v1/jobs" \
            -H "Content-Type: application/json" -d @-)
          echo "Submit result: $RESULT"

          # Wait (up to ~30s) for Nomad to create a deployment that differs
          # from the one seen before the submit.
          DEPLOY_ID=""
          for attempt in $(seq 1 10); do
            sleep 3
            CANDIDATE=$(curl -fsS "$NOMAD_ADDR/v1/job/$SERVICE/deployment" | jq -r '.ID // empty') || CANDIDATE=""
            if [ -n "$CANDIDATE" ] && [ "$CANDIDATE" != "$PREV_DEPLOY_ID" ]; then
              DEPLOY_ID="$CANDIDATE"
              break
            fi
          done
          echo "Deployment ID: $DEPLOY_ID"

          if [ -z "$DEPLOY_ID" ]; then
            echo "ERROR: No new deployment created. Ensure job has 'update' stanza with 'auto_revert = true'"
            exit 1
          fi

          # Monitor deployment: 30 polls x 10s = up to 5 minutes.
          echo "Monitoring deployment..."
          for i in $(seq 1 30); do
            # A transient API error must not abort the run; treat it as
            # "unknown" and keep polling until the budget is exhausted.
            STATUS=$(curl -fsS "$NOMAD_ADDR/v1/deployment/$DEPLOY_ID" | jq -r '.Status') || STATUS="unknown"
            echo "[$i/30] Deployment status: $STATUS"
            case "$STATUS" in
              successful)
                echo "Deployment successful!"
                exit 0
                ;;
              failed|cancelled)
                echo "Deployment failed or cancelled"
                exit 1
                ;;
            esac
            sleep 10
          done
          echo "Timeout waiting for deployment"
          exit 1
|
||||||
206
docs/CICD_SETUP.md
Normal file
206
docs/CICD_SETUP.md
Normal file
@@ -0,0 +1,206 @@
|
|||||||
|
# CI/CD Setup for Nomad Services
|
||||||
|
|
||||||
|
Guide for adding automated builds and deployments to a service.
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
### 1. Service Repository
|
||||||
|
|
||||||
|
Your service needs a `flake.nix` that exports a Docker image:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
{
|
||||||
|
outputs = { self, nixpkgs, ... }: let pkgs = nixpkgs.legacyPackages.x86_64-linux; in {
|
||||||
|
# The workflow looks for this output by default
|
||||||
|
dockerImage = pkgs.dockerTools.buildImage {
|
||||||
|
name = "gitea.v.paler.net/ppetru/<service>";
|
||||||
|
tag = "latest";
|
||||||
|
# ... image config
|
||||||
|
};
|
||||||
|
};
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Important**: Use `extraCommands` instead of `runAsRoot` in your Docker build. `runAsRoot` runs its commands inside a VM, which requires KVM, and the CI runner doesn't have KVM.
|
||||||
|
|
||||||
|
### 2. Nomad Job
|
||||||
|
|
||||||
|
Your job in `services/<name>.hcl` needs:
|
||||||
|
|
||||||
|
```hcl
|
||||||
|
job "<service>" {
|
||||||
|
# Required: UUID changes trigger deployments
|
||||||
|
meta {
|
||||||
|
uuid = uuidv4()
|
||||||
|
}
|
||||||
|
|
||||||
|
# Required: enables deployment tracking and auto-rollback
|
||||||
|
update {
|
||||||
|
max_parallel = 1
|
||||||
|
health_check = "checks"
|
||||||
|
min_healthy_time = "30s"
|
||||||
|
healthy_deadline = "5m"
|
||||||
|
auto_revert = true
|
||||||
|
}
|
||||||
|
|
||||||
|
# Required: pulls new image on each deployment
|
||||||
|
task "app" {
|
||||||
|
config {
|
||||||
|
force_pull = true
|
||||||
|
}
|
||||||
|
|
||||||
|
# Recommended: health check for deployment validation
|
||||||
|
service {
|
||||||
|
check {
|
||||||
|
type = "http"
|
||||||
|
path = "/healthz"
|
||||||
|
interval = "10s"
|
||||||
|
timeout = "5s"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
### 1. Create Workflow
|
||||||
|
|
||||||
|
Add `.gitea/workflows/deploy.yaml` to your service repo:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
name: Deploy
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches: [master]
|
||||||
|
workflow_dispatch:
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
deploy:
|
||||||
|
uses: ppetru/alo-cluster/.gitea/workflows/deploy-nomad.yaml@master
|
||||||
|
with:
|
||||||
|
service_name: <your-service> # Must match Nomad job ID
|
||||||
|
secrets: inherit
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Add Secrets
|
||||||
|
|
||||||
|
In Gitea → Your Repo → Settings → Actions → Secrets, add:
|
||||||
|
|
||||||
|
| Secret | Value |
|
||||||
|
|--------|-------|
|
||||||
|
| `REGISTRY_USERNAME` | `ppetru` |
|
||||||
|
| `REGISTRY_PASSWORD` | Gitea access token with `packages:write` |
|
||||||
|
| `NOMAD_ADDR` | `http://nomad.service.consul:4646` |
|
||||||
|
|
||||||
|
### 3. Push
|
||||||
|
|
||||||
|
Push to the `master` branch. The workflow will:
|
||||||
|
1. Build your Docker image with Nix
|
||||||
|
2. Push to Gitea registry
|
||||||
|
3. Update the Nomad job to trigger deployment
|
||||||
|
4. Monitor until deployment succeeds or fails
|
||||||
|
|
||||||
|
## Workflow Options
|
||||||
|
|
||||||
|
The shared workflow accepts these inputs:
|
||||||
|
|
||||||
|
| Input | Default | Description |
|
||||||
|
|-------|---------|-------------|
|
||||||
|
| `service_name` | (required) | Nomad job ID |
|
||||||
|
| `flake_output` | `dockerImage` | Flake output to build |
|
||||||
|
| `registry` | `gitea.v.paler.net` | Container registry |
|
||||||
|
|
||||||
|
Example with custom flake output:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
jobs:
|
||||||
|
deploy:
|
||||||
|
uses: ppetru/alo-cluster/.gitea/workflows/deploy-nomad.yaml@master
|
||||||
|
with:
|
||||||
|
service_name: myservice
|
||||||
|
flake_output: packages.x86_64-linux.docker
|
||||||
|
secrets: inherit
|
||||||
|
```
|
||||||
|
|
||||||
|
## How It Works
|
||||||
|
|
||||||
|
```
|
||||||
|
Push to master
|
||||||
|
↓
|
||||||
|
Build: nix build .#dockerImage
|
||||||
|
↓
|
||||||
|
Push: skopeo → gitea.v.paler.net/ppetru/<service>:latest
|
||||||
|
↓
|
||||||
|
Deploy: Update job meta.uuid → Nomad creates deployment
|
||||||
|
↓
|
||||||
|
Monitor: Poll deployment status for up to 5 minutes
|
||||||
|
↓
|
||||||
|
Success: Deployment healthy
|
||||||
|
OR
|
||||||
|
Failure: Nomad auto-reverts to previous version
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Build fails with KVM error
|
||||||
|
|
||||||
|
```
|
||||||
|
Required system: 'x86_64-linux' with features {kvm}
|
||||||
|
```
|
||||||
|
|
||||||
|
Use `extraCommands` instead of `runAsRoot` in your image definition (the `dockerImage` output in `flake.nix`, or a `docker.nix` it imports):
|
||||||
|
|
||||||
|
```nix
|
||||||
|
# Bad - requires KVM
|
||||||
|
runAsRoot = ''
|
||||||
|
mkdir -p /tmp
|
||||||
|
'';
|
||||||
|
|
||||||
|
# Good - no KVM needed
|
||||||
|
extraCommands = ''
|
||||||
|
mkdir -p tmp
|
||||||
|
chmod 1777 tmp
|
||||||
|
'';
|
||||||
|
```
|
||||||
|
|
||||||
|
### No deployment created
|
||||||
|
|
||||||
|
Ensure your Nomad job has the `update` stanza with `auto_revert = true`.
|
||||||
|
|
||||||
|
### Image not updating
|
||||||
|
|
||||||
|
Check that `force_pull = true` is set in the Nomad job's Docker config.
|
||||||
|
|
||||||
|
### Deployment fails health checks
|
||||||
|
|
||||||
|
- Check your `/healthz` endpoint works
|
||||||
|
- Increase `healthy_deadline` if startup is slow
|
||||||
|
- Check `nomad alloc logs <alloc-id>` for errors
|
||||||
|
|
||||||
|
### Workflow can't access alo-cluster
|
||||||
|
|
||||||
|
If Gitea can't pull the reusable workflow, you may need to make alo-cluster public or use a token. As a fallback, copy the workflow content directly.
|
||||||
|
|
||||||
|
## Manual Deployment
|
||||||
|
|
||||||
|
If CI fails, you can deploy manually:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd <service-repo>
|
||||||
|
nix build .#dockerImage
|
||||||
|
skopeo copy --dest-authfile ~/.docker/config.json \
|
||||||
|
docker-archive:result \
|
||||||
|
docker://gitea.v.paler.net/ppetru/<service>:latest
|
||||||
|
nomad run /path/to/alo-cluster/services/<service>.hcl
|
||||||
|
```
|
||||||
|
|
||||||
|
## Rollback
|
||||||
|
|
||||||
|
Nomad auto-reverts on health check failure. For manual rollback:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
nomad job history <service> # List versions
|
||||||
|
nomad job revert <service> <version> # Revert to specific version
|
||||||
|
```
|
||||||
100
nix-runner/README.md
Normal file
100
nix-runner/README.md
Normal file
@@ -0,0 +1,100 @@
|
|||||||
|
# Nix Runner for Gitea Actions
|
||||||
|
|
||||||
|
Custom Docker image for running Nix builds in CI.
|
||||||
|
|
||||||
|
## What's Included
|
||||||
|
|
||||||
|
- **Nix** with flakes enabled (`experimental-features = nix-command flakes`)
|
||||||
|
- **Node.js 20** for JavaScript-based GitHub Actions
|
||||||
|
- **Tools**: git, curl, jq, skopeo, bash, coreutils
|
||||||
|
- **Binary caches**:
|
||||||
|
- `c3.mule-stork.ts.net:8501` (local cache proxy)
|
||||||
|
- `cache.nixos.org` (official)
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
In your workflow:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
jobs:
|
||||||
|
build:
|
||||||
|
runs-on: nix
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
- run: nix build .#myPackage
|
||||||
|
```
|
||||||
|
|
||||||
|
The `nix` label is configured in `services/act-runner.hcl`.
|
||||||
|
|
||||||
|
## Current Version
|
||||||
|
|
||||||
|
**Tag**: `v4`
|
||||||
|
**Image**: `gitea.v.paler.net/ppetru/nix-runner:v4`
|
||||||
|
|
||||||
|
## Updating the Runner
|
||||||
|
|
||||||
|
### 1. Edit `flake.nix`
|
||||||
|
|
||||||
|
Make your changes, then bump the tag:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
tag = "v5"; # was v4
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Build
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd nix-runner
|
||||||
|
nix build
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Push to Registry
|
||||||
|
|
||||||
|
```bash
|
||||||
|
skopeo copy --dest-authfile ~/.docker/config.json \
|
||||||
|
docker-archive:result \
|
||||||
|
docker://gitea.v.paler.net/ppetru/nix-runner:v5
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Update act-runner
|
||||||
|
|
||||||
|
Edit `services/act-runner.hcl`:
|
||||||
|
|
||||||
|
```hcl
|
||||||
|
GITEA_RUNNER_LABELS = "ubuntu-latest:docker://node:20-bookworm,nix:docker://gitea.v.paler.net/ppetru/nix-runner:v5"
|
||||||
|
```
|
||||||
|
|
||||||
|
### 5. Re-register Runner
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo rm /data/services/act-runner/.runner
|
||||||
|
nomad run services/act-runner.hcl
|
||||||
|
```
|
||||||
|
|
||||||
|
The runner will re-register with the new labels.
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
The image uses the `NIX_CONFIG` environment variable for Nix settings:
|
||||||
|
|
||||||
|
```
|
||||||
|
experimental-features = nix-command flakes
|
||||||
|
sandbox = false
|
||||||
|
build-users-group =
|
||||||
|
substituters = http://c3.mule-stork.ts.net:8501 https://cache.nixos.org
|
||||||
|
trusted-public-keys = cache.nixos.org-1:... c3:...
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Build fails with `build-users-group` error
|
||||||
|
|
||||||
|
The image runs as root without the nixbld group. This is handled by `build-users-group =` in NIX_CONFIG.
|
||||||
|
|
||||||
|
### Can't fetch from cache
|
||||||
|
|
||||||
|
Check that the runner container can reach `c3.mule-stork.ts.net:8501` (Tailscale network).
|
||||||
|
|
||||||
|
### Missing tool
|
||||||
|
|
||||||
|
Add it to `paths` in `flake.nix` and rebuild/push a new version.
|
||||||
Reference in New Issue
Block a user