Files
alo-cluster/services/prometheus.hcl
2023-08-14 14:58:28 +01:00

167 lines
3.9 KiB
HCL

# Nomad service job running a single Prometheus server in the "alo" datacenter.
# Both the main Prometheus configuration and the alert rules are rendered from
# the inline templates below and bind-mounted into the container.
job "prometheus" {
  datacenters = ["alo"]
  type        = "service"

  meta {
    version = "2"
  }

  group "monitoring" {
    count = 1

    network {
      port "http" {
        #host_network = "tailscale"
        # Fixed port: it has to match --web.listen-address in the task's args.
        static = 9090
      }
    }

    task "prometheus" {
      driver = "docker"

      # Service registration; the tags expose the UI through Traefik's
      # "websecure" entry point.
      service {
        name = "prometheus"
        port = "http"

        tags = [
          "traefik.enable=true",
          "traefik.http.routers.prometheus.entryPoints=websecure",
        ]

        # Liveness probe against Prometheus' /-/healthy endpoint.
        check {
          type     = "http"
          path     = "/-/healthy"
          name     = "http"
          interval = "5s"
          timeout  = "2s"
        }
      }

      # main configuration file
      #
      # Scrape jobs defined below:
      #   prometheus    - Prometheus scraping itself on localhost:9090.
      #   metrics       - every Consul service carrying the "metrics" tag.
      #   consul-server - discovers hosts through the "nomad-client" Consul
      #                   service, rewrites the target port to 8500 and scrapes
      #                   /v1/agent/metrics there.
      #   nomad         - "nomad-client" instances tagged "http", scraped at
      #                   /v1/metrics.
      #
      # The heredoc body is an HCL string template, so a "${...}" sequence is
      # evaluated when the job file is parsed, before Prometheus ever sees it.
      # Regex capture groups in relabel rules are therefore written as "$1"
      # (never "${1}", which would be rendered as the literal number 1).
      #
      # A re-render sends SIGHUP to the task so Prometheus reloads its config.
      template {
        data = <<EOH
#alerting:
#  alertmanagers:
#    - static_configs:
#        - targets:
#            - alertmanager.service.home:9093
rule_files:
  - "alerts.yml"
scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']
  - job_name: 'metrics'
    scrape_interval: 5s
    metrics_path: /metrics
    consul_sd_configs:
      - server: '{{ env "NOMAD_IP_http" }}:8500'
        tags: ['metrics']
    scheme: http
    relabel_configs:
      - source_labels: ['__meta_consul_dc']
        target_label: 'dc'
      - source_labels: [__meta_consul_service]
        target_label: 'job'
      - source_labels: ['__meta_consul_node']
        target_label: 'host'
      - source_labels: ['__meta_consul_tags']
        target_label: 'tags'
      - source_labels: ['__meta_consul_tags']
        regex: '.*job-(.*?)(,.*)'
        replacement: '$1'
        target_label: 'job_name'
  - job_name: 'consul-server'
    scrape_interval: 10s
    metrics_path: /v1/agent/metrics
    honor_labels: true
    params:
      format: ['prometheus']
    consul_sd_configs:
      - server: '{{ env "NOMAD_IP_http" }}:8500'
        services: ['nomad-client']
    scheme: http
    relabel_configs:
      - source_labels: ['__meta_consul_dc']
        target_label: 'dc'
      - source_labels: ['__meta_consul_node']
        target_label: 'host'
      - source_labels: ['__meta_consul_tags']
        target_label: 'tags'
      - source_labels: [__address__]
        action: replace
        regex: ([^:]+):.*
        replacement: $1:8500
        target_label: __address__
  - job_name: 'nomad'
    consul_sd_configs:
      - server: '{{ env "NOMAD_IP_http" }}:8500'
        services: ['nomad-client']
        tags: ['http']
    scheme: http
    scrape_interval: 10s
    metrics_path: /v1/metrics
    params:
      format: ['prometheus']
    relabel_configs:
      - source_labels: ['__meta_consul_dc']
        target_label: 'dc'
      - source_labels: [__meta_consul_service]
        target_label: 'job'
      - source_labels: ['__meta_consul_node']
        target_label: 'host'
EOH

        destination   = "local/prometheus.yml"
        change_mode   = "signal"
        change_signal = "SIGHUP"
        env           = false
      }

      # Alerting rules, referenced from the main config as "alerts.yml".
      # The template delimiters are switched to [[ ]]; presumably this keeps
      # Prometheus' own {{ }} alert templating from being evaluated by Nomad
      # (nothing in the current rules uses it - confirm before relying on it).
      # change_mode is "noop": a re-render of this file alone does not signal
      # or restart the task.
      template {
        change_mode     = "noop"
        destination     = "local/alerts.yml"
        left_delimiter  = "[["
        right_delimiter = "]]"

        data = <<EOH
---
groups:
  - name: prometheus_alerts
    rules:
      - alert: Traefik Down
        expr: absent(nomad_client_allocs_cpu_user{task="traefik"})
        for: 2m
        labels:
          severity: page
        annotations:
          description: "Traefik is down."
EOH
      }

      config {
        image = "prom/prometheus:v2.44.0"

        # The container shares the host's network stack; Prometheus binds
        # 0.0.0.0:9090 directly (see --web.listen-address below).
        network_mode = "host"

        # TSDB data lives under /opt/prometheus (host-mounted below) and is
        # retained for 900 days.
        # NOTE(review): no --config.file is passed; presumably Prometheus picks
        # up /prometheus/prometheus.yml through the image's working directory -
        # confirm against the image.
        args = ["--storage.tsdb.path", "/opt/prometheus", "--web.listen-address", "0.0.0.0:9090", "--storage.tsdb.retention.time", "900d"]

        force_pull = true
        ports      = ["http"]

        # The two rendered templates plus the persistent TSDB directory.
        volumes = [
          "local/alerts.yml:/prometheus/alerts.yml",
          "local/prometheus.yml:/prometheus/prometheus.yml",
          "/data/compute/appdata/prometheus:/opt/prometheus",
        ]
      }

      resources {
        cpu    = 1000
        memory = 512
      }
    }
  }
}