# Nomad job specification (HCL) - original listing: 209 lines, 4.9 KiB
# Nomad service job that runs a Prometheus server and a Pushgateway side by
# side in a single task group ("monitoring") in the "alo" datacenter.
#
# Both embedded YAML documents (prometheus.yml and alerts.yml) are rendered by
# Nomad `template` blocks into the task's local/ directory and bind-mounted
# into the Prometheus container.
job "prometheus" {
  datacenters = ["alo"]
  type        = "service"

  meta {
    version = "2"
  }

  group "monitoring" {
    count = 1

    network {
      # Prometheus web UI / API.
      port "http" {
        #host_network = "tailscale"
        static = 9090
      }

      # Pushgateway.
      port "push" {
        static = 9091
      }
    }

    task "prometheus" {
      driver = "docker"

      config {
        # NOTE(review): ":latest" together with force_pull means every task
        # start may pick up a new Prometheus release; consider pinning a tag.
        image        = "prom/prometheus:latest"
        network_mode = "host"

        # Setting args replaces the image's default command line, so every
        # flag Prometheus needs has to be listed here.
        args = [
          # The rendered config is bind-mounted at this path (see volumes);
          # name it explicitly instead of relying on the image's working
          # directory and Prometheus' relative default of "prometheus.yml".
          "--config.file", "/prometheus/prometheus.yml",
          "--storage.tsdb.path", "/opt/prometheus",
          "--web.listen-address", "0.0.0.0:9090",
          "--storage.tsdb.retention.time", "90d",
          # NOTE(review): the admin API allows deleting series and taking
          # snapshots, and 9090 is bound on all host interfaces; confirm that
          # access to this port is restricted.
          "--web.enable-admin-api",
        ]

        force_pull = true
        ports      = ["http"]

        volumes = [
          # Files rendered by the template blocks below.
          "local/alerts.yml:/prometheus/alerts.yml",
          "local/prometheus.yml:/prometheus/prometheus.yml",
          # Host directory backing the TSDB (matches --storage.tsdb.path).
          "/data/compute/appdata/prometheus:/opt/prometheus",
        ]
      }

      service {
        name = "prometheus"
        port = "http"

        tags = [
          "traefik.enable=true",
          "traefik.http.routers.prometheus.entryPoints=websecure",
          "traefik.http.routers.prometheus.middlewares=authentik@file",
        ]

        check {
          type     = "http"
          path     = "/-/healthy"
          name     = "http"
          interval = "5s"
          timeout  = "2s"
        }
      }

      # main configuration file
      #
      # Notes on the embedded YAML:
      # - {{ env "NOMAD_IP_http" }} is expanded by Nomad's template engine to
      #   the address of the "http" port; Consul is reached on that address
      #   at port 8500.
      # - Regex capture groups are written as $1, never ${1}: inside an HCL
      #   heredoc "${...}" is HCL interpolation and would be evaluated before
      #   Prometheus ever sees the file.
      # - "scheme: http" is placed at scrape-job level; http is the default
      #   both there and inside consul_sd_configs.
      template {
        data = <<EOH
#alerting:
#  alertmanagers:
#    - static_configs:
#        - targets:
#            - alertmanager.service.home:9093

rule_files:
  - "alerts.yml"

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'pushgateway'
    scrape_interval: 5s
    honor_labels: true
    consul_sd_configs:
      - server: '{{ env "NOMAD_IP_http" }}:8500'
        services: ['pushgateway']
    scheme: http

  - job_name: 'metrics'
    scrape_interval: 5s
    metrics_path: /metrics
    consul_sd_configs:
      - server: '{{ env "NOMAD_IP_http" }}:8500'
        tags: ['metrics']
    scheme: http
    relabel_configs:
      - source_labels: ['__meta_consul_dc']
        target_label: 'dc'
      - source_labels: [__meta_consul_service]
        target_label: 'job'
      - source_labels: ['__meta_consul_node']
        target_label: 'host'
      - source_labels: ['__meta_consul_tags']
        target_label: 'tags'
      - source_labels: ['__meta_consul_tags']
        regex: '.*job-(.*?)(,.*)'
        replacement: '$1'
        target_label: 'job_name'
    metric_relabel_configs:
      - source_labels: [__name__]
        # also large (but useful): traefik_entrypoint_request_duration_seconds_bucket
        regex: '(redis_commands_latencies_usec_bucket|vector_component_received_events_count_bucket)'
        action: drop

  - job_name: 'consul-server'
    scrape_interval: 10s
    metrics_path: /v1/agent/metrics
    honor_labels: true
    params:
      format: ['prometheus']
    consul_sd_configs:
      - server: '{{ env "NOMAD_IP_http" }}:8500'
        services: ['nomad-client']
    scheme: http
    relabel_configs:
      - source_labels: ['__meta_consul_dc']
        target_label: 'dc'
      - source_labels: ['__meta_consul_node']
        target_label: 'host'
      - source_labels: ['__meta_consul_tags']
        target_label: 'tags'
      - source_labels: [__address__]
        action: replace
        regex: '([^:]+):.*'
        replacement: '$1:8500'
        target_label: __address__

  - job_name: 'nomad'
    consul_sd_configs:
      - server: '{{ env "NOMAD_IP_http" }}:8500'
        services: ['nomad-client']
        tags: ['http']
    scheme: http
    scrape_interval: 10s
    metrics_path: /v1/metrics
    params:
      format: ['prometheus']
    relabel_configs:
      - source_labels: ['__meta_consul_dc']
        target_label: 'dc'
      - source_labels: [__meta_consul_service]
        target_label: 'job'
      - source_labels: ['__meta_consul_node']
        target_label: 'host'

  - job_name: 'wordpress'
    static_configs:
      - targets:
          - 'wordpress.paler.net'
    scrape_interval: 15s
    metrics_path: /wp-json/prompress/v1/metrics

EOH

        destination   = "local/prometheus.yml"
        change_mode   = "signal"
        change_signal = "SIGHUP"
        env           = false
      }

      # Alerting rules, loaded through "rule_files" in prometheus.yml.
      # The template delimiters are switched to [[ ]] so that "{{ }}" in the
      # rules file would pass through to Prometheus untouched.
      template {
        change_mode     = "noop"
        destination     = "local/alerts.yml"
        left_delimiter  = "[["
        right_delimiter = "]]"
        data            = <<EOH
---
groups:
  - name: prometheus_alerts
    rules:
      - alert: Traefik Down
        expr: absent(nomad_client_allocs_cpu_user{task="traefik"})
        for: 2m
        labels:
          severity: page
        annotations:
          description: "Traefik is down."
EOH
      }

      resources {
        cpu    = 1000
        memory = 512
      }
    }

    # Pushgateway, registered in Consul as "pushgateway" so the 'pushgateway'
    # scrape job above discovers it via consul_sd_configs.
    task "pushgateway" {
      driver = "docker"

      config {
        image = "prom/pushgateway:latest"
        ports = ["push"]
      }

      service {
        name = "pushgateway"
        port = "push"
      }
    }
  }
}
|