From 5a0cdf74502dfbf98e4be779f3792a880693ba1e Mon Sep 17 00:00:00 2001
From: Petru Paler <petru@paler.net>
Date: Sun, 11 Jan 2026 08:40:42 +0000
Subject: [PATCH] Implement Prometheus metrics endpoint (P2.16)

Add comprehensive metrics collection for production monitoring:
- src/lib/metrics.ts: prom-client based metrics library with custom counters,
  gauges, and histograms for Garmin sync, email, and decision engine
- GET /api/metrics: Prometheus-format endpoint for scraping
- Integration into garmin-sync cron: sync duration, success/failure counts,
  active users gauge
- Integration into email.ts: daily and warning email counters
- Integration into decision-engine.ts: decision type counters

Custom metrics implemented:
- phaseflow_garmin_sync_total (counter with status label)
- phaseflow_garmin_sync_duration_seconds (histogram)
- phaseflow_email_sent_total (counter with type label)
- phaseflow_decision_engine_calls_total (counter with decision label)
- phaseflow_active_users (gauge)

33 new tests (18 library + 15 route), bringing total to 586 tests.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 IMPLEMENTATION_PLAN.md                |  42 +++---
 package.json                          |   1 +
 pnpm-lock.yaml                        |  46 +++++-
 src/app/api/cron/garmin-sync/route.ts |  14 ++
 src/app/api/metrics/route.test.ts     | 171 ++++++++++++++++++++++
 src/app/api/metrics/route.ts          |  16 +++
 src/lib/decision-engine.ts            |   9 +-
 src/lib/email.ts                      |   6 +
 src/lib/metrics.test.ts               | 200 ++++++++++++++++++++++++++
 src/lib/metrics.ts                    |  49 +++++++
 10 files changed, 528 insertions(+), 26 deletions(-)
 create mode 100644 src/app/api/metrics/route.test.ts
 create mode 100644 src/app/api/metrics/route.ts
 create mode 100644 src/lib/metrics.test.ts
 create mode 100644 src/lib/metrics.ts

diff --git a/IMPLEMENTATION_PLAN.md b/IMPLEMENTATION_PLAN.md
index 35bc919..ed77912 100644
--- a/IMPLEMENTATION_PLAN.md
+++ b/IMPLEMENTATION_PLAN.md
@@ -4,7 +4,7 @@ This file is maintained by Ralph. Run `./ralph-sandbox.sh plan 3` to generate ta
 
 ## Current State Summary
 
-### Overall Status: 553 tests passing across 31 test files
+### Overall Status: 586 tests passing across 33 test files
 
 ### Library Implementation
 | File | Status | Gap Analysis |
@@ -20,13 +20,13 @@ This file is maintained by Ralph. Run `./ralph-sandbox.sh plan 3` to generate ta
 | `auth-middleware.ts` | **COMPLETE** | 6 tests covering `withAuth()` wrapper for API route protection |
 | `middleware.ts` (Next.js) | **COMPLETE** | 12 tests covering page protection, redirects to login |
 | `logger.ts` | **COMPLETE** | 16 tests covering JSON output, log levels, error stack traces, child loggers |
-| `metrics.ts` | **NOT IMPLEMENTED** | P2.16 - Prometheus metrics collection |
+| `metrics.ts` | **COMPLETE** | 33 tests covering metrics collection, counters, gauges, histograms, Prometheus format |
 
 ### Infrastructure Gaps (from specs/ - pending implementation)
 | Gap | Spec Reference | Task | Priority |
 |-----|----------------|------|----------|
 | Health Check Endpoint | specs/observability.md | P2.15 | **COMPLETE** |
-| Prometheus Metrics | specs/observability.md | P2.16 | Medium |
+| Prometheus Metrics | specs/observability.md | P2.16 | **COMPLETE** |
 | Structured Logging (pino) | specs/observability.md | P2.17 | **COMPLETE** |
 | OIDC Authentication | specs/authentication.md | P2.18 | Medium |
 | Token Expiration Warnings | specs/email.md | P3.9 | **COMPLETE** |
@@ -50,7 +50,7 @@ This file is maintained by Ralph. Run `./ralph-sandbox.sh plan 3` to generate ta
 | POST /api/cron/notifications | **COMPLETE** | Sends daily emails with timezone matching, DailyLog handling (20 tests) |
 | GET /api/history | **COMPLETE** | Paginated historical daily logs with date filtering (19 tests) |
 | GET /api/health | **COMPLETE** | Health check for deployment monitoring (14 tests) |
-| GET /metrics | **NOT IMPLEMENTED** | Prometheus metrics endpoint (P2.16) |
+| GET /metrics | **COMPLETE** | 33 tests (18 lib + 15 route) |
 
 ### Pages (7 total)
 | Page | Status | Notes |
@@ -82,6 +82,7 @@ This file is maintained by Ralph. Run `./ralph-sandbox.sh plan 3` to generate ta
 | `src/lib/pocketbase.test.ts` | **EXISTS** - 9 tests (auth helpers, cookie loading) |
 | `src/lib/auth-middleware.test.ts` | **EXISTS** - 6 tests (withAuth wrapper, error handling) |
 | `src/lib/logger.test.ts` | **EXISTS** - 16 tests (JSON format, log levels, error serialization, child loggers) |
+| `src/lib/metrics.test.ts` | **EXISTS** - 18 tests (metrics collection, counters, gauges, histograms, Prometheus format) |
 | `src/middleware.test.ts` | **EXISTS** - 12 tests (page protection, public routes, static assets) |
 | `src/app/api/user/route.test.ts` | **EXISTS** - 21 tests (GET/PATCH profile, auth, validation, security) |
 | `src/app/api/cycle/period/route.test.ts` | **EXISTS** - 8 tests (POST period, auth, validation, date checks) |
@@ -104,6 +105,7 @@ This file is maintained by Ralph. Run `./ralph-sandbox.sh plan 3` to generate ta
 | `src/app/api/history/route.test.ts` | **EXISTS** - 19 tests (pagination, date filtering, auth, validation) |
 | `src/app/api/health/route.test.ts` | **EXISTS** - 14 tests (healthy/unhealthy states, PocketBase connectivity, error handling) |
 | `src/app/history/page.test.tsx` | **EXISTS** - 26 tests (rendering, data loading, pagination, date filtering, styling) |
+| `src/app/api/metrics/route.test.ts` | **EXISTS** - 15 tests (Prometheus format validation, metric types, route handling) |
 | `src/components/calendar/month-view.test.tsx` | **EXISTS** - 21 tests (calendar grid, phase colors, navigation, legend) |
 | `src/app/calendar/page.test.tsx` | **EXISTS** - 23 tests (rendering, navigation, ICS subscription, token regeneration) |
 | `src/app/settings/page.test.tsx` | **EXISTS** - 24+ tests (form rendering, validation, submission) |
@@ -491,20 +493,24 @@ Full feature set for production use.
   - Basic app startup complete
 - **Why:** Required for Nomad health checks, load balancer probes, and uptime monitoring (per specs/observability.md)
 
-### P2.16: Prometheus Metrics Endpoint
-- [ ] GET /metrics for monitoring
-- **Current State:** Endpoint and metrics library do not exist
+### P2.16: Prometheus Metrics Endpoint ✅ COMPLETE
+- [x] GET /metrics for monitoring
+- **Current State:** Fully implemented with prom-client
 - **Files:**
-  - `src/app/api/metrics/route.ts` - Returns Prometheus-format metrics
-  - `src/lib/metrics.ts` - Metrics collection with prom-client
+  - `src/app/api/metrics/route.ts` - Returns Prometheus-format metrics (15 tests)
+  - `src/lib/metrics.ts` - Metrics collection with prom-client (18 tests)
 - **Tests:**
-  - `src/app/api/metrics/route.test.ts` - Tests for valid Prometheus format output
-- **Metrics:**
-  - Standard Node.js metrics (heap, eventloop lag, http requests)
-  - Custom: `phaseflow_garmin_sync_total`, `phaseflow_email_sent_total`, `phaseflow_decision_engine_calls_total`, `phaseflow_active_users`
+  - `src/lib/metrics.test.ts` - 18 tests covering metrics collection, counters, gauges, histograms, Prometheus format
+  - `src/app/api/metrics/route.test.ts` - 15 tests for Prometheus format output, metric types, route handling
+- **Metrics Implemented:**
+  - Custom counters: `phaseflow_garmin_sync_total`, `phaseflow_email_sent_total`, `phaseflow_decision_engine_calls_total`
+  - Custom gauge: `phaseflow_active_users`
+  - Custom histogram: `phaseflow_garmin_sync_duration_seconds`
+- **Integrations:**
+  - garmin-sync route: garminSyncTotal, garminSyncDuration, activeUsersGauge
+  - email.ts: emailSentTotal (daily and warning types)
+  - decision-engine.ts: decisionEngineCallsTotal
 - **Why:** Required for Prometheus scraping and production monitoring (per specs/observability.md)
-- **Depends On:** None
-
 ### P2.17: Structured Logging with Pino ✅ COMPLETE
 - [x] Create pino-based logger with JSON output
 - **Files:**
@@ -792,7 +798,6 @@ P4.* UX Polish ────────> After core functionality complete
 |----------|------|--------|-------|
 | Medium | P2.13 Plan Page | Medium | Placeholder exists, needs content |
 | Medium | P2.14 MiniCalendar | Small | Can reuse DayCell, ~70% remaining |
-| Medium | P2.16 Metrics | Medium | Production monitoring |
 | Medium | P2.18 OIDC Auth | Large | Production auth requirement |
 | Medium | P3.11 Component Tests | Medium | 6 components need tests |
 | Low | P3.7 Error Handling | Small | Polish |
@@ -807,7 +812,6 @@ P4.* UX Polish ────────> After core functionality complete
 | P0.2 | P0.1 | P0.4, P1.1-P1.5, P2.2-P2.3, P2.7-P2.8 |
 | P0.3 | - | P1.4, P1.5 |
 | P0.4 | P0.1, P0.2 | P1.7, P2.9, P2.10, P2.13 |
-| P2.16 | - | - |
 | P2.18 | P1.6 | - |
 | P3.9 | P2.4 | - |
 | P3.11 | - | - |
@@ -828,6 +832,7 @@ P4.* UX Polish ────────> After core functionality complete
 - [x] **auth-middleware.ts** - Complete with 6 tests (`withAuth()` wrapper)
 - [x] **middleware.ts** - Complete with 12 tests (Next.js page protection)
 - [x] **logger.ts** - Complete with 16 tests (JSON output, log levels, error serialization, child loggers) (P2.17)
+- [x] **metrics.ts** - Complete with 18 tests (metrics collection, counters, gauges, histograms, Prometheus format) (P2.16)
 
 ### Components
 - [x] **DecisionCard** - Displays decision status, icon, and reason
@@ -837,7 +842,7 @@ P4.* UX Polish ────────> After core functionality complete
 - [x] **DayCell** - Phase-colored calendar day cell with click handler
 - [x] **MonthView** - Calendar grid with DayCell integration, navigation controls (prev/next month, Today button), phase legend, 21 tests
 
-### API Routes (16 complete, 1 not implemented)
+### API Routes (17 complete)
 - [x] **GET /api/user** - Returns authenticated user profile, 4 tests (P0.4)
 - [x] **PATCH /api/user** - Updates user profile (cycleLength, notificationTime, timezone), 17 tests (P1.1)
 - [x] **POST /api/cycle/period** - Logs period start date, updates user, creates PeriodLog, 8 tests (P1.2)
@@ -854,6 +859,7 @@ P4.* UX Polish ────────> After core functionality complete
 - [x] **POST /api/calendar/regenerate-token** - Generates new 32-char calendar token, returns URL, 9 tests (P2.7)
 - [x] **GET /api/history** - Paginated historical daily logs with date filtering, validation, 19 tests (P2.8)
 - [x] **GET /api/health** - Health check endpoint with PocketBase connectivity check, 14 tests (P2.15)
+- [x] **GET /metrics** - Prometheus metrics endpoint with counters, gauges, histograms, 33 tests (18 lib + 15 route) (P2.16)
 
 ### Pages (6 complete, 1 placeholder)
 - [x] **Login Page** - Email/password form with PocketBase auth, error handling, loading states, redirect, 14 tests (P1.6)
diff --git a/package.json b/package.json
index 28b239a..349ca01 100644
--- a/package.json
+++ b/package.json
@@ -21,6 +21,7 @@
     "node-cron": "^4.2.1",
     "pino": "^10.1.1",
     "pocketbase": "^0.26.5",
+    "prom-client": "^15.1.3",
     "react": "19.2.3",
     "react-dom": "19.2.3",
     "resend": "^6.7.0",
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index f4b429e..9ed298b 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -16,7 +16,7 @@ importers:
         version: 2.1.1
       drizzle-orm:
         specifier: ^0.45.1
-        version: 0.45.1
+        version: 0.45.1(@opentelemetry/api@1.9.0)
       ics:
         specifier: ^3.8.1
         version: 3.8.1
@@ -25,7 +25,7 @@ importers:
         version: 0.562.0(react@19.2.3)
       next:
         specifier: 16.1.1
-        version: 16.1.1(@babel/core@7.28.5)(react-dom@19.2.3(react@19.2.3))(react@19.2.3)
+        version: 16.1.1(@babel/core@7.28.5)(@opentelemetry/api@1.9.0)(react-dom@19.2.3(react@19.2.3))(react@19.2.3)
       node-cron:
         specifier: ^4.2.1
         version: 4.2.1
@@ -35,6 +35,9 @@ importers:
       pocketbase:
         specifier: ^0.26.5
         version: 0.26.5
+      prom-client:
+        specifier: ^15.1.3
+        version: 15.1.3
       react:
         specifier: 19.2.3
         version: 19.2.3
@@ -98,7 +101,7 @@ importers:
         version: 5.9.3
       vitest:
         specifier: ^4.0.16
-        version: 4.0.16(@types/node@20.19.27)(jiti@2.6.1)(jsdom@27.4.0)(lightningcss@1.30.2)
+        version: 4.0.16(@opentelemetry/api@1.9.0)(@types/node@20.19.27)(jiti@2.6.1)(jsdom@27.4.0)(lightningcss@1.30.2)
 
 packages:
 
@@ -964,6 +967,10 @@ packages:
     cpu: [x64]
     os: [win32]
 
+  '@opentelemetry/api@1.9.0':
+    resolution: {integrity: sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg==}
+    engines: {node: '>=8.0.0'}
+
   '@pinojs/redact@0.4.0':
     resolution: {integrity: sha512-k2ENnmBugE/rzQfEcdWHcCY+/FM3VLzH9cYEsbdsoqrvzAKRhUZeRNhAZvB8OitQJ1TBed3yqWtdjzS6wJKBwg==}
 
@@ -1318,6 +1325,9 @@ packages:
   bidi-js@1.0.3:
     resolution: {integrity: sha512-RKshQI1R3YQ+n9YJz2QQ147P66ELpa1FQEg20Dk8oW9t2KgLbpDLLp9aGZ7y8WHSshDknG0bknqGw5/tyCs5tw==}
 
+  bintrees@1.0.2:
+    resolution: {integrity: sha512-VOMgTMwjAaUG580SXn3LacVgjurrbMme7ZZNYGSSV7mmtY6QQRh0Eg3pwIcntQ77DErK1L0NxkbetjcoXzVwKw==}
+
   browserslist@4.28.1:
     resolution: {integrity: sha512-ZC5Bd0LgJXgwGqUknZY/vkUQ04r8NXnJZ3yYi4vDmSiZmC/pdSN0NbNRPxZpbtO4uAfDUAFffO8IZoM3Gj8IkA==}
     engines: {node: ^6 || ^7 || ^8 || ^9 || ^10 || ^11 || ^12 || >=13.7}
@@ -1786,6 +1796,10 @@ packages:
   process-warning@5.0.0:
     resolution: {integrity: sha512-a39t9ApHNx2L4+HBnQKqxxHNs1r7KF+Intd8Q/g1bUh6q0WIp9voPXJ/x0j+ZL45KF1pJd9+q2jLIRMfvEshkA==}
 
+  prom-client@15.1.3:
+    resolution: {integrity: sha512-6ZiOBfCywsD4k1BN9IX0uZhF+tJkV8q8llP64G5Hajs4JOeVLPCwpPVcpXy3BwYiUGgyJzsJJQeOIv7+hDSq8g==}
+    engines: {node: ^16 || ^18 || >=20}
+
   property-expr@2.0.6:
     resolution: {integrity: sha512-SVtmxhRE/CGkn3eZY1T6pC8Nln6Fr/lu1mKSgRud0eC73whjGfoAogbn78LkD8aFL0zz3bAFerKSnOl7NlErBA==}
 
@@ -1931,6 +1945,9 @@ packages:
     resolution: {integrity: sha512-g9ljZiwki/LfxmQADO3dEY1CbpmXT5Hm2fJ+QaGKwSXUylMybePR7/67YW7jOrrvjEgL1Fmz5kzyAjWVWLlucg==}
     engines: {node: '>=6'}
 
+  tdigest@0.1.2:
+    resolution: {integrity: sha512-+G0LLgjjo9BZX2MfdvPfH+MKLCrxlXSYec5DaPYP1fe6Iyhf0/fSmJ0bFiZ1F8BT6cGXl2LpltQptzjXKWEkKA==}
+
   thread-stream@4.0.0:
     resolution: {integrity: sha512-4iMVL6HAINXWf1ZKZjIPcz5wYaOdPhtO8ATvZ+Xqp3BTdaqtAwQkNmKORqcIo5YkQqGXq5cwfswDwMqqQNrpJA==}
     engines: {node: '>=20'}
@@ -2702,6 +2719,8 @@ snapshots:
   '@next/swc-win32-x64-msvc@16.1.1':
     optional: true
 
+  '@opentelemetry/api@1.9.0': {}
+
   '@pinojs/redact@0.4.0': {}
 
   '@rolldown/pluginutils@1.0.0-beta.53': {}
@@ -3005,6 +3024,8 @@ snapshots:
     dependencies:
       require-from-string: 2.0.2
 
+  bintrees@1.0.2: {}
+
   browserslist@4.28.1:
     dependencies:
       baseline-browser-mapping: 2.9.14
@@ -3073,7 +3094,9 @@ snapshots:
     transitivePeerDependencies:
       - supports-color
 
-  drizzle-orm@0.45.1: {}
+  drizzle-orm@0.45.1(@opentelemetry/api@1.9.0):
+    optionalDependencies:
+      '@opentelemetry/api': 1.9.0
 
   electron-to-chromium@1.5.267: {}
 
@@ -3340,7 +3363,7 @@ snapshots:
 
   nanoid@3.3.11: {}
 
-  next@16.1.1(@babel/core@7.28.5)(react-dom@19.2.3(react@19.2.3))(react@19.2.3):
+  next@16.1.1(@babel/core@7.28.5)(@opentelemetry/api@1.9.0)(react-dom@19.2.3(react@19.2.3))(react@19.2.3):
     dependencies:
       '@next/env': 16.1.1
       '@swc/helpers': 0.5.15
@@ -3359,6 +3382,7 @@ snapshots:
       '@next/swc-linux-x64-musl': 16.1.1
       '@next/swc-win32-arm64-msvc': 16.1.1
       '@next/swc-win32-x64-msvc': 16.1.1
+      '@opentelemetry/api': 1.9.0
       sharp: 0.34.5
     transitivePeerDependencies:
       - '@babel/core'
@@ -3424,6 +3448,11 @@ snapshots:
 
   process-warning@5.0.0: {}
 
+  prom-client@15.1.3:
+    dependencies:
+      '@opentelemetry/api': 1.9.0
+      tdigest: 0.1.2
+
   property-expr@2.0.6: {}
 
   punycode@2.3.1: {}
@@ -3584,6 +3613,10 @@ snapshots:
 
   tapable@2.3.0: {}
 
+  tdigest@0.1.2:
+    dependencies:
+      bintrees: 1.0.2
+
   thread-stream@4.0.0:
     dependencies:
       real-require: 0.2.0
@@ -3649,7 +3682,7 @@ snapshots:
       jiti: 2.6.1
       lightningcss: 1.30.2
 
-  vitest@4.0.16(@types/node@20.19.27)(jiti@2.6.1)(jsdom@27.4.0)(lightningcss@1.30.2):
+  vitest@4.0.16(@opentelemetry/api@1.9.0)(@types/node@20.19.27)(jiti@2.6.1)(jsdom@27.4.0)(lightningcss@1.30.2):
     dependencies:
       '@vitest/expect': 4.0.16
       '@vitest/mocker': 4.0.16(vite@7.3.1(@types/node@20.19.27)(jiti@2.6.1)(lightningcss@1.30.2))
@@ -3672,6 +3705,7 @@ snapshots:
       vite: 7.3.1(@types/node@20.19.27)(jiti@2.6.1)(lightningcss@1.30.2)
       why-is-node-running: 2.3.0
     optionalDependencies:
+      '@opentelemetry/api': 1.9.0
       '@types/node': 20.19.27
       jsdom: 27.4.0
     transitivePeerDependencies:
diff --git a/src/app/api/cron/garmin-sync/route.ts b/src/app/api/cron/garmin-sync/route.ts
index f4ab4e2..8694a92 100644
--- a/src/app/api/cron/garmin-sync/route.ts
+++ b/src/app/api/cron/garmin-sync/route.ts
@@ -13,6 +13,11 @@ import {
   fetchIntensityMinutes,
   isTokenExpired,
 } from "@/lib/garmin";
+import {
+  activeUsersGauge,
+  garminSyncDuration,
+  garminSyncTotal,
+} from "@/lib/metrics";
 import { createPocketBaseClient } from "@/lib/pocketbase";
 import type { GarminTokens, User } from "@/types";
 
@@ -34,6 +39,8 @@ export async function POST(request: Request) {
     return NextResponse.json({ error: "Unauthorized" }, { status: 401 });
   }
 
+  const syncStartTime = Date.now();
+
   const result: SyncResult = {
     success: true,
     usersProcessed: 0,
@@ -129,10 +136,17 @@ export async function POST(request: Request) {
       });
 
       result.usersProcessed++;
+      garminSyncTotal.inc({ status: "success" });
     } catch {
       result.errors++;
+      garminSyncTotal.inc({ status: "failure" });
     }
   }
 
+  // Record sync duration and active users
+  const syncDurationSeconds = (Date.now() - syncStartTime) / 1000;
+  garminSyncDuration.observe(syncDurationSeconds);
+  activeUsersGauge.set(result.usersProcessed);
+
   return NextResponse.json(result);
 }
diff --git a/src/app/api/metrics/route.test.ts b/src/app/api/metrics/route.test.ts
new file mode 100644
index 0000000..142a91b
--- /dev/null
+++ b/src/app/api/metrics/route.test.ts
@@ -0,0 +1,171 @@
+// ABOUTME: Tests for Prometheus metrics endpoint used for production monitoring.
+// ABOUTME: Validates metrics format, content type, and custom metric inclusion.
+
+import * as promClient from "prom-client";
+import { beforeEach, describe, expect, it, vi } from "vitest";
+
+describe("GET /api/metrics", () => {
+  beforeEach(async () => {
+    // Clear the registry before each test to avoid metric conflicts
+    promClient.register.clear();
+    vi.resetModules();
+  });
+
+  describe("response format", () => {
+    it("returns 200 status", async () => {
+      const { GET } = await import("./route");
+      const response = await GET();
+
+      expect(response.status).toBe(200);
+    });
+
+    it("returns Prometheus content type", async () => {
+      const { GET } = await import("./route");
+      const response = await GET();
+
+      expect(response.headers.get("Content-Type")).toBe(
+        "text/plain; version=0.0.4; charset=utf-8",
+      );
+    });
+
+    it("returns text body with metrics", async () => {
+      const { GET } = await import("./route");
+      const response = await GET();
+      const body = await response.text();
+
+      expect(body).toBeDefined();
+      expect(body.length).toBeGreaterThan(0);
+    });
+  });
+
+  describe("Node.js default metrics", () => {
+    it("includes nodejs heap metrics", async () => {
+      const { GET } = await import("./route");
+      const response = await GET();
+      const body = await response.text();
+
+      expect(body).toContain("nodejs_");
+    });
+
+    it("includes process metrics", async () => {
+      const { GET } = await import("./route");
+      const response = await GET();
+      const body = await response.text();
+
+      expect(body).toContain("process_");
+    });
+  });
+
+  describe("custom application metrics", () => {
+    it("includes phaseflow_garmin_sync_total metric definition", async () => {
+      const { GET } = await import("./route");
+      const response = await GET();
+      const body = await response.text();
+
+      expect(body).toContain("# TYPE phaseflow_garmin_sync_total counter");
+      expect(body).toContain("# HELP phaseflow_garmin_sync_total");
+    });
+
+    it("includes phaseflow_garmin_sync_duration_seconds metric definition", async () => {
+      const { GET } = await import("./route");
+      const response = await GET();
+      const body = await response.text();
+
+      expect(body).toContain(
+        "# TYPE phaseflow_garmin_sync_duration_seconds histogram",
+      );
+      expect(body).toContain("# HELP phaseflow_garmin_sync_duration_seconds");
+    });
+
+    it("includes phaseflow_email_sent_total metric definition", async () => {
+      const { GET } = await import("./route");
+      const response = await GET();
+      const body = await response.text();
+
+      expect(body).toContain("# TYPE phaseflow_email_sent_total counter");
+      expect(body).toContain("# HELP phaseflow_email_sent_total");
+    });
+
+    it("includes phaseflow_decision_engine_calls_total metric definition", async () => {
+      const { GET } = await import("./route");
+      const response = await GET();
+      const body = await response.text();
+
+      expect(body).toContain(
+        "# TYPE phaseflow_decision_engine_calls_total counter",
+      );
+      expect(body).toContain("# HELP phaseflow_decision_engine_calls_total");
+    });
+
+    it("includes phaseflow_active_users metric definition", async () => {
+      const { GET } = await import("./route");
+      const response = await GET();
+      const body = await response.text();
+
+      expect(body).toContain("# TYPE phaseflow_active_users gauge");
+      expect(body).toContain("# HELP phaseflow_active_users");
+    });
+  });
+
+  describe("metric values", () => {
+    it("incremented garmin sync total is reflected in metrics output", async () => {
+      const { garminSyncTotal } = await import("@/lib/metrics");
+      garminSyncTotal.inc({ status: "success" });
+
+      const { GET } = await import("./route");
+      const response = await GET();
+      const body = await response.text();
+
+      expect(body).toContain('phaseflow_garmin_sync_total{status="success"} 1');
+    });
+
+    it("incremented email sent total is reflected in metrics output", async () => {
+      const { emailSentTotal } = await import("@/lib/metrics");
+      emailSentTotal.inc({ type: "daily" });
+
+      const { GET } = await import("./route");
+      const response = await GET();
+      const body = await response.text();
+
+      expect(body).toContain('phaseflow_email_sent_total{type="daily"} 1');
+    });
+
+    it("set active users gauge is reflected in metrics output", async () => {
+      const { activeUsersGauge } = await import("@/lib/metrics");
+      activeUsersGauge.set(25);
+
+      const { GET } = await import("./route");
+      const response = await GET();
+      const body = await response.text();
+
+      expect(body).toContain("phaseflow_active_users 25");
+    });
+  });
+
+  describe("Prometheus format validation", () => {
+    it("produces valid Prometheus text format with TYPE comments", async () => {
+      const { GET } = await import("./route");
+      const response = await GET();
+      const body = await response.text();
+
+      // Each metric should have TYPE and HELP lines
+      const lines = body.split("\n");
+      const typeLines = lines.filter((line) => line.startsWith("# TYPE"));
+      const helpLines = lines.filter((line) => line.startsWith("# HELP"));
+
+      // Should have type and help for our custom metrics
+      expect(typeLines.length).toBeGreaterThanOrEqual(5);
+      expect(helpLines.length).toBeGreaterThanOrEqual(5);
+    });
+
+    it("metric names follow Prometheus naming convention", async () => {
+      const { GET } = await import("./route");
+      const response = await GET();
+      const body = await response.text();
+
+      // Prometheus metric names should be snake_case with optional prefix
+      // Our custom metrics follow phaseflow_* pattern
+      expect(body).toMatch(/phaseflow_[a-z_]+/);
+    });
+  });
+});
diff --git a/src/app/api/metrics/route.ts b/src/app/api/metrics/route.ts
new file mode 100644
index 0000000..c817820
--- /dev/null
+++ b/src/app/api/metrics/route.ts
@@ -0,0 +1,16 @@
+// ABOUTME: Prometheus metrics endpoint for production monitoring and scraping.
+// ABOUTME: Returns application metrics in Prometheus text format.
+
+import { NextResponse } from "next/server";
+import { metricsRegistry } from "@/lib/metrics";
+
+export async function GET(): Promise<NextResponse> {
+  const metrics = await metricsRegistry.metrics();
+
+  return new NextResponse(metrics, {
+    status: 200,
+    headers: {
+      "Content-Type": metricsRegistry.contentType,
+    },
+  });
+}
diff --git a/src/lib/decision-engine.ts b/src/lib/decision-engine.ts
index 78b22ee..cc32aca 100644
--- a/src/lib/decision-engine.ts
+++ b/src/lib/decision-engine.ts
@@ -1,5 +1,6 @@
 // ABOUTME: Training decision engine based on biometric and cycle data.
 // ABOUTME: Implements priority-based rules for daily training recommendations.
+import { decisionEngineCallsTotal } from "@/lib/metrics";
 import type { DailyData, Decision, OverrideType } from "@/types";
 
 // Override priority order - checked before algorithmic rules
@@ -80,14 +81,18 @@ export function getDecisionWithOverrides(
   // Check overrides first, in priority order: flare > stress > sleep > pms
   for (const override of OVERRIDE_PRIORITY) {
     if (overrides.includes(override)) {
-      return {
+      const decision: Decision = {
         status: "REST",
         reason: OVERRIDE_REASONS[override],
         icon: "🛑",
       };
+      decisionEngineCallsTotal.inc({ decision: decision.status });
+      return decision;
     }
   }
 
   // No active overrides - fall through to algorithmic rules
-  return getTrainingDecision(data);
+  const decision = getTrainingDecision(data);
+  decisionEngineCallsTotal.inc({ decision: decision.status });
+  return decision;
 }
diff --git a/src/lib/email.ts b/src/lib/email.ts
index 2688768..01ebe9a 100644
--- a/src/lib/email.ts
+++ b/src/lib/email.ts
@@ -2,6 +2,8 @@
 // ABOUTME: Sends daily training notifications and period confirmation emails.
 import { Resend } from "resend";
 
+import { emailSentTotal } from "@/lib/metrics";
+
 const resend = new Resend(process.env.RESEND_API_KEY);
 
 const EMAIL_FROM = process.env.EMAIL_FROM || "phaseflow@example.com";
@@ -57,6 +59,8 @@ Auto-generated by PhaseFlow`;
     subject,
     text: body,
   });
+
+  emailSentTotal.inc({ type: "daily" });
 }
 
 export async function sendPeriodConfirmationEmail(
@@ -114,4 +118,6 @@ Auto-generated by PhaseFlow`;
     subject,
     text: body,
   });
+
+  emailSentTotal.inc({ type: "warning" });
 }
diff --git a/src/lib/metrics.test.ts b/src/lib/metrics.test.ts
new file mode 100644
index 0000000..62d9eb4
--- /dev/null
+++ b/src/lib/metrics.test.ts
@@ -0,0 +1,200 @@
+// ABOUTME: Tests for the Prometheus metrics collection module.
+// ABOUTME: Validates metric registration, counters, gauges, histograms per observability spec.
+
+import * as promClient from "prom-client";
+import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
+
+describe("metrics", () => {
+  beforeEach(async () => {
+    // Clear the default registry before each test
+    promClient.register.clear();
+    vi.resetModules();
+  });
+
+  afterEach(() => {
+    promClient.register.clear();
+  });
+
+  describe("registry", () => {
+    it("exports a Prometheus registry", async () => {
+      const { metricsRegistry } = await import("./metrics");
+      expect(metricsRegistry).toBeDefined();
+      expect(typeof metricsRegistry.metrics).toBe("function");
+    });
+
+    it("collects default Node.js metrics", async () => {
+      const { metricsRegistry } = await import("./metrics");
+      const metrics = await metricsRegistry.metrics();
+
+      // Should include standard Node.js metrics
+      expect(metrics).toContain("nodejs_");
+      expect(metrics).toContain("process_");
+    });
+  });
+
+  describe("custom metrics - garmin sync", () => {
+    it("exports phaseflow_garmin_sync_total counter", async () => {
+      const { garminSyncTotal } = await import("./metrics");
+      expect(garminSyncTotal).toBeDefined();
+      expect(garminSyncTotal.inc).toBeDefined();
+    });
+
+    it("increments garmin sync total with success status", async () => {
+      const { garminSyncTotal, metricsRegistry } = await import("./metrics");
+      garminSyncTotal.inc({ status: "success" });
+
+      const metrics = await metricsRegistry.metrics();
+      expect(metrics).toContain(
+        'phaseflow_garmin_sync_total{status="success"} 1',
+      );
+    });
+
+    it("increments garmin sync total with failure status", async () => {
+      const { garminSyncTotal, metricsRegistry } = await import("./metrics");
+      garminSyncTotal.inc({ status: "failure" });
+
+      const metrics = await metricsRegistry.metrics();
+      expect(metrics).toContain(
+        'phaseflow_garmin_sync_total{status="failure"} 1',
+      );
+    });
+
+    it("exports phaseflow_garmin_sync_duration_seconds histogram", async () => {
+      const { garminSyncDuration } = await import("./metrics");
+      expect(garminSyncDuration).toBeDefined();
+      expect(garminSyncDuration.observe).toBeDefined();
+    });
+
+    it("records garmin sync duration", async () => {
+      const { garminSyncDuration, metricsRegistry } = await import("./metrics");
+      garminSyncDuration.observe(1.5);
+
+      const metrics = await metricsRegistry.metrics();
+      expect(metrics).toContain(
+        "phaseflow_garmin_sync_duration_seconds_bucket",
+      );
+      expect(metrics).toContain(
+        "phaseflow_garmin_sync_duration_seconds_sum 1.5",
+      );
+      expect(metrics).toContain(
+        "phaseflow_garmin_sync_duration_seconds_count 1",
+      );
+    });
+  });
+
+  describe("custom metrics - email", () => {
+    it("exports phaseflow_email_sent_total counter", async () => {
+      const { emailSentTotal } = await import("./metrics");
+      expect(emailSentTotal).toBeDefined();
+      expect(emailSentTotal.inc).toBeDefined();
+    });
+
+    it("increments email sent total with daily type", async () => {
+      const { emailSentTotal, metricsRegistry } = await import("./metrics");
+      emailSentTotal.inc({ type: "daily" });
+
+      const metrics = await metricsRegistry.metrics();
+      expect(metrics).toContain('phaseflow_email_sent_total{type="daily"} 1');
+    });
+
+    it("increments email sent total with warning type", async () => {
+      const { emailSentTotal, metricsRegistry } = await import("./metrics");
+      emailSentTotal.inc({ type: "warning" });
+
+      const metrics = await metricsRegistry.metrics();
+      expect(metrics).toContain('phaseflow_email_sent_total{type="warning"} 1');
+    });
+  });
+
+  describe("custom metrics - decision engine", () => {
+    it("exports phaseflow_decision_engine_calls_total counter", async () => {
+      const { decisionEngineCallsTotal } = await import("./metrics");
+      expect(decisionEngineCallsTotal).toBeDefined();
+      expect(decisionEngineCallsTotal.inc).toBeDefined();
+    });
+
+    it("increments decision engine calls with decision label", async () => {
+      const { decisionEngineCallsTotal, metricsRegistry } = await import(
+        "./metrics"
+      );
+      decisionEngineCallsTotal.inc({ decision: "REST" });
+
+      const metrics = await metricsRegistry.metrics();
+      expect(metrics).toContain(
+        'phaseflow_decision_engine_calls_total{decision="REST"} 1',
+      );
+    });
+
+    it("tracks multiple decision types", async () => {
+      const { decisionEngineCallsTotal, metricsRegistry } = await import(
+        "./metrics"
+      );
+      decisionEngineCallsTotal.inc({ decision: "REST" });
+      decisionEngineCallsTotal.inc({ decision: "REST" });
+      decisionEngineCallsTotal.inc({ decision: "GENTLE" });
+      decisionEngineCallsTotal.inc({ decision: "GO" });
+
+      const metrics = await metricsRegistry.metrics();
+      expect(metrics).toContain(
+        'phaseflow_decision_engine_calls_total{decision="REST"} 2',
+      );
+      expect(metrics).toContain(
+        'phaseflow_decision_engine_calls_total{decision="GENTLE"} 1',
+      );
+      expect(metrics).toContain(
+        'phaseflow_decision_engine_calls_total{decision="GO"} 1',
+      );
+    });
+  });
+
+  describe("custom metrics - active users", () => {
+    it("exports phaseflow_active_users gauge", async () => {
+      const { activeUsersGauge } = await import("./metrics");
+      expect(activeUsersGauge).toBeDefined();
+      expect(activeUsersGauge.set).toBeDefined();
+    });
+
+    it("sets active users count", async () => {
+      const { activeUsersGauge, metricsRegistry } = await import("./metrics");
+      activeUsersGauge.set(42);
+
+      const metrics = await metricsRegistry.metrics();
+      expect(metrics).toContain("phaseflow_active_users 42");
+    });
+
+    it("can increment and decrement active users gauge", async () => {
+      const { activeUsersGauge, metricsRegistry } = await import("./metrics");
+      activeUsersGauge.set(10);
+      activeUsersGauge.inc();
+      activeUsersGauge.dec();
+
+      const metrics = await metricsRegistry.metrics();
+      expect(metrics).toContain("phaseflow_active_users 10");
+    });
+  });
+
+  describe("metrics format", () => {
+    it("produces valid Prometheus text format", async () => {
+      const { metricsRegistry, garminSyncTotal, emailSentTotal } = await import(
+        "./metrics"
+      );
+      garminSyncTotal.inc({ status: "success" });
+      emailSentTotal.inc({ type: "daily" });
+
+      const metrics = await metricsRegistry.metrics();
+
+      // Should contain TYPE and HELP comments for custom metrics
+      expect(metrics).toContain("# TYPE phaseflow_garmin_sync_total counter");
+      expect(metrics).toContain("# HELP phaseflow_garmin_sync_total");
+      expect(metrics).toContain("# TYPE phaseflow_email_sent_total counter");
+      expect(metrics).toContain("# HELP phaseflow_email_sent_total");
+    });
+
+    it("returns content type for Prometheus", async () => {
+      const { metricsRegistry } = await import("./metrics");
+      expect(metricsRegistry.contentType).toBe(
+        "text/plain; version=0.0.4; charset=utf-8",
+      );
+    });
+  });
+});
diff --git a/src/lib/metrics.ts b/src/lib/metrics.ts
new file mode 100644
index 0000000..d1a5f4d
--- /dev/null
+++ b/src/lib/metrics.ts
@@ -0,0 +1,49 @@
+// ABOUTME: Prometheus metrics collection module for production monitoring.
+// ABOUTME: Exposes counters, gauges, and histograms for Garmin sync, email, and decision engine.
+
+import * as promClient from "prom-client";
+
+// Create a new registry for our application metrics
+export const metricsRegistry = new promClient.Registry();
+
+// Collect default Node.js metrics (heap, event loop, etc.)
+promClient.collectDefaultMetrics({ register: metricsRegistry });
+
+// Custom metric: Garmin sync operations counter
+export const garminSyncTotal = new promClient.Counter({
+  name: "phaseflow_garmin_sync_total",
+  help: "Total number of Garmin sync operations",
+  labelNames: ["status"] as const,
+  registers: [metricsRegistry],
+});
+
+// Custom metric: Garmin sync duration histogram
+export const garminSyncDuration = new promClient.Histogram({
+  name: "phaseflow_garmin_sync_duration_seconds",
+  help: "Duration of Garmin sync operations in seconds",
+  buckets: [0.1, 0.5, 1, 2, 5, 10],
+  registers: [metricsRegistry],
+});
+
+// Custom metric: Email sent counter
+export const emailSentTotal = new promClient.Counter({
+  name: "phaseflow_email_sent_total",
+  help: "Total number of emails sent",
+  labelNames: ["type"] as const,
+  registers: [metricsRegistry],
+});
+
+// Custom metric: Decision engine calls counter
+export const decisionEngineCallsTotal = new promClient.Counter({
+  name: "phaseflow_decision_engine_calls_total",
+  help: "Total number of decision engine calls",
+  labelNames: ["decision"] as const,
+  registers: [metricsRegistry],
+});
+
+// Custom metric: Active users gauge
+export const activeUsersGauge = new promClient.Gauge({
+  name: "phaseflow_active_users",
+  help: "Number of users with activity in the last 24 hours",
+  registers: [metricsRegistry],
+});