---
name: uptime-monitoring
description: Implement uptime monitoring and status page systems for tracking service availability. Use when monitoring application uptime, creating status pages, or implementing health checks.
---

# Uptime Monitoring

## Overview

Set up comprehensive uptime monitoring with health checks, status pages, and incident tracking to ensure visibility into service availability.

## When to Use

- Service availability tracking
- Health check implementation
- Status page creation
- Incident management
- SLA monitoring

## Instructions

### 1. **Health Check Endpoints**

```javascript
// Node.js health check
const express = require('express');
const app = express();

// Shallow health check: reports that the process is serving requests,
// together with the current time and the process uptime in seconds.
app.get('/health', (req, res) => {
  const payload = {
    status: 'ok',
    timestamp: new Date().toISOString(),
    uptime: process.uptime()
  };

  res.json(payload);
});

// Deep health check: probes the database, the cache and the external API in
// turn. Each probe is best-effort: a thrown error is recorded as 'error'
// rather than failing the request. The response is 200 with status 'ok' only
// when every probe reports 'ok'; otherwise status is 'degraded' and the
// response code is 503.
app.get('/health/deep', async (req, res) => {
  const checks = {
    database: 'unknown',
    cache: 'unknown',
    externalApi: 'unknown'
  };

  try {
    const dbResult = await db.query('SELECT 1');
    checks.database = dbResult ? 'ok' : 'error';
  } catch {
    checks.database = 'error';
  }

  try {
    const cacheResult = await redis.ping();
    checks.cache = cacheResult === 'PONG' ? 'ok' : 'error';
  } catch {
    checks.cache = 'error';
  }

  try {
    // Bound the outbound call so a hung upstream cannot stall the health
    // check; a timeout rejects the fetch and is recorded as 'error'.
    const response = await fetch('https://api.example.com/health', {
      signal: AbortSignal.timeout(3000)
    });
    checks.externalApi = response.ok ? 'ok' : 'error';
  } catch {
    checks.externalApi = 'error';
  }

  // Derive the overall status from the probe results so that any failed
  // dependency (not just a database exception) is reflected in the status code.
  const allOk = Object.values(checks).every((result) => result === 'ok');
  const health = { status: allOk ? 'ok' : 'degraded', checks };

  res.status(allOk ? 200 : 503).json(health);
});

// Readiness check: the instance is ready only when the database answers the
// probe query and the cache replies 'PONG'. Any failure yields a 503.
app.get('/readiness', async (req, res) => {
  const replyNotReady = () => res.status(503).json({ ready: false });

  try {
    const dbAnswer = await db.query('SELECT 1');
    const cacheAnswer = await redis.ping();
    const dependenciesUp = Boolean(dbAnswer) && cacheAnswer === 'PONG';

    if (!dependenciesUp) {
      replyNotReady();
      return;
    }

    res.json({ ready: true });
  } catch {
    replyNotReady();
  }
});

// Liveness check: touches no dependencies and simply confirms that the
// process can answer an HTTP request.
app.get('/liveness', (req, res) => {
  const body = { alive: true };
  res.json(body);
});
```

### 2. **Python Health Checks**

```python
from flask import Flask, jsonify
import time

app = Flask(__name__)
# Wall-clock timestamp captured once when this module is loaded.
startup_time = time.time()

def get_uptime():
    """Return the whole number of seconds elapsed since ``startup_time``."""
    elapsed_seconds = time.time() - startup_time
    return int(elapsed_seconds)

@app.route('/health')
def health():
    """Shallow health check: always answers 200 with the current uptime."""
    payload = {
        'status': 'ok',
        'uptime_seconds': get_uptime()
    }
    return jsonify(payload), 200

@app.route('/health/deep')
def health_deep():
    """Deep health check covering the database and the cache.

    Each probe is best-effort: an exception is recorded as ``'error'`` for
    that dependency instead of failing the request. The response is 200 with
    status ``'ok'`` only when every probe succeeded; otherwise the status is
    ``'degraded'`` and the response code is 503.
    """
    checks = {
        'database': 'unknown',
        'cache': 'unknown'
    }

    try:
        # NOTE(review): if `db` is Flask-SQLAlchemy on SQLAlchemy 2.x, a plain
        # string must be wrapped in sqlalchemy.text() — confirm the version in use.
        db.session.execute('SELECT 1')
        checks['database'] = 'ok'
    except Exception:
        # `Exception` rather than a bare except, so SystemExit and
        # KeyboardInterrupt are not swallowed by the probe.
        checks['database'] = 'error'

    try:
        cache.get('_health')
        checks['cache'] = 'ok'
    except Exception:
        checks['cache'] = 'error'

    # Derive the overall status from the probe results so that a cache failure
    # degrades the response just like a database failure does.
    all_ok = all(result == 'ok' for result in checks.values())
    health_status = {
        'status': 'ok' if all_ok else 'degraded',
        'checks': checks
    }

    status_code = 200 if all_ok else 503
    return jsonify(health_status), status_code

@app.route('/readiness')
def readiness():
    """Readiness check: 200 when the database probe query runs, 503 otherwise."""
    try:
        db.session.execute('SELECT 1')
    except Exception:
        # `Exception` rather than a bare except, so SystemExit and
        # KeyboardInterrupt still propagate.
        return jsonify({'ready': False}), 503
    return jsonify({'ready': True}), 200
```

### 3. **Uptime Monitor with Heartbeat**

```javascript
// heartbeat.js
const axios = require('axios');

/**
 * Periodically probes a list of HTTP endpoints and records every result.
 *
 * Expects `axios` and a `db` object exposing `query(sql, params)` to be in
 * scope in the enclosing module.
 */
class UptimeMonitor {
  /**
   * @param {object} [config]
   * @param {number} [config.checkInterval=60000] Milliseconds between rounds of checks.
   * @param {number} [config.timeout=5000] Per-request timeout in milliseconds.
   * @param {Array<{name: string, url: string}>} [config.endpoints=[]] Endpoints to probe.
   */
  constructor(config = {}) {
    // `||` means any falsy value (including 0) falls back to the default.
    this.checkInterval = config.checkInterval || 60000;
    this.timeout = config.timeout || 5000;
    this.endpoints = config.endpoints || [];
    // Timer handle while the monitor is running, null otherwise.
    this.interval = null;
  }

  /**
   * Probe one endpoint, persist the outcome and return it.
   *
   * A 2xx response within the timeout counts as 'up'; any thrown error
   * (non-2xx status, timeout, network failure) counts as 'down'.
   *
   * @param {{name: string, url: string}} endpoint
   * @returns {Promise<object>} The check record; `error` is set only when down.
   */
  async checkEndpoint(endpoint) {
    const startTime = Date.now();
    let isUp = true;
    let errorMessage;

    try {
      await axios.get(endpoint.url, {
        timeout: this.timeout,
        validateStatus: (s) => s >= 200 && s < 300
      });
    } catch (error) {
      isUp = false;
      errorMessage = error?.message ?? String(error);
    }

    const check = {
      endpoint: endpoint.name,
      status: isUp ? 'up' : 'down',
      responseTime: Date.now() - startTime,
      timestamp: new Date()
    };
    if (!isUp) {
      check.error = errorMessage;
    }

    await this.saveCheck(check);
    return check;
  }

  /**
   * Insert a check record into `uptime_checks`.
   *
   * Persistence is best-effort: a failed insert is logged and not rethrown,
   * so a storage problem does not abort the round of checks.
   *
   * @param {object} check Record produced by `checkEndpoint`.
   * @returns {Promise<void>}
   */
  async saveCheck(check) {
    try {
      await db.query(
        'INSERT INTO uptime_checks (endpoint, status, response_time, timestamp) VALUES (?, ?, ?, ?)',
        [check.endpoint, check.status, check.responseTime, check.timestamp]
      );
    } catch (error) {
      console.error('Failed to save check:', error);
    }
  }

  /**
   * Probe all configured endpoints concurrently.
   *
   * @returns {Promise<object[]>} One check record per endpoint, in configuration order.
   */
  async runChecks() {
    return Promise.all(
      this.endpoints.map(e => this.checkEndpoint(e))
    );
  }

  /**
   * Run one round immediately, then one every `checkInterval` milliseconds.
   * Calling `start()` while already running is a no-op, so an earlier timer
   * is never overwritten and leaked.
   */
  start() {
    if (this.interval) return;

    const runRound = () => {
      // Handle rejections here so a failed round never becomes an unhandled rejection.
      this.runChecks().catch((error) => {
        console.error('Uptime check round failed:', error);
      });
    };

    runRound();
    this.interval = setInterval(runRound, this.checkInterval);
  }

  /** Stop the periodic checks; safe to call when not running. */
  stop() {
    if (this.interval) {
      clearInterval(this.interval);
      this.interval = null;
    }
  }

  /**
   * Aggregate check statistics for one endpoint over a recent window.
   *
   * @param {string} endpoint Endpoint name as stored in `uptime_checks`.
   * @param {number} [hours=24] Size of the look-back window in hours.
   * @returns {Promise<object|undefined>} Row with `total_checks`,
   *   `uptime_checks` and `avg_response_time`.
   */
  async getStats(endpoint, hours = 24) {
    const result = await db.query(`
      SELECT
        COUNT(*) as total_checks,
        SUM(CASE WHEN status = 'up' THEN 1 ELSE 0 END) as uptime_checks,
        AVG(response_time) as avg_response_time
      FROM uptime_checks
      WHERE endpoint = ? AND timestamp > DATE_SUB(NOW(), INTERVAL ? HOUR)
    `, [endpoint, hours]);

    // The db client's result shape is not visible here: accept both a
    // [rows, fields] tuple and a plain array of rows.
    const rows = Array.isArray(result?.[0]) ? result[0] : result;
    return rows?.[0];
  }
}

module.exports = UptimeMonitor;
```

### 4. **Public Status Page API**

```javascript
// status-page-api.js
const express = require('express');
const router = express.Router();

// GET /api/status: lists every endpoint seen in uptime_checks as a component
// whose status comes from its most recent check, plus an overall status that
// is 'all_operational' only when every component is operational.
router.get('/api/status', async (req, res) => {
  try {
    const endpoints = await db.query(`
      SELECT DISTINCT endpoint FROM uptime_checks
    `);

    const updatedAt = new Date().toISOString();

    // The per-endpoint lookups are independent, so issue them concurrently
    // instead of awaiting each one in turn. Promise.all preserves order.
    const components = await Promise.all(
      Array.from(endpoints, async ({ endpoint }) => {
        const [lastCheck] = await db.query(`
        SELECT status FROM uptime_checks
        WHERE endpoint = ? ORDER BY timestamp DESC LIMIT 1
      `, [endpoint]);

        return {
          id: endpoint,
          name: endpoint,
          // An endpoint with no recorded check is reported as an outage.
          status: lastCheck?.status === 'up' ? 'operational' : 'major_outage'
        };
      })
    );

    const allUp = components.every(c => c.status === 'operational');

    res.json({
      page: { name: 'My Service Status', updated_at: updatedAt },
      components,
      status: {
        overall: allUp ? 'all_operational' : 'major_outage'
      }
    });
  } catch (error) {
    // Log the cause; the client only receives a generic message.
    console.error('Failed to fetch status:', error);
    res.status(500).json({ error: 'Failed to fetch status' });
  }
});

// GET /api/status/uptime/:endpoint: per-day totals and 'up' counts for the
// named endpoint over the last 30 days, newest day first.
router.get('/api/status/uptime/:endpoint', async (req, res) => {
  try {
    const stats = await db.query(`
      SELECT
        DATE(timestamp) as date,
        COUNT(*) as total,
        SUM(CASE WHEN status = 'up' THEN 1 ELSE 0 END) as uptime
      FROM uptime_checks
      WHERE endpoint = ? AND timestamp > DATE_SUB(NOW(), INTERVAL 30 DAY)
      GROUP BY DATE(timestamp)
      ORDER BY date DESC
    `, [req.params.endpoint]);

    res.json(stats);
  } catch (error) {
    // Log the cause; the client only receives a generic message.
    console.error('Failed to fetch statistics:', error);
    res.status(500).json({ error: 'Failed to fetch statistics' });
  }
});

module.exports = router;
```

### 5. **Kubernetes Health Probes**

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: api-service
spec:
  # apps/v1 requires a selector, and it must match the pod template labels.
  selector:
    matchLabels:
      app: api-service
  template:
    metadata:
      labels:
        app: api-service
    spec:
      containers:
      - name: api-service
        image: api-service:latest

        # Startup: allows up to 30 x 10s = 300s for the app to answer /health.
        # Liveness and readiness probes do not run until this succeeds.
        startupProbe:
          httpGet:
            path: /health
            port: 3000
          initialDelaySeconds: 0
          periodSeconds: 10
          failureThreshold: 30

        # Readiness: after 3 consecutive failures (about 15s) the pod is
        # removed from Service endpoints; it is not restarted.
        readinessProbe:
          httpGet:
            path: /readiness
            port: 3000
          initialDelaySeconds: 5
          periodSeconds: 5
          failureThreshold: 3

        # Liveness: after 3 consecutive failures (about 60s) the container
        # is restarted.
        livenessProbe:
          httpGet:
            path: /liveness
            port: 3000
          initialDelaySeconds: 15
          periodSeconds: 20
          failureThreshold: 3
```

## Best Practices

### ✅ DO
- Implement comprehensive health checks
- Check all critical dependencies
- Use appropriate timeout values
- Track response times
- Store check history
- Monitor uptime trends
- Alert on status changes
- Use standard HTTP status codes

### ❌ DON'T
- Check only the application process
- Ignore external dependencies
- Set timeouts too low
- Alert on every failure
- Use deep health checks for load balancing (route traffic on the lightweight readiness endpoint instead)
- Expose sensitive information

## SLA Compliance Calculation

```javascript
/**
 * Compute the uptime percentage and whether it meets common SLA targets.
 *
 * @param {number} upChecks Number of checks that reported 'up'.
 * @param {number} totalChecks Total number of checks in the period.
 * @returns {{uptime_percentage: string, meets_99_9: boolean, meets_99_99: boolean}}
 *   `uptime_percentage` is formatted with four decimal places. When there are
 *   no checks, uptime is reported as '0.0000' and no target counts as met.
 */
function calculateSLA(upChecks, totalChecks) {
  // Guard the division: an empty (or invalid) sample would otherwise yield "NaN".
  if (!(totalChecks > 0)) {
    return {
      uptime_percentage: '0.0000',
      meets_99_9: false,
      meets_99_99: false
    };
  }

  const uptime = (upChecks / totalChecks) * 100;
  return {
    uptime_percentage: uptime.toFixed(4),
    // Compare cross-multiplied counts instead of the floating-point
    // percentage, so results exactly on a threshold are judged correctly.
    meets_99_9: upChecks * 1000 >= totalChecks * 999,
    meets_99_99: upChecks * 10000 >= totalChecks * 9999
  };
}
```