unitforge/scripts/health_check.sh

#!/bin/bash
# Health check script for UnitForge CI/CD workflows
# Tests basic functionality of the running application

set -e

# Configuration: a value already present in the environment is kept;
# the default listed here is applied only when the variable is unset or empty.
: "${HOST:=localhost}"
: "${PORT:=8000}"
: "${TIMEOUT:=30}"
: "${MAX_RETRIES:=5}"
: "${RETRY_DELAY:=2}"

# ANSI color sequences, stored as literal backslash escapes that are
# expanded only at the moment the log helpers print them.
RED="\033[0;31m"
GREEN="\033[0;32m"
YELLOW="\033[1;33m"
NC="\033[0m" # No Color

# Helper functions
# Print an informational message to stdout, prefixed with a green [INFO] tag.
# Globals:   GREEN, NC (read) - color escape sequences
# Arguments: $1 - message text; printed verbatim (backslashes are NOT expanded)
log_info() {
    # %b expands the escapes stored in the color variables; %s keeps the
    # caller's message literal so paths or URLs containing '\' are not mangled.
    printf '%b[INFO]%b %s\n' "$GREEN" "$NC" "$1"
}

# Print a warning message to stdout, prefixed with a yellow [WARN] tag.
# Globals:   YELLOW, NC (read) - color escape sequences
# Arguments: $1 - message text; printed verbatim (backslashes are NOT expanded)
log_warn() {
    # %b expands the escapes stored in the color variables; %s keeps the
    # caller's message literal so paths or URLs containing '\' are not mangled.
    printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "$1"
}

# Print an error message to stdout, prefixed with a red [ERROR] tag.
# Globals:   RED, NC (read) - color escape sequences
# Arguments: $1 - message text; printed verbatim (backslashes are NOT expanded)
log_error() {
    # %b expands the escapes stored in the color variables; %s keeps the
    # caller's message literal so paths or URLs containing '\' are not mangled.
    printf '%b[ERROR]%b %s\n' "$RED" "$NC" "$1"
}

# Poll the /health endpoint until it answers successfully or the retry
# budget is used up.
# Globals:   HOST, PORT, TIMEOUT, MAX_RETRIES, RETRY_DELAY (read)
# Returns:   0 as soon as one request succeeds, 1 after MAX_RETRIES failures
check_health() {
    local endpoint="http://${HOST}:${PORT}/health"
    local attempt=0

    log_info "Checking health endpoint: $endpoint"

    while [ "$attempt" -lt "$MAX_RETRIES" ]; do
        attempt=$((attempt + 1))

        if curl -s -f --max-time "$TIMEOUT" "$endpoint" > /dev/null 2>&1; then
            log_info "Health check passed"
            return 0
        fi

        log_warn "Health check failed (attempt $attempt/$MAX_RETRIES)"

        # No pause after the final attempt: go straight to the loop test.
        [ "$attempt" -lt "$MAX_RETRIES" ] || continue

        log_info "Retrying in ${RETRY_DELAY} seconds..."
        sleep "$RETRY_DELAY"
    done

    log_error "Health check failed after $MAX_RETRIES attempts"
    return 1
}

# Request the application's root page and verify that it answers HTTP 200.
# Globals:   HOST, PORT, TIMEOUT (read)
# Returns:   0 when the status code is 200, 1 for any other outcome
check_main_page() {
    local url="http://${HOST}:${PORT}/"
    log_info "Checking main page: $url"

    # Only the status code is needed, so the body is discarded with
    # -o /dev/null instead of being buffered in a shell variable.
    # '|| true' keeps going when curl itself fails (e.g. connection refused),
    # so the failure is reported below rather than aborting under 'set -e'.
    local http_code
    http_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time "$TIMEOUT" "$url") || true

    # curl prints 000 when no response was received; use the same marker
    # if it produced no output at all.
    http_code=${http_code:-000}

    if [ "$http_code" = "200" ]; then
        log_info "Main page check passed (HTTP $http_code)"
        return 0
    else
        log_error "Main page check failed (HTTP $http_code)"
        return 1
    fi
}

# Probe the optional API endpoints (/api/health and /api/version).
# A failing endpoint is only logged as a warning.
# Globals:   HOST, PORT, TIMEOUT (read)
# Returns:   always 0
check_api() {
    local api_root="http://${HOST}:${PORT}/api"
    local endpoint

    log_info "Checking API endpoints"

    # Both endpoints are probed the same way; only the name differs.
    for endpoint in health version; do
        if curl -s -f --max-time "$TIMEOUT" "${api_root}/${endpoint}" > /dev/null 2>&1; then
            log_info "API ${endpoint} endpoint passed"
        else
            log_warn "API ${endpoint} endpoint failed or not available"
        fi
    done

    return 0
}

# Request each known static asset and report which ones are unavailable.
# Missing assets are logged as warnings only.
# Globals:   HOST, PORT, TIMEOUT (read)
# Returns:   always 0 (missing assets never fail the health check)
check_static_assets() {
    log_info "Checking static assets"

    local assets=(
        "/static/css/style.css"
        "/static/js/app.js"
        "/static/vendor/bootstrap/css/bootstrap.min.css"
        "/static/vendor/fontawesome/css/all.min.css"
    )

    local failed_assets=0
    # Declared local so the loop variables do not leak into (or clobber)
    # the caller's global scope.
    local asset url

    for asset in "${assets[@]}"; do
        url="http://${HOST}:${PORT}${asset}"
        if curl -s -f --max-time "$TIMEOUT" "$url" > /dev/null 2>&1; then
            log_info "Asset check passed: $asset"
        else
            log_warn "Asset check failed: $asset"
            failed_assets=$((failed_assets + 1))
        fi
    done

    if [ "$failed_assets" -eq 0 ]; then
        log_info "All static assets available"
    else
        log_warn "$failed_assets static assets failed to load"
    fi

    return 0  # Don't fail health check for missing assets
}

# Measure how long the root page takes to respond and warn when it is slow.
# Globals:   HOST, PORT, TIMEOUT (read)
# Returns:   1 when the request fails outright, 0 otherwise (a slow
#            response is reported as a warning, not a failure)
check_performance() {
    log_info "Running basic performance test"

    local url="http://${HOST}:${PORT}/"
    local response_time

    # One request provides both the timing and the success/failure status,
    # so the reported time always belongs to the request that was judged.
    if ! response_time=$(curl -s -w "%{time_total}" --max-time "$TIMEOUT" -o /dev/null "$url" 2> /dev/null); then
        log_error "Performance check failed: no response"
        return 1
    fi

    log_info "Response time: ${response_time}s"

    # Check if response time is reasonable (< 5 seconds)
    local within_limit=""
    if command -v bc > /dev/null 2>&1; then
        within_limit=$(echo "$response_time < 5.0" | bc -l 2> /dev/null) || within_limit=""
    else
        # Without bc, compare the whole-second part: a non-negative decimal
        # is below 5.0 exactly when its integer part is below 5.
        local whole_seconds="${response_time%%.*}"
        if [[ "$whole_seconds" =~ ^[0-9]+$ ]] && (( 10#$whole_seconds < 5 )); then
            within_limit=1
        fi
    fi

    if [ "$within_limit" = "1" ]; then
        log_info "Performance check passed"
    else
        log_warn "Performance check warning: slow response time (${response_time}s)"
    fi

    return 0  # Don't fail health check for slow response
}

# Report the memory usage of the container named by CONTAINER_NAME.
# Does nothing when docker is not available or no container name is set.
# Globals:   CONTAINER_NAME (read)
check_memory() {
    # Guard clauses: both preconditions must hold, otherwise there is nothing to check.
    command -v docker > /dev/null 2>&1 || return 0
    [ -n "$CONTAINER_NAME" ] || return 0

    log_info "Checking container memory usage"

    # The text before the first '/' of the MemUsage column is what gets reported.
    local mem_used
    mem_used=$(docker stats "$CONTAINER_NAME" --no-stream --format "{{.MemUsage}}" | cut -d'/' -f1)

    if [ -z "$mem_used" ]; then
        log_warn "Could not determine memory usage"
    else
        log_info "Memory usage: $mem_used"
    fi
}

# Block until the root page answers, polling every 5 seconds for at most
# 60 seconds.
# Globals:   HOST, PORT (read)
# Returns:   0 once a request succeeds, 1 when the time budget is exhausted
wait_for_startup() {
    local -r budget=60
    local -r step=5
    local waited=0

    log_info "Waiting for application to start..."

    while [ "$waited" -lt "$budget" ]; do
        if curl -s --max-time 5 "http://${HOST}:${PORT}/" > /dev/null 2>&1; then
            log_info "Application is responding"
            return 0
        fi

        sleep "$step"
        waited=$((waited + step))
        log_info "Waiting... (${waited}s/${budget}s)"
    done

    log_error "Application failed to start within ${budget} seconds"
    return 1
}

# Orchestrate the full health check: wait for the service if it is not yet
# answering, run every individual check, then print a summary.
# Globals:   HOST, PORT (read)
# Returns:   0 when all counted checks pass; 1 when startup times out or
#            at least one counted check fails
run_health_check() {
    local target="http://${HOST}:${PORT}"
    local failures=0
    local probe

    log_info "Starting UnitForge health check"
    log_info "Target: ${target}"

    # Only enter the startup wait loop when a first quick probe gets no answer.
    if ! curl -s --max-time 5 "${target}/" > /dev/null 2>&1; then
        wait_for_startup || return 1
    fi

    # Checks whose failure counts against the overall result.
    for probe in check_health check_main_page check_api; do
        "$probe" || failures=$((failures + 1))
    done

    # Advisory checks: static assets, performance and memory never affect the result.
    for probe in check_static_assets check_performance check_memory; do
        "$probe" || true
    done

    # Summary
    if [ "$failures" -ne 0 ]; then
        log_error "❌ $failures health checks failed"
        return 1
    fi

    log_info "✅ All health checks passed"
    return 0
}

# Print the command-line help text (options, environment variables and
# examples) to stdout. $0 is substituted as the program name.
usage() {
    # One printf call; every argument becomes one output line.
    printf '%s\n' \
        "Usage: $0 [OPTIONS]" \
        "" \
        "Options:" \
        "  -h, --host HOST         Target host (default: localhost)" \
        "  -p, --port PORT         Target port (default: 8000)" \
        "  -t, --timeout TIMEOUT   Request timeout in seconds (default: 30)" \
        "  -r, --retries RETRIES   Maximum retry attempts (default: 5)" \
        "  -d, --delay DELAY       Retry delay in seconds (default: 2)" \
        "  -c, --container NAME    Container name for memory checks" \
        "  --help                  Show this help message" \
        "" \
        "Environment variables:" \
        "  HOST                    Same as --host" \
        "  PORT                    Same as --port" \
        "  TIMEOUT                 Same as --timeout" \
        "  MAX_RETRIES             Same as --retries" \
        "  RETRY_DELAY             Same as --delay" \
        "  CONTAINER_NAME          Same as --container" \
        "" \
        "Examples:" \
        "  $0                                    # Check localhost:8000" \
        "  $0 -h production.example.com -p 80   # Check production server" \
        "  $0 -c unitforge-container             # Include container memory check"
}

# Parse command line arguments

# Abort with an explanatory message when an option has no value after it.
# Without this guard, 'shift 2' fails on a trailing option and, with
# 'set -e' active, the script exits without telling the user why.
# Arguments: $1 - the option being parsed
#            $2 - number of arguments still unparsed (the option included)
_require_value() {
    if [ "$2" -lt 2 ]; then
        echo "Option $1 requires an argument"
        usage
        exit 1
    fi
}

# Apply command-line options on top of the environment-derived settings.
# Globals:   HOST, PORT, TIMEOUT, MAX_RETRIES, RETRY_DELAY, CONTAINER_NAME (written)
# Arguments: the script's command-line arguments
# Exits:     0 after printing help for --help; 1 on an unknown option or
#            an option that is missing its value
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            -h|--host)
                _require_value "$1" "$#"
                HOST="$2"
                shift 2
                ;;
            -p|--port)
                _require_value "$1" "$#"
                PORT="$2"
                shift 2
                ;;
            -t|--timeout)
                _require_value "$1" "$#"
                TIMEOUT="$2"
                shift 2
                ;;
            -r|--retries)
                _require_value "$1" "$#"
                MAX_RETRIES="$2"
                shift 2
                ;;
            -d|--delay)
                _require_value "$1" "$#"
                RETRY_DELAY="$2"
                shift 2
                ;;
            -c|--container)
                _require_value "$1" "$#"
                CONTAINER_NAME="$2"
                shift 2
                ;;
            --help)
                usage
                exit 0
                ;;
            *)
                echo "Unknown option: $1"
                usage
                exit 1
                ;;
        esac
    done
}

parse_args "$@"

# Check dependencies: curl is mandatory, bc only triggers a warning
command -v curl > /dev/null 2>&1 || {
    log_error "curl is required but not installed"
    exit 1
}

command -v bc > /dev/null 2>&1 \
    || log_warn "bc is not installed, performance timing may not work properly"

# Run the health check; its status becomes the script's exit status.
# The call is deliberately left bare (not part of an && / || list) so that
# 'set -e' remains in effect inside the function.
run_health_check
exit "$?"