#!/bin/bash
#
# Scheduler Monitoring Script
# Checks service health, job status, and system resources
#
# Configuration (environment overrides, defaults in the section below):
#   LOG_FILE          file every report line is appended to
#   DB_PATH           path of the scheduler SQLite database
#   SERVICE_NAME      systemd unit that is probed and restarted
#   ALERT_EMAIL       recipient of alert mail (empty disables mail)
#   DOCKER_CONTAINER  container name used by the Docker fallback probe
#   COMPOSE_FILE      compose file used for Docker restart and log access
#
# Exit status: 1 when at least one alert was raised, 0 otherwise.
#

set -e

# Configuration
LOG_FILE="${LOG_FILE:-/var/log/scheduler-monitor.log}"
DB_PATH="${DB_PATH:-/opt/scheduler/scheduler.db}"
SERVICE_NAME="${SERVICE_NAME:-scheduler}"
ALERT_EMAIL="${ALERT_EMAIL:-}"
DOCKER_CONTAINER="${DOCKER_CONTAINER:-movie_scheduler}"
COMPOSE_FILE="${COMPOSE_FILE:-/opt/scheduler/docker-compose.prod.yml}"
DISK_THRESHOLD=90

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

TIMESTAMP=$(date '+%Y-%m-%d %H:%M:%S')

# Alert messages collected by the checks; drives the mail and the exit status.
ALERTS=()

# Print a plain message to stdout and append it to $LOG_FILE.
# Arguments: $* - message text (printed literally, no escape processing)
log() {
  printf '%s\n' "$*" | tee -a "$LOG_FILE"
}

# Print a message wrapped in a color escape and append it to $LOG_FILE.
# Arguments: $1 - color escape sequence (RED/GREEN/YELLOW)
#            $2.. - message text
log_color() {
  local color=$1
  shift
  printf '%b%s%b\n' "$color" "$*" "$NC" | tee -a "$LOG_FILE"
}

# Run a single-value SQL query against $DB_PATH and print the result.
# Arguments: $1 - SQL statement expected to yield one non-negative integer
# Returns:   0 and prints the number; 1 (printing nothing) when sqlite3
#            fails or the output is not a non-negative integer.
db_count() {
  local value
  value=$(sqlite3 "$DB_PATH" "$1" 2>/dev/null) || return 1
  [[ "$value" =~ ^[0-9]+$ ]] || return 1
  printf '%s\n' "$value"
}

# Check if service is running, first via systemd, then via Docker.
# When neither reports it running, records an alert and attempts a restart.
# Returns: 0 when running, 1 when the service was found down.
check_service() {
  if systemctl is-active --quiet "$SERVICE_NAME" 2>/dev/null; then
    log_color "$GREEN" "✓ Service is running"
    return 0
  fi

  # stderr is discarded because docker may not be installed on systemd hosts.
  if docker ps --filter "name=$DOCKER_CONTAINER" --format "{{.Status}}" 2>/dev/null |
    grep -q "Up"; then
    log_color "$GREEN" "✓ Docker container is running"
    return 0
  fi

  log_color "$RED" "✗ Service is DOWN"
  ALERTS+=("Service is DOWN")

  # Try to restart; best effort, the alert above is raised regardless.
  log "Attempting to restart service..."
  systemctl start "$SERVICE_NAME" 2>/dev/null ||
    docker compose -f "$COMPOSE_FILE" restart 2>/dev/null ||
    true

  return 1
}

# Check database: presence, size on disk and job counts per status.
# Raises alerts for a missing database, failing queries and failed jobs.
# Returns: 0 when the database could be inspected, 1 otherwise.
check_database() {
  local db_size total pending_prep pending_play streaming failed

  if [[ ! -f "$DB_PATH" ]]; then
    log_color "$RED" "✗ Database not found"
    ALERTS+=("Database file not found")
    return 1
  fi

  # Database size
  db_size=$(du -h "$DB_PATH" | cut -f1)
  log "Database size: $db_size"

  # Job counts. Any failing query (sqlite3 missing, locked or corrupt file,
  # missing table) is surfaced as an alert instead of aborting the script.
  if ! total=$(db_count "SELECT COUNT(*) FROM jobs;") ||
    ! pending_prep=$(db_count "SELECT COUNT(*) FROM jobs WHERE prep_status='pending';") ||
    ! pending_play=$(db_count "SELECT COUNT(*) FROM jobs WHERE play_status='pending';") ||
    ! streaming=$(db_count "SELECT COUNT(*) FROM jobs WHERE play_status='streaming';") ||
    ! failed=$(db_count "SELECT COUNT(*) FROM jobs WHERE prep_status='failed' OR play_status='failed';"); then
    log_color "$RED" "✗ Database query failed"
    ALERTS+=("Database query failed")
    return 1
  fi

  log "Jobs - Total: $total, Pending prep: $pending_prep, Pending play: $pending_play, Streaming: $streaming, Failed: $failed"

  if ((failed > 0)); then
    log_color "$YELLOW" "⚠ Found $failed failed job(s)"
    ALERTS+=("$failed failed jobs")
  fi

  if ((streaming > 0)); then
    log_color "$GREEN" "✓ $streaming active stream(s)"
  fi

  return 0
}

# Check disk space of the storage, install and root directories and raise
# an alert for each one whose usage exceeds $DISK_THRESHOLD percent.
check_disk() {
  # The loop variable must not be named PATH: assigning to PATH replaces the
  # command search path and every external command afterwards is not found.
  local dir usage

  for dir in "/mnt/storage" "/opt/scheduler" "/"; do
    [[ -d "$dir" ]] || continue

    # -P forces the one-line-per-filesystem POSIX format, so the data is
    # always on line 2 and the capacity is always field 5.
    usage=$(df -P "$dir" 2>/dev/null |
      awk 'NR == 2 { sub(/%/, "", $5); print $5 }')
    [[ "$usage" =~ ^[0-9]+$ ]] || continue

    if ((usage > DISK_THRESHOLD)); then
      log_color "$RED" "✗ Disk usage for $dir: ${usage}%"
      ALERTS+=("Disk usage ${usage}% on $dir")
    else
      log "Disk usage for $dir: ${usage}%"
    fi
  done

  return 0
}

# Check for stuck streams
check_stuck_streams() {
  local stuck

  # Find streams that have been active for more than 4 hours.
  # A missing or unreadable database is reported by check_database, so a
  # failing query is skipped here rather than alerted a second time.
  stuck=$(db_count "
        SELECT COUNT(*) FROM jobs
        WHERE play_status='streaming'
        AND datetime(stream_start_time) < datetime('now', '-4 hours')
    ") || return 0

  if ((stuck > 0)); then
    log_color "$YELLOW" "⚠ Found $stuck stream(s) active for >4 hours"
    ALERTS+=("$stuck potentially stuck streams")
  fi

  return 0
}

# Check recent errors in logs: counts lines containing "ERROR" (any case)
# from the last 5 minutes, read from journald when the systemd unit is
# active and from the compose logs otherwise. Alerts above 10 matches.
check_logs() {
  local error_count

  # grep -c prints 0 but exits 1 when nothing matches; '|| true' keeps that
  # normal case from being treated as a failure.
  if systemctl is-active --quiet "$SERVICE_NAME" 2>/dev/null; then
    error_count=$(journalctl -u "$SERVICE_NAME" --since "5 minutes ago" 2>/dev/null |
      grep -ci "ERROR" || true)
  else
    error_count=$(docker compose -f "$COMPOSE_FILE" logs --since 5m 2>/dev/null |
      grep -ci "ERROR" || true)
  fi
  [[ "$error_count" =~ ^[0-9]+$ ]] || error_count=0

  if ((error_count > 10)); then
    log_color "$YELLOW" "⚠ Found $error_count errors in last 5 minutes"
    ALERTS+=("$error_count recent errors")
  fi

  return 0
}

# Send alerts: prints the collected alerts and, when $ALERT_EMAIL is set and
# a 'mail' command exists, mails the same list. Does nothing without alerts.
send_alerts() {
  local alert

  if ((${#ALERTS[@]} == 0)); then
    return 0
  fi

  log ""
  log_color "$RED" "=== ALERTS ==="
  for alert in "${ALERTS[@]}"; do
    log "- $alert"
  done

  # Send email if configured
  if [[ -n "$ALERT_EMAIL" ]] && command -v mail &>/dev/null; then
    {
      echo "Scheduler Monitoring Alert - $TIMESTAMP"
      echo ""
      echo "The following issues were detected:"
      for alert in "${ALERTS[@]}"; do
        echo "- $alert"
      done
      echo ""
      echo "Check $LOG_FILE for details"
    } | mail -s "Scheduler Alert" "$ALERT_EMAIL"
  fi

  # Could also send to Slack, PagerDuty, etc.
  # Example Slack webhook:
  # if [ -n "$SLACK_WEBHOOK" ]; then
  #     curl -X POST -H 'Content-type: application/json' \
  #         --data "{\"text\":\"Scheduler Alert: ${ALERTS[*]}\"}" \
  #         "$SLACK_WEBHOOK"
  # fi
}

# Run every check, deliver the alerts and set the exit status.
main() {
  log "=== Scheduler Monitor - $TIMESTAMP ==="

  # Run checks. Each one is best effort: with 'set -e' active, a check that
  # returns non-zero must not stop the remaining checks or the alert
  # delivery, which is exactly when the report matters most.
  check_service || true
  check_database || true
  check_disk || true
  check_stuck_streams || true
  check_logs || true
  send_alerts || true

  log ""

  # Exit with error if there are alerts
  if ((${#ALERTS[@]} > 0)); then
    exit 1
  fi
  exit 0
}

main "$@"