autobroadcaster/scripts/monitor.sh

174 lines
5.6 KiB
Bash
Executable File

#!/bin/bash
#
# Scheduler Monitoring Script
# Checks service health, job status, and system resources
#
set -e

# Configuration (every value can be overridden from the environment)
LOG_FILE="${LOG_FILE:-/var/log/scheduler-monitor.log}"
DB_PATH="${DB_PATH:-/opt/scheduler/scheduler.db}"
SERVICE_NAME="${SERVICE_NAME:-scheduler}"
ALERT_EMAIL="${ALERT_EMAIL:-}"           # empty: alerts are not emailed
DISK_THRESHOLD="${DISK_THRESHOLD:-90}"   # percent used; above this a disk alert is raised

# Every message in this script is piped through `tee -a "$LOG_FILE"`. With
# `set -e` in effect, a log file that cannot be opened makes the very first
# `tee` fail and the monitor exits before running a single check. Probe the
# file once (in a subshell, so a failed redirection cannot affect this shell)
# and fall back to console-only output when it is not writable.
if ! ( : >> "$LOG_FILE" ) 2>/dev/null; then
  printf 'warning: cannot write to %s; logging to console only\n' "$LOG_FILE" >&2
  LOG_FILE=/dev/null
fi

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# One timestamp for the whole run, shared by the banner and the alert email.
TIMESTAMP=$(date '+%Y-%m-%d %H:%M:%S')

# Human-readable problem descriptions appended by the check_* functions.
ALERTS=()

echo "=== Scheduler Monitor - $TIMESTAMP ===" | tee -a "$LOG_FILE"
# Check if service is running
#######################################
# Report whether the scheduler is up, as a systemd unit or as a Docker
# container. When neither is found, record an alert and make one best-effort
# restart attempt (systemd first, then docker compose).
# Globals:  SERVICE_NAME, LOG_FILE, GREEN, RED, NC (read); ALERTS (appended)
# Outputs:  status lines on stdout, duplicated into LOG_FILE
# Returns:  0 when the unit is active or a matching container reports "Up",
#           1 otherwise (even if the restart attempt was issued)
#######################################
check_service() {
  local compose_file="/opt/scheduler/docker-compose.prod.yml"

  # First choice: the systemd unit.
  if systemctl is-active --quiet "$SERVICE_NAME" 2>/dev/null; then
    printf '%b\n' "${GREEN}✓ Service is running${NC}" | tee -a "$LOG_FILE"
    return 0
  fi

  # Second choice: a container whose name matches "movie_scheduler".
  if docker ps --filter "name=movie_scheduler" --format "{{.Status}}" | grep -q "Up"; then
    printf '%b\n' "${GREEN}✓ Docker container is running${NC}" | tee -a "$LOG_FILE"
    return 0
  fi

  printf '%b\n' "${RED}✗ Service is DOWN${NC}" | tee -a "$LOG_FILE"
  ALERTS+=("Service is DOWN")

  # Try to restart; the outcome is deliberately ignored, this run still
  # reports the service as down.
  printf '%s\n' "Attempting to restart service..." | tee -a "$LOG_FILE"
  if ! systemctl start "$SERVICE_NAME" 2>/dev/null; then
    docker compose -f "$compose_file" restart 2>/dev/null || true
  fi
  return 1
}
# Check database
#######################################
# Print the number of rows in the jobs table, optionally restricted by a
# WHERE clause.
# Globals:   DB_PATH (read)
# Arguments: $1 - SQL condition without the WHERE keyword (optional)
# Outputs:   the count on stdout
# Returns:   0 on success; 1 (printing nothing) when sqlite3 exits non-zero
#            or its output is not a non-negative integer
#######################################
_db_count_jobs() {
  local where=${1:-}
  local n
  n=$(sqlite3 "$DB_PATH" "SELECT COUNT(*) FROM jobs${where:+ WHERE $where};" 2>/dev/null) || return 1
  [[ "$n" =~ ^[0-9]+$ ]] || return 1
  printf '%s\n' "$n"
}

#######################################
# Report the database size and job counts, and raise alerts for a missing
# database, an unreadable jobs table, or failed jobs.
# Globals:  DB_PATH, LOG_FILE, RED, GREEN, YELLOW, NC (read); ALERTS (appended)
# Outputs:  status lines on stdout, duplicated into LOG_FILE
# Returns:  0 when the counts were read; 1 when the database file is missing
#           or a count query failed
#######################################
check_database() {
  local db_size total pending_prep pending_play streaming failed

  if [ ! -f "$DB_PATH" ]; then
    echo -e "${RED}✗ Database not found${NC}" | tee -a "$LOG_FILE"
    ALERTS+=("Database file not found")
    return 1
  fi

  # Database size
  db_size=$(du -h "$DB_PATH" 2>/dev/null | cut -f1)
  echo "Database size: ${db_size:-unknown}" | tee -a "$LOG_FILE"

  # Job counts. A failed query (sqlite3 missing, database locked, no jobs
  # table, ...) used to leave these empty, which broke the integer tests
  # below; treat it as a reportable problem instead.
  if ! total=$(_db_count_jobs) ||
     ! pending_prep=$(_db_count_jobs "prep_status='pending'") ||
     ! pending_play=$(_db_count_jobs "play_status='pending'") ||
     ! streaming=$(_db_count_jobs "play_status='streaming'") ||
     ! failed=$(_db_count_jobs "prep_status='failed' OR play_status='failed'")
  then
    echo -e "${RED}✗ Unable to query job counts${NC}" | tee -a "$LOG_FILE"
    ALERTS+=("Database query failed")
    return 1
  fi

  echo "Jobs - Total: $total, Pending prep: $pending_prep, Pending play: $pending_play, Streaming: $streaming, Failed: $failed" | tee -a "$LOG_FILE"

  if [ "$failed" -gt 0 ]; then
    echo -e "${YELLOW}⚠ Found $failed failed job(s)${NC}" | tee -a "$LOG_FILE"
    ALERTS+=("$failed failed jobs")
  fi

  if [ "$streaming" -gt 0 ]; then
    echo -e "${GREEN}$streaming active stream(s)${NC}" | tee -a "$LOG_FILE"
  fi
  return 0
}
# Check disk space
#######################################
# Report how full the filesystems holding the given directories are and raise
# an alert for each one above DISK_THRESHOLD percent.
# Globals:   DISK_THRESHOLD (read, defaults to 90), LOG_FILE, RED, NC (read);
#            ALERTS (appended)
# Arguments: directories to check; with none, /mnt/storage, /opt/scheduler
#            and / are used. Entries that are not directories are skipped.
# Outputs:   one usage line per checked directory on stdout and in LOG_FILE
# Returns:   0
#######################################
check_disk() {
  local -a mounts=("$@")
  local mount usage
  local threshold=${DISK_THRESHOLD:-90}

  if [ "${#mounts[@]}" -eq 0 ]; then
    mounts=("/mnt/storage" "/opt/scheduler" "/")
  fi

  # The loop variable must not be called PATH: assigning to PATH replaces the
  # shell's command search path, after which df, awk, tee and every other
  # external command in the script can no longer be found.
  for mount in "${mounts[@]}"; do
    [ -d "$mount" ] || continue

    # -P selects the POSIX output format: one line per filesystem with the
    # percentage in column 5, regardless of how long the device name is.
    usage=$(df -P -- "$mount" 2>/dev/null |
      awk 'NR == 2 { sub(/%$/, "", $5); print $5 }') || usage=""

    # Skip anything that is not a plain number (df failed, or printed "-").
    [[ "$usage" =~ ^[0-9]+$ ]] || continue

    if [ "$usage" -gt "$threshold" ]; then
      echo -e "${RED}✗ Disk usage for $mount: ${usage}%${NC}" | tee -a "$LOG_FILE"
      ALERTS+=("Disk usage ${usage}% on $mount")
    else
      echo -e "Disk usage for $mount: ${usage}%" | tee -a "$LOG_FILE"
    fi
  done
  return 0
}
# Check for stuck streams
#######################################
# Raise an alert when jobs have been in play_status='streaming' for longer
# than a given number of hours, judged by stream_start_time.
# Globals:   DB_PATH, LOG_FILE, YELLOW, NC (read); ALERTS (appended)
# Arguments: $1 - age limit in whole hours (optional, default 4; anything
#                 that is not a non-negative integer falls back to 4)
# Outputs:   a warning line on stdout and in LOG_FILE when something is found
#            or the query cannot be run
# Returns:   0
#######################################
check_stuck_streams() {
  local max_hours=${1:-4}
  local stuck

  # Only digits may reach the SQL text below.
  [[ "$max_hours" =~ ^[0-9]+$ ]] || max_hours=4

  # Without a database file there is nothing to query.
  [ -f "$DB_PATH" ] || return 0

  stuck=$(sqlite3 "$DB_PATH" "
    SELECT COUNT(*) FROM jobs
    WHERE play_status='streaming'
    AND datetime(stream_start_time) < datetime('now', '-${max_hours} hours')
  " 2>/dev/null) || stuck=""

  # A failed query leaves no number to compare; report that rather than
  # feeding an empty string to an integer test.
  if [[ ! "$stuck" =~ ^[0-9]+$ ]]; then
    echo -e "${YELLOW}⚠ Unable to query stuck streams${NC}" | tee -a "$LOG_FILE"
    ALERTS+=("Stuck stream check failed")
    return 0
  fi

  if [ "$stuck" -gt 0 ]; then
    echo -e "${YELLOW}⚠ Found $stuck stream(s) active for >${max_hours} hours${NC}" | tee -a "$LOG_FILE"
    ALERTS+=("$stuck potentially stuck streams")
  fi
  return 0
}
# Check recent errors in logs
#######################################
# Count log lines from the last five minutes that contain "ERROR" (any case)
# and raise an alert when there are more than ten. Lines come from the
# systemd journal when the unit is active, otherwise from docker compose.
# Globals:  SERVICE_NAME, LOG_FILE, YELLOW, NC (read);
#           ERROR_COUNT (written); ALERTS (appended)
# Outputs:  a warning line on stdout and in LOG_FILE when over the limit
# Returns:  0
#######################################
check_logs() {
  local -r compose_file="/opt/scheduler/docker-compose.prod.yml"
  local -r max_errors=10

  if ! systemctl is-active --quiet "$SERVICE_NAME" 2>/dev/null; then
    # Unit not active: read the compose project's logs instead.
    ERROR_COUNT=$(docker compose -f "$compose_file" logs --since 5m 2>/dev/null | grep -i "ERROR" | wc -l || echo "0")
  else
    ERROR_COUNT=$(journalctl -u "$SERVICE_NAME" --since "5 minutes ago" 2>/dev/null | grep -i "ERROR" | wc -l)
  fi

  # At or below the limit there is nothing to report.
  [ "$ERROR_COUNT" -gt "$max_errors" ] || return 0

  printf '%b\n' "${YELLOW}⚠ Found $ERROR_COUNT errors in last 5 minutes${NC}" | tee -a "$LOG_FILE"
  ALERTS+=("$ERROR_COUNT recent errors")
}
# Send alerts
#######################################
# Print the collected alerts and, when an address is configured and a `mail`
# command is available, send them by email as well.
# Globals:  ALERTS, ALERT_EMAIL, LOG_FILE, TIMESTAMP, RED, NC (read)
# Outputs:  an "=== ALERTS ===" section on stdout and in LOG_FILE; nothing
#           when ALERTS is empty
# Returns:  0 when nothing was mailed, otherwise the exit status of `mail`
#######################################
send_alerts() {
  local item

  # Nothing collected, nothing to announce.
  (( ${#ALERTS[@]} > 0 )) || return 0

  printf '\n' | tee -a "$LOG_FILE"
  printf '%b\n' "${RED}=== ALERTS ===${NC}" | tee -a "$LOG_FILE"
  for item in "${ALERTS[@]}"; do
    printf '%s\n' "- $item" | tee -a "$LOG_FILE"
  done

  # Could also send to Slack, PagerDuty, etc.
  # Example Slack webhook:
  # if [ -n "$SLACK_WEBHOOK" ]; then
  #   curl -X POST -H 'Content-type: application/json' \
  #     --data "{\"text\":\"Scheduler Alert: ${ALERTS[*]}\"}" \
  #     "$SLACK_WEBHOOK"
  # fi

  # Send email if configured
  [ -n "$ALERT_EMAIL" ] || return 0
  command -v mail &> /dev/null || return 0

  {
    printf '%s\n\n' "Scheduler Monitoring Alert - $TIMESTAMP"
    printf '%s\n' "The following issues were detected:"
    # printf repeats its format for every array element: one bullet per alert.
    printf -- '- %s\n' "${ALERTS[@]}"
    printf '\n%s\n' "Check $LOG_FILE for details"
  } | mail -s "Scheduler Alert" "$ALERT_EMAIL"
}
# Run checks
# The script runs under `set -e`, and check_service / check_database return 1
# when they find a problem. Called as bare statements, that status ended the
# script on the spot: the remaining checks were skipped and send_alerts never
# ran. Each check is therefore run as an `if` condition, where errexit does
# not apply, so one failing check cannot hide the others.
for _check in check_service check_database check_disk check_stuck_streams check_logs; do
  _alerts_before=${#ALERTS[@]}
  if ! "$_check" && [ "${#ALERTS[@]}" -eq "$_alerts_before" ]; then
    # The check failed without recording why; make sure it is still reported.
    ALERTS+=("$_check failed unexpectedly")
  fi
done

# A delivery problem must not change how the run is reported below.
send_alerts || echo "warning: could not deliver alerts" >&2

echo "" | tee -a "$LOG_FILE" || true

# Exit with error if there are alerts
if [ ${#ALERTS[@]} -gt 0 ]; then
  exit 1
fi
exit 0