#!/bin/bash
#
# Scheduler Monitoring Script
# Checks service health, job status, and system resources
#
# Configuration (environment variables, all optional):
#   LOG_FILE        file every report line is appended to
#   DB_PATH         SQLite database queried for job counts
#   SERVICE_NAME    systemd unit whose state is checked
#   ALERT_EMAIL     recipient for alert mail; empty disables mail
#   DISK_THRESHOLD  disk usage percentage above which an alert is raised
#

set -e

# Configuration
LOG_FILE="${LOG_FILE:-/var/log/scheduler-monitor.log}"
DB_PATH="${DB_PATH:-/opt/scheduler/scheduler.db}"
SERVICE_NAME="${SERVICE_NAME:-scheduler}"
ALERT_EMAIL="${ALERT_EMAIL:-}"
DISK_THRESHOLD="${DISK_THRESHOLD:-90}"

# Every report line is piped through `tee -a "$LOG_FILE"`. Under `set -e` an
# unwritable log file would make the banner below fail and abort the monitor
# before a single check has run, so fall back to console-only output instead.
if ! { true >>"$LOG_FILE"; } 2>/dev/null; then
  echo "Warning: cannot write to $LOG_FILE; logging to console only" >&2
  LOG_FILE=/dev/null
fi

# Colors for output (expanded by `echo -e` at the point of use)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Single timestamp shared by the banner and the alert mail
TIMESTAMP=$(date '+%Y-%m-%d %H:%M:%S')

# Alert messages collected by the checks
ALERTS=()

echo "=== Scheduler Monitor - $TIMESTAMP ===" | tee -a "$LOG_FILE"

# Check if service is running
#
# Looks for the scheduler first as a systemd unit, then as a docker container
# whose name matches "movie_scheduler". When neither is up, an alert is
# recorded and one restart attempt is made (systemd first, docker compose as
# the fallback); the outcome of that attempt is not inspected.
#
# Globals:  SERVICE_NAME, LOG_FILE, RED, GREEN, NC (read); ALERTS (appended)
# Outputs:  status lines on stdout, also appended to LOG_FILE
# Returns:  0 when the service or the container is running, 1 otherwise
check_service() {
  if systemctl is-active --quiet "$SERVICE_NAME" 2>/dev/null; then
    printf '%b\n' "${GREEN}✓ Service is running${NC}" | tee -a "$LOG_FILE"
    return 0
  fi

  if docker ps --filter "name=movie_scheduler" --format "{{.Status}}" | grep -q "Up"; then
    printf '%b\n' "${GREEN}✓ Docker container is running${NC}" | tee -a "$LOG_FILE"
    return 0
  fi

  printf '%b\n' "${RED}✗ Service is DOWN${NC}" | tee -a "$LOG_FILE"
  ALERTS+=("Service is DOWN")

  # Try to restart; a failed restart is deliberately not treated as an error
  printf '%s\n' "Attempting to restart service..." | tee -a "$LOG_FILE"
  if ! systemctl start "$SERVICE_NAME" 2>/dev/null; then
    docker compose -f /opt/scheduler/docker-compose.prod.yml restart 2>/dev/null || true
  fi

  return 1
}

# Check database

# Print the number of rows in `jobs` matching the optional SQL condition $1
# (no argument counts every row).
# Returns non-zero, printing nothing, when sqlite3 fails or its output is not
# a non-negative integer.
_count_jobs() {
  local where="${1:+ WHERE $1}"
  local n

  n=$(sqlite3 "$DB_PATH" "SELECT COUNT(*) FROM jobs${where};" 2>/dev/null) || return 1
  case "$n" in
    '' | *[!0-9]*) return 1 ;;
  esac
  printf '%s\n' "$n"
}

# Verifies that the database file exists, logs its size and the per-status job
# counts, and records an alert when failed jobs are present or when the
# database cannot be queried.
#
# Globals:  DB_PATH, LOG_FILE, RED, GREEN, YELLOW, NC (read); ALERTS (appended)
# Outputs:  report lines on stdout, also appended to LOG_FILE
# Returns:  1 when the database file is missing or a count query fails,
#           0 otherwise
check_database() {
  local db_size total pending_prep pending_play streaming failed

  if [ ! -f "$DB_PATH" ]; then
    echo -e "${RED}✗ Database not found${NC}" | tee -a "$LOG_FILE"
    ALERTS+=("Database file not found")
    return 1
  fi

  # Database size
  db_size=$(du -h "$DB_PATH" 2>/dev/null | cut -f1)
  echo "Database size: ${db_size:-unknown}" | tee -a "$LOG_FILE"

  # Job counts. A failing query (sqlite3 not installed, locked database,
  # missing table, ...) must not leave an empty value behind: the numeric
  # tests below would break on it, and under `set -e` the failed command
  # substitution alone would end the whole script.
  if ! {
    total=$(_count_jobs) &&
      pending_prep=$(_count_jobs "prep_status='pending'") &&
      pending_play=$(_count_jobs "play_status='pending'") &&
      streaming=$(_count_jobs "play_status='streaming'") &&
      failed=$(_count_jobs "prep_status='failed' OR play_status='failed'")
  }; then
    echo -e "${RED}✗ Database query failed${NC}" | tee -a "$LOG_FILE"
    ALERTS+=("Database query failed")
    return 1
  fi

  echo "Jobs - Total: $total, Pending prep: $pending_prep, Pending play: $pending_play, Streaming: $streaming, Failed: $failed" | tee -a "$LOG_FILE"

  if [ "$failed" -gt 0 ]; then
    echo -e "${YELLOW}⚠ Found $failed failed job(s)${NC}" | tee -a "$LOG_FILE"
    ALERTS+=("$failed failed jobs")
  fi

  if [ "$streaming" -gt 0 ]; then
    echo -e "${GREEN}✓ $streaming active stream(s)${NC}" | tee -a "$LOG_FILE"
  fi

  return 0
}

# Check disk space
#
# Logs the usage percentage of the filesystem holding each directory and
# records an alert for every one that is above DISK_THRESHOLD. Directories
# that do not exist, and usage values that cannot be parsed, are skipped.
#
# Globals:    DISK_THRESHOLD, LOG_FILE, RED, NC (read); ALERTS (appended)
# Arguments:  optional list of directories to check; without arguments
#             /mnt/storage, /opt/scheduler and / are checked
# Outputs:    one line per checked directory on stdout, also appended to
#             LOG_FILE
check_disk() {
  local -a targets=("$@")
  local dir usage

  if [ ${#targets[@]} -eq 0 ]; then
    targets=("/mnt/storage" "/opt/scheduler" "/")
  fi

  # The loop variable must never be called PATH: assigning to PATH replaces
  # the shell's command search path, after which df, awk, tee and every later
  # external command in the script can no longer be found.
  for dir in "${targets[@]}"; do
    [ -d "$dir" ] || continue

    # -P selects the POSIX output format: one line per filesystem, so the
    # capacity is always field 5 of line 2 even for long device names.
    usage=$(df -P "$dir" 2>/dev/null | awk 'NR == 2 { sub(/%/, "", $5); print $5 }')

    # Skip anything that is not a plain number instead of feeding it to `[`.
    case "$usage" in
      '' | *[!0-9]*) continue ;;
    esac

    if [ "$usage" -gt "$DISK_THRESHOLD" ]; then
      echo -e "${RED}✗ Disk usage for $dir: ${usage}%${NC}" | tee -a "$LOG_FILE"
      ALERTS+=("Disk usage ${usage}% on $dir")
    else
      echo -e "Disk usage for $dir: ${usage}%" | tee -a "$LOG_FILE"
    fi
  done
}

# Check for stuck streams
#
# Counts jobs whose play_status is 'streaming' and whose stream_start_time is
# more than 4 hours in the past, and records an alert when there are any.
#
# Globals:  DB_PATH, LOG_FILE, YELLOW, NC (read); ALERTS (appended)
# Outputs:  a warning line on stdout (also appended to LOG_FILE) when stuck
#           streams are found or the count cannot be determined
# Returns:  0 always
check_stuck_streams() {
  local stuck

  # Nothing to query without a database file; check_database reports that
  # condition as "Database file not found".
  [ -f "$DB_PATH" ] || return 0

  # Find streams that have been active for more than 4 hours.
  # `|| stuck=""` keeps a failing query from ending the script under `set -e`.
  stuck=$(sqlite3 "$DB_PATH" "
    SELECT COUNT(*) FROM jobs
    WHERE play_status='streaming'
    AND datetime(stream_start_time) < datetime('now', '-4 hours')
  " 2>/dev/null) || stuck=""

  # An empty or non-numeric result would break the numeric test below
  # ("integer expression expected"), so report it and stop here.
  case "$stuck" in
    '' | *[!0-9]*)
      echo -e "${YELLOW}⚠ Could not determine stuck streams (database query failed)${NC}" | tee -a "$LOG_FILE"
      return 0
      ;;
  esac

  if [ "$stuck" -gt 0 ]; then
    echo -e "${YELLOW}⚠ Found $stuck stream(s) active for >4 hours${NC}" | tee -a "$LOG_FILE"
    ALERTS+=("$stuck potentially stuck streams")
  fi

  return 0
}

# Check recent errors in logs
#
# Counts the log lines from the last 5 minutes that contain "ERROR" (matched
# case-insensitively) and records an alert when there are more than 10.
# The journal is read when the systemd unit is active; otherwise the docker
# compose logs are used.
#
# Globals:  SERVICE_NAME, LOG_FILE, YELLOW, NC (read);
#           ERROR_COUNT (written); ALERTS (appended)
# Outputs:  a warning line on stdout, also appended to LOG_FILE, when the
#           limit is exceeded
check_logs() {
  local -a log_source

  # Choose where the recent log lines come from
  if systemctl is-active --quiet "$SERVICE_NAME" 2>/dev/null; then
    log_source=(journalctl -u "$SERVICE_NAME" --since "5 minutes ago")
  else
    log_source=(docker compose -f /opt/scheduler/docker-compose.prod.yml logs --since 5m)
  fi

  # A log command that fails or is missing simply yields a count of 0
  ERROR_COUNT=$("${log_source[@]}" 2>/dev/null | grep -i "ERROR" | wc -l)

  [ "$ERROR_COUNT" -gt 10 ] || return 0

  printf '%b\n' "${YELLOW}⚠ Found $ERROR_COUNT errors in last 5 minutes${NC}" | tee -a "$LOG_FILE"
  ALERTS+=("$ERROR_COUNT recent errors")
}

# Send alerts
#
# Prints the collected alerts as a bulleted list (also appended to LOG_FILE)
# and, when ALERT_EMAIL is set and a `mail` command is available, sends the
# same list by mail. Does nothing when no alerts were collected.
#
# Globals:  ALERTS, ALERT_EMAIL, LOG_FILE, TIMESTAMP, RED, NC (read)
# Outputs:  the alert summary on stdout and in LOG_FILE
send_alerts() {
  (( ${#ALERTS[@]} > 0 )) || return 0

  # Console/log summary: blank line, heading, one bullet per alert
  {
    printf '\n'
    printf '%b\n' "${RED}=== ALERTS ===${NC}"
    printf -- '- %s\n' "${ALERTS[@]}"
  } | tee -a "$LOG_FILE"

  # Send email if configured
  if [ -n "$ALERT_EMAIL" ] && command -v mail >/dev/null 2>&1; then
    {
      printf '%s\n\n' "Scheduler Monitoring Alert - $TIMESTAMP"
      printf '%s\n' "The following issues were detected:"
      printf -- '- %s\n' "${ALERTS[@]}"
      printf '\n%s\n' "Check $LOG_FILE for details"
    } | mail -s "Scheduler Alert" "$ALERT_EMAIL"
  fi

  # Could also send to Slack, PagerDuty, etc.
  # Example Slack webhook:
  # if [ -n "$SLACK_WEBHOOK" ]; then
  #   curl -X POST -H 'Content-type: application/json' \
  #     --data "{\"text\":\"Scheduler Alert: ${ALERTS[*]}\"}" \
  #     "$SLACK_WEBHOOK"
  # fi
}

# Run checks
#
# The script runs under `set -e`, and check_service / check_database return 1
# exactly when they have found a problem. Called bare, that non-zero status
# would terminate the script on the spot, so the remaining checks would be
# skipped and send_alerts would never deliver the alert that was just
# recorded. `|| true` lets every check run; the outcome is carried by ALERTS.
check_service || true
check_database || true
check_disk || true
check_stuck_streams || true
check_logs || true

# A failed mail delivery must not change the exit status decided below.
send_alerts || true

echo "" | tee -a "$LOG_FILE"

# Exit with error if there are alerts
if [ ${#ALERTS[@]} -gt 0 ]; then
  exit 1
fi

exit 0