openclaw/scripts/health-check-gateway.sh
valtterimelkko eec556c71e Fix: Resolve gateway crash loop and inotify exhaustion
Problem: The gateway was stuck in a restart loop (1200+ restarts), which caused
the Telegram bot to stop responding. Root cause: the system's inotify watch
limit was exhausted while monitoring config/skill files.

Solutions implemented:

1. **Inotify limit increase** (/etc/sysctl.d/99-moltbot-inotify.conf)
   - Increased fs.inotify.max_user_watches from 65536 to 524288
   - Prevents "ENOSPC: System limit for number of file watchers reached"
   - Persistent across reboots

2. **Improved systemd service** (/etc/systemd/system/moltbot-gateway.service)
   - Changed Restart=always → Restart=on-failure
   - Increased RestartSec=5 → RestartSec=10 (reduce CPU churn)
   - Reduced StartLimitBurst=10 → StartLimitBurst=5
   - Added ExecStartPre to auto-clean stale locks on startup
   - Service remains isolated from other services (code-server, ssh, etc)

3. **Health check automation** (new files)
   - scripts/health-check-gateway.sh: detects hang/lock issues, auto-recovers
   - /etc/systemd/system/moltbot-health-check.service: runs health checks
   - /etc/systemd/system/moltbot-health-check.timer: runs every 5 minutes
   - Logs to /tmp/moltbot-health-check.log

4. **Documentation** (README_Tech.md)
   - Added section on crash loop root cause and preventative measures
   - Added Architecture section documenting service isolation
   - Updated troubleshooting with health check steps
   - Updated file locations with new monitoring files

Testing: Gateway now starts cleanly, health checks pass, other services
(code-server, ssh) remain unaffected. Timer runs every 5 minutes to prevent
future hangs.

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2026-01-29 18:55:41 +00:00

110 lines
2.8 KiB
Bash
Executable File

#!/bin/bash
# Moltbot gateway health check and recovery.
#
# Probes the gateway port, looks for stale lock files and a high systemd
# restart count, and restarts the gateway service when a problem is found.
# Meant to be run periodically (cron job or systemd timer) without touching
# any other service.

# Abort on the first unhandled command failure.
set -o errexit

# Gateway endpoint.
GATEWAY_HOST="127.0.0.1"
GATEWAY_PORT=18789
# NOTE(review): GATEWAY_WS and HEALTH_CHECK_TIMEOUT are not referenced by the
# functions visible in this file - confirm whether they are still needed.
GATEWAY_WS="ws://${GATEWAY_HOST}:${GATEWAY_PORT}"

# Timing settings, in seconds.
HEALTH_CHECK_TIMEOUT=10
MAX_LOCK_AGE=600 # 10 minutes

# Lock files inspected for staleness and removed during recovery.
LOCK_FILES=(
  ~/.clawdbot/gateway.lock
  ~/.clawdbot/moltbot.lock
  /tmp/moltbot-gateway.lock
)

# File that every log line is appended to.
LOG_FILE="/tmp/moltbot-health-check.log"
#######################################
# Print a timestamped message to stdout and append it to the log file.
# Globals:   LOG_FILE (read) - path of the file the line is appended to
# Arguments: $* - message text
# Outputs:   the formatted line on stdout; a warning on stderr when the
#            log file cannot be written
# Returns:   0 even when the log file is not writable
#######################################
log() {
  local entry
  entry="[$(date '+%Y-%m-%d %H:%M:%S')] $*"
  printf '%s\n' "$entry"
  # Writing the file is best-effort: under `set -e` a failing write here
  # would otherwise terminate the whole health check on its first message.
  # The brace group lets 2>/dev/null also hide the shell's redirection error.
  if ! { printf '%s\n' "$entry" >>"$LOG_FILE"; } 2>/dev/null; then
    printf 'WARN: cannot append to log file: %s\n' "$LOG_FILE" >&2
  fi
  return 0
}
#######################################
# Probe the gateway by opening a TCP connection to its port.
# Globals:   GATEWAY_HOST, GATEWAY_PORT (read)
# Returns:   0 when the connection opens within 3 seconds, 1 otherwise
#######################################
check_gateway_responsive() {
  # bash's /dev/tcp pseudo-device performs the connect; `timeout` bounds it.
  local probe="echo > /dev/tcp/${GATEWAY_HOST}/${GATEWAY_PORT}"
  timeout 3 bash -c "$probe" 2>/dev/null && return 0
  # Any failure (refused, timed out, ...) is reported as plain status 1.
  return 1
}
#######################################
# Look for lock files whose modification time is older than MAX_LOCK_AGE.
# Globals:   LOCK_FILES (read), MAX_LOCK_AGE (read, seconds)
# Outputs:   a WARN line via log() for a stale lock or an unreadable mtime
# Returns:   1 as soon as one stale lock file is found, 0 otherwise
#######################################
check_stale_locks() {
  local lock_file mtime now age
  now=$(date +%s)
  for lock_file in "${LOCK_FILES[@]}"; do
    [[ -f "$lock_file" ]] || continue
    # GNU stat syntax first, BSD syntax as the fallback.
    mtime=$(stat -c %Y -- "$lock_file" 2>/dev/null) \
      || mtime=$(stat -f %m -- "$lock_file" 2>/dev/null) \
      || mtime=""
    # The file may vanish between the -f test and stat, or stat may print
    # something unexpected; an empty/non-numeric value would otherwise turn
    # the subtraction below into an arithmetic syntax error.
    if [[ ! "$mtime" =~ ^[0-9]+$ ]]; then
      log "WARN: Cannot read modification time of lock file: $lock_file"
      continue
    fi
    age=$((now - mtime))
    if ((age > MAX_LOCK_AGE)); then
      log "WARN: Stale lock file found: $lock_file (age: ${age}s)"
      return 1 # Stale lock detected
    fi
  done
  return 0 # No stale locks
}
#######################################
# Decide whether the gateway unit looks like it is crash-looping, based on
# the NRestarts property reported by systemd.
# Outputs:   a WARN line via log() when the restart count exceeds 10
# Returns:   1 when the restart count is greater than 10, 0 otherwise
#            (including when the count cannot be determined)
#######################################
check_crash_loop() {
  local restart_count
  restart_count=$(systemctl show moltbot-gateway.service -p NRestarts --value 2>/dev/null) \
    || restart_count=0
  # Tolerate "NRestarts=<n>" style output in case --value was not honoured.
  restart_count=${restart_count#NRestarts=}
  # Empty or non-numeric output must not reach the numeric comparison.
  if [[ ! "$restart_count" =~ ^[0-9]+$ ]]; then
    restart_count=0
  fi
  # 10# forces base 10 so a value with leading zeros is not read as octal.
  if ((10#$restart_count > 10)); then
    log "WARN: Gateway in potential crash loop (restart count: $restart_count)"
    return 1
  fi
  return 0
}
#######################################
# Remove every lock file listed in LOCK_FILES that currently exists.
# Globals:   LOCK_FILES (read)
# Outputs:   one log() line per removed file, and a WARN line per file that
#            could not be removed
# Returns:   0 always; a failed removal is reported but does not stop the
#            caller from continuing with recovery
#######################################
cleanup_locks() {
  local lock_file
  log "Cleaning stale lock files..."
  for lock_file in "${LOCK_FILES[@]}"; do
    [[ -f "$lock_file" ]] || continue
    # Explicit if/else instead of `rm && log`: with the short-circuit form a
    # failed rm on the last file became the function's exit status, which
    # terminates the script under `set -e` before the restart happens.
    if rm -f -- "$lock_file" 2>/dev/null; then
      log "Removed: $lock_file"
    else
      log "WARN: Could not remove lock file: $lock_file"
    fi
  done
  return 0
}
#######################################
# Restart the gateway systemd unit and verify that it answers afterwards.
# Outputs:   progress and error lines via log()
# Returns:   0 when the gateway responds after the restart,
#            1 when systemctl fails or the gateway stays unresponsive
#######################################
restart_gateway() {
  log "Initiating graceful gateway restart..."
  # Check systemctl explicitly so a failed restart is logged instead of
  # silently ending the script under `set -e`.
  if ! systemctl restart moltbot-gateway.service; then
    log "ERROR: systemctl restart of moltbot-gateway.service failed"
    return 1
  fi
  # Pause before probing the port again.
  sleep 5
  if check_gateway_responsive; then
    log "Gateway restarted successfully"
    return 0
  fi
  log "ERROR: Gateway failed to respond after restart"
  return 1
}
#######################################
# Run one health-check pass.
# Globals:   GATEWAY_PORT (read, for the error message)
# Returns:   0 when the gateway responds; otherwise the status of
#            restart_gateway after a cleanup-and-restart.
#            Exits the script with status 1 when the gateway is unresponsive
#            but neither the lock check nor the crash-loop check reports a
#            problem.
#######################################
main() {
  log "Starting gateway health check..."

  # Healthy path: nothing else to do.
  if check_gateway_responsive; then
    log "Gateway is healthy and responsive"
    return 0
  fi

  log "ERROR: Gateway is not responding on port $GATEWAY_PORT"

  # Both checks passing means there is no known cause to act on.
  # The crash-loop check only runs when no stale lock was found.
  if check_stale_locks && check_crash_loop; then
    log "ERROR: Gateway unresponsive but no recovery needed. Manual intervention required."
    exit 1
  fi

  log "Detected lock/crash issue. Cleaning and restarting..."
  cleanup_locks
  restart_gateway
}
# Script entry point: perform a single health-check pass.
main "$@"