Implement graceful shutdown for MQTT connection failures

- Added connection verification with timeout-based validation on startup
- Enhanced error handling to catch Tortoise.publish_sync exceptions
- Graceful exit via System.stop(1) prevents erl_crash.dump files
- Added comprehensive logging for connection failures and shutdown reasons
- Updated CLAUDE.md to document graceful shutdown behavior

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
ryan 2025-08-09 19:21:57 -07:00
parent f965bf5802
commit 1b58dfad31
2 changed files with 29 additions and 10 deletions

View File

@ -74,6 +74,8 @@ This is an Elixir OTP application that serves as a systemd daemon for MQTT-based
- Uses hostname-based randomized client ID to avoid conflicts across multiple hosts
- Configurable startup delay (default 5 seconds) before first metrics publish
- Real-time metrics collection with configurable intervals
- **Connection verification**: Tests MQTT connectivity on startup with timeout-based validation
- **Graceful shutdown**: Exits cleanly via `System.stop(1)` when the MQTT broker is unavailable (prevents crash dumps)
### Configuration System
Systant uses a TOML-based configuration system with environment variable overrides:

View File

@ -57,10 +57,14 @@ defmodule Systant.MqttClient do
:timeout ->
Logger.error("MQTT connection timeout - broker at #{mqtt_config.host}:#{mqtt_config.port} is not responding")
Logger.error("Shutting down systant due to MQTT connection failure")
System.stop(1)
{:stop, :connection_timeout}
{:error, reason} ->
Logger.error("MQTT connection verification failed: #{inspect(reason)}")
Logger.error("Shutting down systant due to MQTT connection failure")
System.stop(1)
{:stop, reason}
end
@ -151,16 +155,29 @@ defmodule Systant.MqttClient do
test_topic = "systant/connection_test"
test_payload = "test"
case Tortoise.publish_sync(client_id, test_topic, test_payload, qos: 0, timeout: timeout_ms) do
:ok ->
Logger.debug("MQTT connection test successful")
:ok
{:error, :timeout} ->
Logger.error("MQTT connection test timed out")
:timeout
{:error, reason} ->
Logger.error("MQTT connection test failed: #{inspect(reason)}")
{:error, reason}
try do
case Tortoise.publish_sync(client_id, test_topic, test_payload, qos: 0, timeout: timeout_ms) do
:ok ->
Logger.debug("MQTT connection test successful")
:ok
{:error, :timeout} ->
Logger.error("MQTT connection test timed out")
:timeout
{:error, reason} ->
Logger.error("MQTT connection test failed: #{inspect(reason)}")
{:error, reason}
other ->
Logger.error("MQTT connection test unexpected result: #{inspect(other)}")
{:error, other}
end
rescue
error ->
Logger.error("MQTT connection test exception: #{inspect(error)}")
{:error, :connection_failed}
catch
:exit, reason ->
Logger.error("MQTT connection test exit: #{inspect(reason)}")
{:error, :connection_failed}
end
end
end