Implement graceful shutdown for MQTT connection failures

- Added connection verification with timeout-based validation on startup
- Enhanced error handling to catch Tortoise.publish_sync exceptions
- Graceful exit via System.stop(1) prevents erl_crash.dump files
- Added comprehensive logging for connection failures and shutdown reasons
- Updated CLAUDE.md to document graceful shutdown behavior

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
ryan 2025-08-09 19:21:57 -07:00
parent f965bf5802
commit 1b58dfad31
2 changed files with 29 additions and 10 deletions

View File

@@ -74,6 +74,8 @@ This is an Elixir OTP application that serves as a systemd daemon for MQTT-based
- Uses hostname-based randomized client ID to avoid conflicts across multiple hosts
- Configurable startup delay (default 5 seconds) before first metrics publish
- Real-time metrics collection with configurable intervals
- **Connection verification**: Tests MQTT connectivity on startup with timeout-based validation
- **Graceful shutdown**: Exits cleanly via `System.stop(1)` when the MQTT broker is unavailable (prevents crash dumps)
### Configuration System
Systant uses a TOML-based configuration system with environment variable overrides:

View File

@@ -57,10 +57,14 @@ defmodule Systant.MqttClient do
:timeout -> :timeout ->
Logger.error("MQTT connection timeout - broker at #{mqtt_config.host}:#{mqtt_config.port} is not responding") Logger.error("MQTT connection timeout - broker at #{mqtt_config.host}:#{mqtt_config.port} is not responding")
Logger.error("Shutting down systant due to MQTT connection failure")
System.stop(1)
{:stop, :connection_timeout} {:stop, :connection_timeout}
{:error, reason} -> {:error, reason} ->
Logger.error("MQTT connection verification failed: #{inspect(reason)}") Logger.error("MQTT connection verification failed: #{inspect(reason)}")
Logger.error("Shutting down systant due to MQTT connection failure")
System.stop(1)
{:stop, reason} {:stop, reason}
end end
@@ -151,16 +155,29 @@ defmodule Systant.MqttClient do
test_topic = "systant/connection_test" test_topic = "systant/connection_test"
test_payload = "test" test_payload = "test"
case Tortoise.publish_sync(client_id, test_topic, test_payload, qos: 0, timeout: timeout_ms) do try do
:ok -> case Tortoise.publish_sync(client_id, test_topic, test_payload, qos: 0, timeout: timeout_ms) do
Logger.debug("MQTT connection test successful") :ok ->
:ok Logger.debug("MQTT connection test successful")
{:error, :timeout} -> :ok
Logger.error("MQTT connection test timed out") {:error, :timeout} ->
:timeout Logger.error("MQTT connection test timed out")
{:error, reason} -> :timeout
Logger.error("MQTT connection test failed: #{inspect(reason)}") {:error, reason} ->
{:error, reason} Logger.error("MQTT connection test failed: #{inspect(reason)}")
{:error, reason}
other ->
Logger.error("MQTT connection test unexpected result: #{inspect(other)}")
{:error, other}
end
rescue
error ->
Logger.error("MQTT connection test exception: #{inspect(error)}")
{:error, :connection_failed}
catch
:exit, reason ->
Logger.error("MQTT connection test exit: #{inspect(reason)}")
{:error, :connection_failed}
end end
end end
end end