Implement comprehensive TOML-based configuration system
- Add Systant.Config module for TOML configuration with environment overrides
- Create systant.toml template with all metric module controls
- Update SystemMetrics to use configuration-driven collection
- Add filtering for disks, network interfaces, and processes
- Implement per-module enable/disable controls
- Update MqttClient to use new configuration system
- Add Hivemind/Just development workflow integration
- Update dashboard with graphical metrics display and raw data toggle
- Comprehensive documentation updates in CLAUDE.md

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
parent
eff32b3233
commit
b1cd085f6b
230
server/lib/systant/config.ex
Normal file
230
server/lib/systant/config.ex
Normal file
@ -0,0 +1,230 @@
|
|||||||
|
defmodule Systant.Config do
  @moduledoc """
  Configuration loader and manager for Systant.

  The configuration is a plain map with string keys, assembled in three
  layers (later layers win):

    1. the built-in defaults,
    2. the first TOML file found among the default config paths
       (`systant.toml`, `~/.config/systant/systant.toml`,
       `/etc/systant/systant.toml`),
    3. environment variable overrides (`SYSTANT_INTERVAL`,
       `SYSTANT_LOG_LEVEL`, `MQTT_HOST`, `MQTT_PORT`).

  Provides a small API for reading values out of that map.
  """

  require Logger

  # Searched in order; the first path that exists is used.
  @default_config_paths [
    # Current directory
    "systant.toml",
    # User config
    "~/.config/systant/systant.toml",
    # System config
    "/etc/systant/systant.toml"
  ]

  @default_collection_interval 30_000

  @default_config %{
    "general" => %{
      "enabled_modules" => [
        "cpu",
        "memory",
        "disk",
        "gpu",
        "network",
        "temperature",
        "processes",
        "system"
      ],
      "collection_interval" => @default_collection_interval,
      "startup_delay" => 5000
    },
    "cpu" => %{"enabled" => true},
    "memory" => %{"enabled" => true, "show_detailed" => true},
    "disk" => %{
      "enabled" => true,
      "include_mounts" => [],
      "exclude_mounts" => ["/snap", "/boot", "/dev", "/sys", "/proc", "/run", "/tmp"],
      "exclude_types" => ["tmpfs", "devtmpfs", "squashfs", "overlay"],
      "min_usage_percent" => 1
    },
    "gpu" => %{
      "enabled" => true,
      "nvidia_enabled" => true,
      "amd_enabled" => true,
      "max_gpus" => 8
    },
    "network" => %{
      "enabled" => true,
      "include_interfaces" => [],
      "exclude_interfaces" => ["lo", "docker0", "br-", "veth", "virbr"],
      "min_bytes_threshold" => 1024
    },
    "temperature" => %{
      "enabled" => true,
      "cpu_temp_enabled" => true,
      "sensors_enabled" => true,
      "temp_unit" => "celsius"
    },
    "processes" => %{
      "enabled" => true,
      "max_processes" => 10,
      "sort_by" => "cpu",
      "min_cpu_percent" => 0.1,
      "min_memory_percent" => 0.1,
      "max_command_length" => 50
    },
    "system" => %{
      "enabled" => true,
      "include_uptime" => true,
      "include_load_average" => true,
      "include_kernel_version" => true,
      "include_os_info" => true
    },
    "mqtt" => %{
      "host" => "mqtt.home",
      "port" => 1883,
      "client_id_prefix" => "systant",
      "username" => "",
      "password" => "",
      "qos" => 0
    },
    "logging" => %{
      "level" => "info",
      "log_config_changes" => true,
      "log_metric_collection" => false
    }
  }

  @doc """
  Load configuration from TOML file with environment variable overrides.

  Returns the merged configuration map. Missing, unreadable or unparsable
  config files are logged and the defaults are used instead.
  """
  @spec load_config() :: map()
  def load_config do
    config =
      @default_config
      |> load_toml_config()
      |> apply_env_overrides()
      |> validate_config()

    if get_in(config, ["logging", "log_config_changes"]) do
      Logger.info("Systant configuration loaded successfully")
    end

    config
  end

  @doc """
  Get a configuration value by path.

  `path` is either a list of keys (`["disk", "enabled"]`) or a dot-separated
  string (`"general.collection_interval"`). Returns `nil` when a key along
  the path is missing.
  """
  @spec get(map(), [String.t()] | String.t()) :: term()
  def get(config, path) when is_list(path), do: get_in(config, path)
  def get(config, path) when is_binary(path), do: get_in(config, String.split(path, "."))

  @doc """
  Check if a module is enabled in the configuration.

  A module is enabled when it is listed in `general.enabled_modules` and its
  own `enabled` flag is not explicitly `false`.
  """
  @spec module_enabled?(map(), String.t()) :: boolean()
  def module_enabled?(config, module_name) when is_binary(module_name) do
    enabled_modules = get(config, ["general", "enabled_modules"]) || []

    module_name in enabled_modules and get(config, [module_name, "enabled"]) != false
  end

  @doc """
  Get MQTT configuration with environment variable overrides.

  Returns an atom-keyed map. `MQTT_HOST`, `MQTT_PORT`, `MQTT_USERNAME` and
  `MQTT_PASSWORD` take precedence over the `mqtt` section of `config`.
  Blank values (`""`) are treated as unset, so blank credentials become `nil`
  rather than being passed on as empty strings.

  Note that a new random client id is generated on every call.
  """
  @spec mqtt_config(map()) :: map()
  def mqtt_config(config) do
    mqtt_base = get(config, ["mqtt"]) || %{}
    hostname = get_hostname()

    %{
      host: presence(System.get_env("MQTT_HOST")) || presence(mqtt_base["host"]) || "mqtt.home",
      # The port may arrive as a string from the environment or the TOML file.
      port: parse_int(System.get_env("MQTT_PORT")) || parse_int(mqtt_base["port"]) || 1883,
      client_id: generate_client_id(mqtt_base["client_id_prefix"] || "systant"),
      username: presence(System.get_env("MQTT_USERNAME")) || presence(mqtt_base["username"]),
      password: presence(System.get_env("MQTT_PASSWORD")) || presence(mqtt_base["password"]),
      stats_topic: "systant/#{hostname}/stats",
      command_topic: "systant/#{hostname}/commands",
      publish_interval:
        get(config, ["general", "collection_interval"]) || @default_collection_interval,
      qos: mqtt_base["qos"] || 0
    }
  end

  # Private functions

  # Merges the first config file found over the defaults; falls back to the
  # defaults when there is no file.
  defp load_toml_config(default_config) do
    case find_config_file() do
      nil ->
        Logger.info("No configuration file found, using defaults")
        default_config

      path ->
        read_toml_file(path, default_config)
    end
  end

  # Reads `path`; an unreadable file is logged and the defaults are kept.
  defp read_toml_file(path, default_config) do
    case File.read(path) do
      {:ok, content} ->
        merge_toml(content, path, default_config)

      {:error, reason} ->
        Logger.error("Failed to read config file #{path}: #{inspect(reason)}")
        default_config
    end
  end

  # Decodes TOML `content`; a parse failure is logged and the defaults are kept.
  defp merge_toml(content, path, default_config) do
    case Toml.decode(content) do
      {:ok, toml_config} ->
        Logger.info("Loaded configuration from #{path}")
        deep_merge(default_config, toml_config)

      {:error, reason} ->
        Logger.error("Failed to parse TOML config at #{path}: #{inspect(reason)}")
        default_config
    end
  end

  # Returns the first existing config path (with `~` expanded) or nil.
  defp find_config_file do
    @default_config_paths
    |> Enum.map(&Path.expand/1)
    |> Enum.find(&File.exists?/1)
  end

  # Apply environment variable overrides for common settings.
  defp apply_env_overrides(config) do
    config
    |> put_env_override(["general", "collection_interval"], "SYSTANT_INTERVAL", &parse_int/1)
    |> put_env_override(["logging", "level"], "SYSTANT_LOG_LEVEL", &String.downcase/1)
    |> put_env_override(["mqtt", "host"], "MQTT_HOST")
    |> put_env_override(["mqtt", "port"], "MQTT_PORT", &parse_int/1)
  end

  # Writes the (transformed) value of `env_var` at `path` when the variable is
  # set. A value the transform rejects (returns nil for) is ignored with a
  # warning instead of overwriting a valid setting with nil.
  defp put_env_override(config, path, env_var, transform \\ & &1) do
    case System.get_env(env_var) do
      nil ->
        config

      value ->
        case transform.(value) do
          nil ->
            Logger.warning("Ignoring invalid value #{inspect(value)} for #{env_var}")
            config

          transformed_value ->
            put_in(config, path, transformed_value)
        end
    end
  end

  # Basic validation - could be expanded. Guarantees that the collection
  # interval handed to timers is a positive integer.
  defp validate_config(config) do
    case get(config, ["general", "collection_interval"]) do
      interval when is_integer(interval) and interval >= 1000 ->
        config

      interval when is_integer(interval) and interval > 0 ->
        Logger.warning("Collection interval #{interval}ms is very low, consider >= 1000ms")
        config

      invalid ->
        Logger.warning(
          "Invalid collection interval #{inspect(invalid)}, " <>
            "using #{@default_collection_interval}ms"
        )

        put_in(config, ["general", "collection_interval"], @default_collection_interval)
    end
  end

  # Recursively merges nested maps; for any other pair the right value wins.
  defp deep_merge(left, right) when is_map(left) and is_map(right) do
    Map.merge(left, right, fn _key, left_val, right_val ->
      deep_merge(left_val, right_val)
    end)
  end

  defp deep_merge(_left, right), do: right

  # Lenient integer parsing: accepts integers and strings with a leading
  # integer (surrounding whitespace ignored); anything else yields nil.
  defp parse_int(str) when is_binary(str) do
    case Integer.parse(String.trim(str)) do
      {int, _rest} -> int
      :error -> nil
    end
  end

  defp parse_int(int) when is_integer(int), do: int
  defp parse_int(_), do: nil

  # Treats blank strings as unset so `||` fallbacks work as intended.
  defp presence(value) when is_binary(value) do
    if String.trim(value) == "", do: nil, else: value
  end

  defp presence(value), do: value

  # The random suffix avoids client id clashes between instances on one host.
  defp generate_client_id(prefix) do
    "#{prefix}_#{get_hostname()}_#{:rand.uniform(1000)}"
  end

  defp get_hostname do
    case :inet.gethostname() do
      {:ok, hostname} -> List.to_string(hostname)
      _ -> "unknown"
    end
  end
end
|
||||||
@ -11,31 +11,38 @@ defmodule Systant.MqttClient do
|
|||||||
end
|
end
|
||||||
|
|
||||||
def init(_opts) do
|
def init(_opts) do
|
||||||
config = Application.get_env(:systant, __MODULE__)
|
# Load the TOML-based configuration
|
||||||
Logger.info("Starting MQTT client with config: #{inspect(config)}")
|
app_config = Systant.Config.load_config()
|
||||||
|
mqtt_config = Systant.Config.mqtt_config(app_config)
|
||||||
|
|
||||||
# Use a unique client ID to avoid conflicts
|
Logger.info("Starting MQTT client with config: #{inspect(mqtt_config)}")
|
||||||
client_id = "#{config[:client_id]}_#{:rand.uniform(1000)}"
|
|
||||||
|
# Store both configs for later use
|
||||||
|
state_config = %{
|
||||||
|
app_config: app_config,
|
||||||
|
mqtt_config: mqtt_config
|
||||||
|
}
|
||||||
|
|
||||||
connection_opts = [
|
connection_opts = [
|
||||||
client_id: client_id,
|
client_id: mqtt_config.client_id,
|
||||||
server: {Tortoise.Transport.Tcp, host: to_charlist(config[:host]), port: config[:port]},
|
server: {Tortoise.Transport.Tcp, host: to_charlist(mqtt_config.host), port: mqtt_config.port},
|
||||||
handler: {Tortoise.Handler.Logger, []},
|
handler: {Tortoise.Handler.Logger, []},
|
||||||
user_name: config[:username],
|
user_name: mqtt_config.username,
|
||||||
password: config[:password],
|
password: mqtt_config.password,
|
||||||
subscriptions: [{config[:command_topic], 0}]
|
subscriptions: [{mqtt_config.command_topic, mqtt_config.qos}]
|
||||||
]
|
]
|
||||||
|
|
||||||
case Tortoise.Connection.start_link(connection_opts) do
|
case Tortoise.Connection.start_link(connection_opts) do
|
||||||
{:ok, _pid} ->
|
{:ok, _pid} ->
|
||||||
Logger.info("MQTT client connected successfully")
|
Logger.info("MQTT client connected successfully")
|
||||||
|
|
||||||
# Send immediate system metrics on startup
|
# Send system metrics after a short delay to ensure dashboard is ready
|
||||||
startup_stats = Systant.SystemMetrics.collect_metrics()
|
startup_delay = Systant.Config.get(app_config, ["general", "startup_delay"]) || 5000
|
||||||
Tortoise.publish(client_id, config[:stats_topic], Jason.encode!(startup_stats), qos: 0)
|
Process.send_after(self(), :publish_startup_stats, startup_delay)
|
||||||
|
Logger.info("Will publish initial stats in #{startup_delay}ms")
|
||||||
|
|
||||||
schedule_stats_publish(config[:publish_interval])
|
schedule_stats_publish(mqtt_config.publish_interval)
|
||||||
{:ok, %{config: config, client_id: client_id}}
|
{:ok, state_config}
|
||||||
|
|
||||||
{:error, reason} ->
|
{:error, reason} ->
|
||||||
Logger.error("Failed to connect to MQTT broker: #{inspect(reason)}")
|
Logger.error("Failed to connect to MQTT broker: #{inspect(reason)}")
|
||||||
@ -43,9 +50,15 @@ defmodule Systant.MqttClient do
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def handle_info(:publish_startup_stats, state) do
|
||||||
|
Logger.info("Publishing initial system metrics")
|
||||||
|
publish_stats(state.app_config, state.mqtt_config)
|
||||||
|
{:noreply, state}
|
||||||
|
end
|
||||||
|
|
||||||
def handle_info(:publish_stats, state) do
|
def handle_info(:publish_stats, state) do
|
||||||
publish_stats(state.config, state.client_id)
|
publish_stats(state.app_config, state.mqtt_config)
|
||||||
schedule_stats_publish(state.config[:publish_interval])
|
schedule_stats_publish(state.mqtt_config.publish_interval)
|
||||||
{:noreply, state}
|
{:noreply, state}
|
||||||
end
|
end
|
||||||
|
|
||||||
@ -58,12 +71,12 @@ defmodule Systant.MqttClient do
|
|||||||
:ok
|
:ok
|
||||||
end
|
end
|
||||||
|
|
||||||
defp publish_stats(config, client_id) do
|
defp publish_stats(app_config, mqtt_config) do
|
||||||
stats = Systant.SystemMetrics.collect_metrics()
|
stats = Systant.SystemMetrics.collect_metrics(app_config)
|
||||||
|
|
||||||
payload = Jason.encode!(stats)
|
payload = Jason.encode!(stats)
|
||||||
|
|
||||||
case Tortoise.publish(client_id, config[:stats_topic], payload, qos: 0) do
|
case Tortoise.publish(mqtt_config.client_id, mqtt_config.stats_topic, payload, qos: mqtt_config.qos) do
|
||||||
:ok ->
|
:ok ->
|
||||||
Logger.info("Published system metrics for #{stats.hostname}")
|
Logger.info("Published system metrics for #{stats.hostname}")
|
||||||
{:error, reason} ->
|
{:error, reason} ->
|
||||||
|
|||||||
@ -7,140 +7,109 @@ defmodule Systant.SystemMetrics do
|
|||||||
require Logger
|
require Logger
|
||||||
|
|
||||||
@doc """
|
@doc """
|
||||||
Collect all available system metrics
|
Collect system metrics based on configuration
|
||||||
"""
|
"""
|
||||||
def collect_metrics do
|
def collect_metrics(config \\ nil) do
|
||||||
%{
|
config = config || Systant.Config.load_config()
|
||||||
|
|
||||||
|
base_metrics = %{
|
||||||
timestamp: DateTime.utc_now() |> DateTime.to_iso8601(),
|
timestamp: DateTime.utc_now() |> DateTime.to_iso8601(),
|
||||||
hostname: get_hostname(),
|
hostname: get_hostname()
|
||||||
cpu: collect_cpu_metrics(),
|
}
|
||||||
memory: collect_memory_metrics(),
|
|
||||||
disk: collect_disk_metrics(),
|
# Collect metrics based on enabled modules
|
||||||
system: collect_system_info(),
|
enabled_modules = Systant.Config.get(config, ["general", "enabled_modules"]) || []
|
||||||
alarms: collect_active_alarms()
|
|
||||||
|
Enum.reduce(enabled_modules, base_metrics, fn module_name, acc ->
|
||||||
|
if Systant.Config.module_enabled?(config, module_name) do
|
||||||
|
case module_name do
|
||||||
|
"cpu" -> Map.put(acc, :cpu, collect_cpu_metrics(config))
|
||||||
|
"memory" -> Map.put(acc, :memory, collect_memory_metrics(config))
|
||||||
|
"disk" -> Map.put(acc, :disk, collect_disk_metrics(config))
|
||||||
|
"gpu" -> Map.put(acc, :gpu, collect_gpu_metrics(config))
|
||||||
|
"network" -> Map.put(acc, :network, collect_network_metrics(config))
|
||||||
|
"temperature" -> Map.put(acc, :temperature, collect_temperature_metrics(config))
|
||||||
|
"processes" -> Map.put(acc, :processes, collect_process_metrics(config))
|
||||||
|
"system" -> Map.put(acc, :system, collect_system_info(config))
|
||||||
|
_ -> acc
|
||||||
|
end
|
||||||
|
else
|
||||||
|
acc
|
||||||
|
end
|
||||||
|
end)
|
||||||
|
end
|
||||||
|
|
||||||
|
@doc """
|
||||||
|
Collect CPU metrics using Linux system files and commands
|
||||||
|
"""
|
||||||
|
def collect_cpu_metrics(_config) do
|
||||||
|
get_load_averages()
|
||||||
|
end
|
||||||
|
|
||||||
|
@doc """
|
||||||
|
Collect memory metrics using Linux /proc/meminfo
|
||||||
|
"""
|
||||||
|
def collect_memory_metrics(_config) do
|
||||||
|
get_memory_info()
|
||||||
|
end
|
||||||
|
|
||||||
|
@doc """
|
||||||
|
Collect disk metrics using Linux df command
|
||||||
|
"""
|
||||||
|
def collect_disk_metrics(config) do
|
||||||
|
get_disk_usage(config)
|
||||||
|
end
|
||||||
|
|
||||||
|
@doc """
|
||||||
|
Collect GPU metrics from NVIDIA and AMD GPUs
|
||||||
|
"""
|
||||||
|
def collect_gpu_metrics(config) do
|
||||||
|
gpu_config = Systant.Config.get(config, ["gpu"]) || %{}
|
||||||
|
|
||||||
|
%{
|
||||||
|
nvidia: if(gpu_config["nvidia_enabled"] != false, do: get_nvidia_gpu_info(config), else: []),
|
||||||
|
amd: if(gpu_config["amd_enabled"] != false, do: get_amd_gpu_info(config), else: [])
|
||||||
}
|
}
|
||||||
end
|
end
|
||||||
|
|
||||||
@doc """
|
@doc """
|
||||||
Collect CPU metrics using cpu_sup
|
Collect network interface statistics
|
||||||
"""
|
"""
|
||||||
def collect_cpu_metrics do
|
def collect_network_metrics(config) do
|
||||||
try do
|
get_network_stats(config)
|
||||||
# Get CPU utilization (average over all cores)
|
|
||||||
cpu_util = case :cpu_sup.util() do
|
|
||||||
{:badrpc, _} -> nil
|
|
||||||
util when is_number(util) -> util
|
|
||||||
_ -> nil
|
|
||||||
end
|
|
||||||
|
|
||||||
# Get load averages (1, 5, 15 minutes)
|
|
||||||
load_avg = case :cpu_sup.avg1() do
|
|
||||||
{:badrpc, _} -> nil
|
|
||||||
avg when is_number(avg) ->
|
|
||||||
%{
|
|
||||||
avg1: avg,
|
|
||||||
avg5: safe_call(:cpu_sup, :avg5, []),
|
|
||||||
avg15: safe_call(:cpu_sup, :avg15, [])
|
|
||||||
}
|
|
||||||
_ -> nil
|
|
||||||
end
|
|
||||||
|
|
||||||
%{
|
|
||||||
utilization: cpu_util,
|
|
||||||
load_average: load_avg
|
|
||||||
}
|
|
||||||
rescue
|
|
||||||
_ ->
|
|
||||||
Logger.warning("CPU metrics collection failed")
|
|
||||||
%{utilization: nil, load_average: nil}
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
|
|
||||||
@doc """
|
@doc """
|
||||||
Collect memory metrics using memsup
|
Collect temperature data from system sensors
|
||||||
"""
|
"""
|
||||||
def collect_memory_metrics do
|
def collect_temperature_metrics(config) do
|
||||||
try do
|
get_temperature_data(config)
|
||||||
# Get system memory data
|
|
||||||
system_memory = case :memsup.get_system_memory_data() do
|
|
||||||
{:badrpc, _} -> nil
|
|
||||||
data when is_list(data) -> Enum.into(data, %{})
|
|
||||||
_ -> nil
|
|
||||||
end
|
|
||||||
|
|
||||||
# Get memory check interval
|
|
||||||
check_interval = safe_call(:memsup, :get_check_interval, [])
|
|
||||||
|
|
||||||
%{
|
|
||||||
system: system_memory,
|
|
||||||
check_interval: check_interval
|
|
||||||
}
|
|
||||||
rescue
|
|
||||||
_ ->
|
|
||||||
Logger.warning("Memory metrics collection failed")
|
|
||||||
%{system: nil, check_interval: nil}
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
|
|
||||||
@doc """
|
@doc """
|
||||||
Collect disk metrics using disksup
|
Collect top processes by CPU and memory usage
|
||||||
"""
|
"""
|
||||||
def collect_disk_metrics do
|
def collect_process_metrics(config) do
|
||||||
try do
|
get_top_processes(config)
|
||||||
# Get disk data for all disks
|
|
||||||
disks = case :disksup.get_disk_data() do
|
|
||||||
{:badrpc, _} -> []
|
|
||||||
data when is_list(data) ->
|
|
||||||
Enum.map(data, fn {path, kb_size, capacity} ->
|
|
||||||
%{
|
|
||||||
path: List.to_string(path),
|
|
||||||
size_kb: kb_size,
|
|
||||||
capacity_percent: capacity
|
|
||||||
}
|
|
||||||
end)
|
|
||||||
_ -> []
|
|
||||||
end
|
|
||||||
|
|
||||||
%{disks: disks}
|
|
||||||
rescue
|
|
||||||
_ ->
|
|
||||||
Logger.warning("Disk metrics collection failed")
|
|
||||||
%{disks: []}
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
|
|
||||||
@doc """
|
|
||||||
Collect active system alarms from alarm_handler
|
|
||||||
"""
|
|
||||||
def collect_active_alarms do
|
|
||||||
try do
|
|
||||||
# Get all active alarms from the alarm handler
|
|
||||||
case :alarm_handler.get_alarms() do
|
|
||||||
alarms when is_list(alarms) ->
|
|
||||||
Enum.map(alarms, fn {alarm_id, alarm_desc} ->
|
|
||||||
format_alarm(alarm_id, alarm_desc)
|
|
||||||
end)
|
|
||||||
_ -> []
|
|
||||||
end
|
|
||||||
rescue
|
|
||||||
_ ->
|
|
||||||
Logger.warning("Alarm collection failed")
|
|
||||||
[]
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
@doc """
|
@doc """
|
||||||
Collect general system information
|
Collect general system information
|
||||||
"""
|
"""
|
||||||
def collect_system_info do
|
def collect_system_info(config) do
|
||||||
|
system_config = Systant.Config.get(config, ["system"]) || %{}
|
||||||
try do
|
try do
|
||||||
%{
|
base_info = %{}
|
||||||
uptime_seconds: get_uptime(),
|
|
||||||
erlang_version: System.version(),
|
base_info
|
||||||
otp_release: System.otp_release(),
|
|> maybe_add(:uptime_seconds, get_uptime(), system_config["include_uptime"])
|
||||||
schedulers: System.schedulers(),
|
|> maybe_add(:erlang_version, System.version(), true)
|
||||||
logical_processors: System.schedulers_online()
|
|> maybe_add(:otp_release, System.otp_release(), true)
|
||||||
}
|
|> maybe_add(:schedulers, System.schedulers(), true)
|
||||||
|
|> maybe_add(:logical_processors, System.schedulers_online(), true)
|
||||||
|
|> maybe_add(:kernel_version, get_kernel_version(), system_config["include_kernel_version"])
|
||||||
|
|> maybe_add(:os_info, get_os_info(), system_config["include_os_info"])
|
||||||
rescue
|
rescue
|
||||||
_ ->
|
_ ->
|
||||||
Logger.warning("System info collection failed")
|
Logger.warning("System info collection failed")
|
||||||
@ -159,45 +128,670 @@ defmodule Systant.SystemMetrics do
|
|||||||
|
|
||||||
defp get_uptime do
|
defp get_uptime do
|
||||||
try do
|
try do
|
||||||
# Get system uptime in milliseconds, convert to seconds
|
# Get actual system uptime by reading /proc/uptime on Linux
|
||||||
:erlang.statistics(:wall_clock) |> elem(0) |> div(1000)
|
case File.read("/proc/uptime") do
|
||||||
|
{:ok, content} ->
|
||||||
|
content
|
||||||
|
|> String.trim()
|
||||||
|
|> String.split(" ")
|
||||||
|
|> List.first()
|
||||||
|
|> String.to_float()
|
||||||
|
|> trunc()
|
||||||
|
_ ->
|
||||||
|
# Fallback to Erlang VM uptime if /proc/uptime unavailable
|
||||||
|
:erlang.statistics(:wall_clock) |> elem(0) |> div(1000)
|
||||||
|
end
|
||||||
rescue
|
rescue
|
||||||
_ -> nil
|
_ -> nil
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
defp safe_call(module, function, args) do
|
|
||||||
|
# Linux system metrics implementation
|
||||||
|
|
||||||
|
defp get_load_averages do
|
||||||
try do
|
try do
|
||||||
apply(module, function, args)
|
case File.read("/proc/loadavg") do
|
||||||
catch
|
{:ok, content} ->
|
||||||
:exit, _ -> nil
|
[avg1, avg5, avg15 | _] = String.split(String.trim(content), " ")
|
||||||
:error, _ -> nil
|
%{
|
||||||
|
avg1: String.to_float(avg1),
|
||||||
|
avg5: String.to_float(avg5),
|
||||||
|
avg15: String.to_float(avg15)
|
||||||
|
}
|
||||||
|
_ -> nil
|
||||||
|
end
|
||||||
|
rescue
|
||||||
|
_ -> nil
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
defp format_alarm(alarm_id, _alarm_desc) do
|
|
||||||
case alarm_id do
|
defp get_memory_info do
|
||||||
{:disk_almost_full, path} when is_list(path) ->
|
try do
|
||||||
%{
|
case File.read("/proc/meminfo") do
|
||||||
severity: "warning",
|
{:ok, content} ->
|
||||||
path: List.to_string(path),
|
# Parse /proc/meminfo into a map
|
||||||
id: "disk_almost_full"
|
meminfo = content
|
||||||
}
|
|> String.split("\n")
|
||||||
{:system_memory_high_watermark, _} ->
|
|> Enum.reduce(%{}, fn line, acc ->
|
||||||
%{
|
case String.split(line, ":") do
|
||||||
severity: "critical",
|
[key, value] ->
|
||||||
id: "system_memory_high_watermark"
|
# Extract numeric value (remove "kB" suffix)
|
||||||
}
|
clean_value = value |> String.trim() |> String.replace(" kB", "")
|
||||||
atom when is_atom(atom) ->
|
case Integer.parse(clean_value) do
|
||||||
%{
|
{num, _} -> Map.put(acc, String.trim(key), num)
|
||||||
severity: "info",
|
_ -> acc
|
||||||
id: Atom.to_string(atom)
|
end
|
||||||
}
|
_ -> acc
|
||||||
_ ->
|
end
|
||||||
%{
|
end)
|
||||||
severity: "info",
|
|
||||||
id: inspect(alarm_id)
|
total = Map.get(meminfo, "MemTotal", 0)
|
||||||
}
|
available = Map.get(meminfo, "MemAvailable", 0)
|
||||||
|
free = Map.get(meminfo, "MemFree", 0)
|
||||||
|
used = total - available
|
||||||
|
|
||||||
|
%{
|
||||||
|
total_kb: total,
|
||||||
|
available_kb: available,
|
||||||
|
free_kb: free,
|
||||||
|
used_kb: used,
|
||||||
|
used_percent: if(total > 0, do: Float.round(used / total * 100.0, 2), else: 0)
|
||||||
|
}
|
||||||
|
_ -> nil
|
||||||
|
end
|
||||||
|
rescue
|
||||||
|
_ -> nil
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Collects per-filesystem usage by running `df -h` (tmpfs and devtmpfs
# excluded). Returns `%{disks: [...]}`; each entry carries the raw
# human-readable df columns plus an integer `use_percent`. Rows that do not
# split into exactly six columns are dropped. Any failure (non-zero exit
# status or a raised error) yields `%{disks: []}`.
defp get_disk_usage do
  df_args = ["-h", "--exclude-type=tmpfs", "--exclude-type=devtmpfs"]

  case System.cmd("df", df_args) do
    {output, 0} ->
      # First line of df output is the column header.
      rows = output |> String.split("\n") |> Enum.drop(1)

      disks =
        for row <- rows,
            String.trim(row) != "",
            [filesystem, size, used, available, use_percent, mounted_on] <- [String.split(row)] do
          %{
            filesystem: filesystem,
            size: size,
            used: used,
            available: available,
            use_percent: parse_percentage(String.replace(use_percent, "%", "")),
            mounted_on: mounted_on
          }
        end

      %{disks: disks}

    _failed ->
      %{disks: []}
  end
rescue
  _ -> %{disks: []}
end
|
||||||
|
|
||||||
|
# Parses the leading integer of `str`; returns 0 when there is none.
defp parse_percentage(str) do
  with {value, _rest} <- Integer.parse(str) do
    value
  else
    _ -> 0
  end
end
|
||||||
|
|
||||||
|
# GPU Metrics Implementation
|
||||||
|
# Queries NVIDIA GPUs through `nvidia-smi` (CSV, no header, no units) and
# returns one map per well-formed output row. `id` is the row's position
# among the non-blank output lines. Returns [] when the command exits
# non-zero or raises.
defp get_nvidia_gpu_info do
  query =
    "--query-gpu=name,utilization.gpu,utilization.memory,temperature.gpu,memory.used,memory.total"

  case System.cmd("nvidia-smi", [query, "--format=csv,noheader,nounits"]) do
    {csv, 0} ->
      rows =
        csv
        |> String.split("\n")
        |> Enum.reject(&(String.trim(&1) == ""))

      # Rows that do not split into exactly six fields are skipped.
      for {row, position} <- Enum.with_index(rows),
          [name, gpu_util, mem_util, temp, mem_used, mem_total] <- [String.split(row, ", ")] do
        %{
          id: position,
          name: String.trim(name),
          utilization_percent: parse_int(gpu_util),
          memory_utilization_percent: parse_int(mem_util),
          temperature_c: parse_int(temp),
          memory_used_mb: parse_int(mem_used),
          memory_total_mb: parse_int(mem_total)
        }
      end

    _failed ->
      []
  end
rescue
  _ -> []
end
|
||||||
|
|
||||||
|
# Collects AMD GPU info, preferring `rocm-smi` and falling back to basic
# sysfs detection.
#
# `System.cmd/3` raises when the executable does not exist, so the presence
# of `rocm-smi` is checked first; otherwise the sysfs fallback would never
# run on hosts without ROCm installed. The fallback is also used when
# `rocm-smi` exits with a non-zero status. Any unexpected error yields [].
defp get_amd_gpu_info do
  case System.find_executable("rocm-smi") do
    nil ->
      get_amd_sysfs_info()

    _path ->
      case System.cmd("rocm-smi", ["--showuse", "--showtemp", "--showmemuse", "--csv"]) do
        {output, 0} -> parse_rocm_smi_output(output)
        _ -> get_amd_sysfs_info()
      end
  end
rescue
  _ -> []
end
|
||||||
|
|
||||||
|
# Turns `rocm-smi --csv` output into a list of GPU maps. The first non-blank
# line is taken as the header; each following line is zipped against it so
# values can be looked up by column title. Returns [] for empty output or
# when anything raises.
defp parse_rocm_smi_output(output) do
  non_blank = for line <- String.split(output, "\n"), String.trim(line) != "", do: line

  case non_blank do
    [] ->
      []

    [header | rows] ->
      columns = String.split(header, ",")

      rows
      |> Enum.with_index()
      |> Enum.map(fn {row, position} ->
        fields = Map.new(Enum.zip(columns, String.split(row, ",")))

        %{
          id: position,
          name: "AMD GPU #{Map.get(fields, "device", "unknown")}",
          utilization_percent: parse_int(Map.get(fields, "GPU use (%)", "0")),
          memory_utilization_percent:
            parse_int(Map.get(fields, "GPU Memory Allocated (VRAM%)", "0")),
          temperature_c: parse_float(Map.get(fields, "Temperature (Sensor edge) (C)", "0")),
          # rocm-smi doesn't provide absolute memory values in this format
          memory_used_mb: nil,
          memory_total_mb: nil
        }
      end)
  end
rescue
  _ -> []
end
|
||||||
|
|
||||||
|
# Basic AMD GPU detection via sysfs.
#
# Lists `/sys/class/drm` and keeps only real card nodes (`card<N>`) whose PCI
# vendor id is AMD's. Connector nodes such as `card0-DP-1` and cards from
# other vendors are skipped, so they are no longer reported as "AMD GPU".
# Entries are sorted because `File.ls/1` does not guarantee an order, which
# keeps the `id` of each card stable between collections. Only the first 4
# cards are reported; no utilisation data is available through this path, so
# those fields are nil. Returns [] when the directory cannot be listed.
defp get_amd_sysfs_info do
  case File.ls("/sys/class/drm") do
    {:ok, entries} ->
      entries
      |> Enum.filter(&amd_drm_card?/1)
      |> Enum.sort()
      # Limit to first 4 GPUs
      |> Enum.take(4)
      |> Enum.with_index()
      |> Enum.map(fn {card, index} ->
        %{
          id: index,
          name: "AMD GPU #{card}",
          utilization_percent: nil,
          memory_utilization_percent: nil,
          temperature_c: nil,
          memory_used_mb: nil,
          memory_total_mb: nil
        }
      end)

    _ ->
      []
  end
rescue
  _ -> []
end

# True when `entry` is a DRM card node whose `device/vendor` file reports
# the AMD PCI vendor id (0x1002).
defp amd_drm_card?(entry) do
  vendor_file = Path.join(["/sys/class/drm", entry, "device", "vendor"])

  Regex.match?(~r/^card\d+$/, entry) and match?({:ok, "0x1002" <> _}, File.read(vendor_file))
end
|
||||||
|
|
||||||
|
# Network Metrics Implementation
|
||||||
|
# Reads per-interface counters from /proc/net/dev. The two header lines are
# skipped and lines that cannot be parsed are dropped. Returns [] when the
# file cannot be read or anything raises.
defp get_network_stats do
  with {:ok, content} <- File.read("/proc/net/dev") do
    content
    |> String.split("\n")
    # Skip header lines
    |> Enum.drop(2)
    |> Enum.reject(&(String.trim(&1) == ""))
    |> Enum.map(&parse_network_interface/1)
    |> Enum.reject(&is_nil/1)
  else
    _ -> []
  end
rescue
  _ -> []
end
|
||||||
|
|
||||||
|
# Parses one "<interface>: <counters...>" line. Returns a map with the
# receive/transmit byte, packet, error and drop counters, or nil when the
# line does not split into exactly two parts on ":" or carries fewer than
# 16 counters.
defp parse_network_interface(line) do
  with [name_part, counters_part] <- String.split(line, ":"),
       # Requires at least 16 columns; counters 1-4 are taken as receive
       # values and 9-12 as transmit values.
       [rx_bytes, rx_packets, rx_errs, rx_drop, _, _, _, _,
        tx_bytes, tx_packets, tx_errs, tx_drop, _, _, _, _ | _] <-
         counters_part |> String.trim() |> String.split() |> Enum.map(&parse_int/1) do
    %{
      interface: String.trim(name_part),
      rx_bytes: rx_bytes,
      rx_packets: rx_packets,
      rx_errors: rx_errs,
      rx_dropped: rx_drop,
      tx_bytes: tx_bytes,
      tx_packets: tx_packets,
      tx_errors: tx_errs,
      tx_dropped: tx_drop
    }
  else
    _ -> nil
  end
end
|
||||||
|
|
||||||
|
# Temperature Metrics Implementation
|
||||||
|
# Collects both temperature sources unconditionally: the CPU reading under
# :cpu and the lm-sensors summary under :sensors.
defp get_temperature_data do
  cpu_reading = get_cpu_temperature()
  sensor_readings = get_lm_sensors_data()

  %{cpu: cpu_reading, sensors: sensor_readings}
end
|
||||||
|
|
||||||
|
# Returns the first positive reading found among a fixed list of sysfs
# temperature files, divided by 1000.0, or nil when none yields a value
# greater than zero. Any raised exception also results in nil.
defp get_cpu_temperature do
  candidate_paths = [
    "/sys/class/thermal/thermal_zone0/temp",
    "/sys/class/thermal/thermal_zone1/temp",
    "/sys/class/hwmon/hwmon0/temp1_input",
    "/sys/class/hwmon/hwmon1/temp1_input"
  ]

  Enum.find_value(candidate_paths, fn path ->
    with {:ok, raw} <- File.read(path),
         reading when reading > 0 <- raw |> String.trim() |> parse_int() do
      reading / 1000.0
    else
      _ -> nil
    end
  end)
rescue
  _ -> nil
end
|
||||||
|
|
||||||
|
# Runs `sensors -A -j`, decodes its JSON output with Jason and reduces it
# via simplify_sensors_data/1. A non-zero exit status, undecodable output
# or any raised exception yields an empty map.
defp get_lm_sensors_data do
  with {json, 0} <- System.cmd("sensors", ["-A", "-j"]),
       {:ok, decoded} <- Jason.decode(json) do
    simplify_sensors_data(decoded)
  else
    _ -> %{}
  end
rescue
  _ -> %{}
end
|
||||||
|
|
||||||
|
# Builds a chip-name => temperatures map from decoded sensors data.
# Chips whose value is not a map, or for which extract_temperatures/1
# returns an empty map, are left out. Non-map input yields %{}.
defp simplify_sensors_data(sensors_data) when is_map(sensors_data) do
  for {chip_name, chip_map} <- sensors_data,
      is_map(chip_map),
      temps = extract_temperatures(chip_map),
      map_size(temps) > 0,
      into: %{} do
    {chip_name, temps}
  end
end

defp simplify_sensors_data(_), do: %{}
|
||||||
|
|
||||||
|
# Builds a sensor-name => temperature map for one chip.
#
# For every sensor whose value is a map, the numeric value stored under a
# "tempN_input" key is used. N may be any positive index, not only 1-3, so
# entries such as "Core 2" => %{"temp4_input" => ...} are kept. When a
# sensor carries several such keys the lowest N wins.
#
# Sensors without a numeric tempN_input are omitted. Non-map input yields %{}.
defp extract_temperatures(chip_data) when is_map(chip_data) do
  for {sensor_name, sensor_map} <- chip_data,
      is_map(sensor_map),
      reading = lowest_temp_input(sensor_map),
      into: %{} do
    {sensor_name, reading}
  end
end

defp extract_temperatures(_), do: %{}

# Returns the numeric value of the lowest-numbered "tempN_input" key in
# `sensor_map`, or nil when there is none.
defp lowest_temp_input(sensor_map) do
  candidates =
    Enum.flat_map(sensor_map, fn
      {key, value} when is_binary(key) and is_number(value) ->
        case Regex.run(~r/^temp(\d+)_input$/, key) do
          [_, index] -> [{String.to_integer(index), value}]
          _ -> []
        end

      _ ->
        []
    end)

  case Enum.sort(candidates) do
    [{_index, value} | _] -> value
    [] -> nil
  end
end
|
||||||
|
|
||||||
|
# Process Metrics Implementation
|
||||||
|
# Runs `ps aux` sorted by descending CPU and parses the first 10 output
# lines with parse_process_line/1. Blank and unparseable lines among those
# 10 are dropped. A non-zero exit status or raised exception yields [].
defp get_top_processes do
  with {listing, 0} <- System.cmd("ps", ["aux", "--sort=-pcpu", "--no-headers"]) do
    for row <- listing |> String.split("\n") |> Enum.take(10),
        String.trim(row) != "",
        proc = parse_process_line(row),
        do: proc
  else
    _ -> []
  end
rescue
  _ -> []
end
|
||||||
|
|
||||||
|
# Turns one whitespace-separated `ps aux` line into a process map.
# The first ten tokens are the fixed columns (user, pid, cpu, mem and six
# ignored ones). Everything after them is re-joined with single spaces as
# the command, cut to 50 characters. Lines with fewer than ten tokens
# give nil.
defp parse_process_line(line) do
  case line |> String.split() |> Enum.split(10) do
    {[user, pid, cpu, mem, _, _, _, _, _, _], argv} ->
      command = argv |> Enum.join(" ") |> String.slice(0, 50)

      %{
        user: user,
        pid: parse_int(pid),
        cpu_percent: parse_float(cpu),
        memory_percent: parse_float(mem),
        command: command
      }

    _ ->
      nil
  end
end
|
||||||
|
|
||||||
|
# Helper functions
|
||||||
|
# Lenient integer conversion: integers pass through, binaries are parsed
# with Integer.parse/1 (leading integer prefix is enough), and anything
# else — including unparseable binaries — becomes 0.
defp parse_int(value) when is_integer(value), do: value

defp parse_int(value) when is_binary(value) do
  case Integer.parse(value) do
    :error -> 0
    {parsed, _rest} -> parsed
  end
end

defp parse_int(_other), do: 0
|
||||||
|
|
||||||
|
# Lenient float conversion: floats pass through, integers are widened to
# floats, binaries are parsed with Float.parse/1 (leading numeric prefix is
# enough), and anything else — including unparseable binaries — becomes 0.0.
defp parse_float(value) when is_float(value), do: value
defp parse_float(value) when is_integer(value), do: value * 1.0

defp parse_float(value) when is_binary(value) do
  case Float.parse(value) do
    :error -> 0.0
    {parsed, _rest} -> parsed
  end
end

defp parse_float(_other), do: 0.0
|
||||||
|
|
||||||
|
# Configuration-aware helper functions
|
||||||
|
# Puts `value` under `key` only when the fourth argument is truthy;
# `false` and `nil` leave the map untouched.
defp maybe_add(map, key, value, enabled) do
  if enabled, do: Map.put(map, key, value), else: map
end
|
||||||
|
|
||||||
|
# Returns the trimmed contents of /proc/version cut to 100 characters,
# or nil when the file cannot be read.
defp get_kernel_version do
  with {:ok, raw} <- File.read("/proc/version") do
    raw
    |> String.trim()
    |> String.slice(0, 100)
  else
    _ -> nil
  end
end
|
||||||
|
|
||||||
|
# Parses /etc/os-release into a map and keeps only the "name", "version",
# "id" and "version_id" entries. Each `KEY=value` line contributes a
# lower-cased key and a value with surrounding double quotes stripped;
# lines without "=" are ignored. An unreadable file or raised exception
# yields %{}.
defp get_os_info do
  with {:ok, raw} <- File.read("/etc/os-release") do
    fields =
      for line <- String.split(raw, "\n"),
          [key, value] <- [String.split(line, "=", parts: 2)],
          into: %{} do
        {String.downcase(key), String.trim(value, "\"")}
      end

    Map.take(fields, ["name", "version", "id", "version_id"])
  else
    _ -> %{}
  end
rescue
  _ -> %{}
end
|
||||||
|
|
||||||
|
# Update helper functions to accept config parameter
|
||||||
|
# Collects disk usage from `df -h` (tmpfs/devtmpfs excluded) and applies the
# "disk" section of the configuration through filter_disks/2.
#
# Returns %{disks: [...]}, where each entry holds the six df columns.
# `use_percent` has its "%" removed and is passed through parse_percentage/1.
# A non-zero exit status or any raised exception yields %{disks: []}.
defp get_disk_usage(config) do
  disk_config = Systant.Config.get(config, ["disk"]) || %{}

  try do
    case System.cmd("df", ["-h", "--exclude-type=tmpfs", "--exclude-type=devtmpfs"]) do
      {output, 0} ->
        disks =
          output
          |> String.split("\n")
          # First line is the column header.
          |> Enum.drop(1)
          |> Enum.filter(&(String.trim(&1) != ""))
          |> Enum.map(fn line ->
            # Split into at most 6 parts so that a mount point containing
            # spaces stays intact in the last column instead of producing
            # extra tokens and being discarded.
            case line |> String.trim() |> String.split(~r/\s+/, parts: 6) do
              [filesystem, size, used, available, use_percent, mounted_on] ->
                %{
                  filesystem: filesystem,
                  size: size,
                  used: used,
                  available: available,
                  use_percent: String.replace(use_percent, "%", "") |> parse_percentage(),
                  mounted_on: mounted_on
                }

              _ ->
                nil
            end
          end)
          |> Enum.filter(&(&1 != nil))
          |> filter_disks(disk_config)

        %{disks: disks}

      _ ->
        %{disks: []}
    end
  rescue
    _ -> %{disks: []}
  end
end
|
||||||
|
|
||||||
|
# Applies the disk filters from `config` (string keys) to a list of disk maps.
#
# A disk is dropped when any of the following holds:
#   * "include_mounts" is non-empty and none of its entries is a substring
#     of the mount point;
#   * the mount point starts with an entry of "exclude_mounts";
#   * the filesystem column contains an entry of "exclude_types";
#   * its use_percent is below "min_usage_percent" (default 0).
defp filter_disks(disks, config) do
  wanted_mounts = config["include_mounts"] || []
  unwanted_mounts = config["exclude_mounts"] || []
  unwanted_types = config["exclude_types"] || []
  usage_floor = config["min_usage_percent"] || 0

  Enum.reject(disks, fn disk ->
    not_wanted? =
      not Enum.empty?(wanted_mounts) and
        not Enum.any?(wanted_mounts, &String.contains?(disk.mounted_on, &1))

    mount_excluded? = Enum.any?(unwanted_mounts, &String.starts_with?(disk.mounted_on, &1))
    type_excluded? = Enum.any?(unwanted_types, &String.contains?(disk.filesystem, &1))
    below_floor? = disk.use_percent < usage_floor

    not_wanted? or mount_excluded? or type_excluded? or below_floor?
  end)
end
|
||||||
|
|
||||||
|
# Queries nvidia-smi in CSV mode and returns one map per GPU line, limited
# to the configured "max_gpus" (default 8). Ids are the line positions
# after truncation. Lines that do not split into exactly six ", "-separated
# fields are dropped. A non-zero exit status or raised exception yields [].
defp get_nvidia_gpu_info(config) do
  gpu_settings = Systant.Config.get(config, ["gpu"]) || %{}
  gpu_limit = gpu_settings["max_gpus"] || 8

  smi_args = [
    "--query-gpu=name,utilization.gpu,utilization.memory,temperature.gpu,memory.used,memory.total",
    "--format=csv,noheader,nounits"
  ]

  try do
    with {csv, 0} <- System.cmd("nvidia-smi", smi_args) do
      csv
      |> String.split("\n")
      |> Enum.reject(&(String.trim(&1) == ""))
      |> Enum.take(gpu_limit)
      |> Enum.with_index()
      |> Enum.flat_map(fn {row, position} ->
        case String.split(row, ", ") do
          [name, gpu_util, mem_util, temp, mem_used, mem_total] ->
            [
              %{
                id: position,
                name: String.trim(name),
                utilization_percent: parse_int(gpu_util),
                memory_utilization_percent: parse_int(mem_util),
                temperature_c: parse_int(temp),
                memory_used_mb: parse_int(mem_used),
                memory_total_mb: parse_int(mem_total)
              }
            ]

          _ ->
            []
        end
      end)
    else
      _ -> []
    end
  rescue
    _ -> []
  end
end
|
||||||
|
|
||||||
|
# Returns AMD GPU entries, limited to the configured "max_gpus" (default 8).
#
# rocm-smi is tried first. When it exits non-zero OR cannot be executed at
# all, the sysfs fallback get_amd_sysfs_info/0 is used. System.cmd/3 raises
# ErlangError (e.g. :enoent) for a missing executable, so that case is
# rescued explicitly; otherwise the fallback would never run on machines
# without rocm-smi. Any other exception still yields [].
defp get_amd_gpu_info(config) do
  gpu_config = Systant.Config.get(config, ["gpu"]) || %{}
  max_gpus = gpu_config["max_gpus"] || 8

  try do
    rocm_output =
      try do
        case System.cmd("rocm-smi", ["--showuse", "--showtemp", "--showmemuse", "--csv"]) do
          {output, 0} -> output
          _ -> nil
        end
      rescue
        # Executable missing or not runnable: treat like a failed run.
        ErlangError -> nil
      end

    gpus =
      if rocm_output do
        parse_rocm_smi_output(rocm_output)
      else
        get_amd_sysfs_info()
      end

    Enum.take(gpus, max_gpus)
  rescue
    _ -> []
  end
end
|
||||||
|
|
||||||
|
# Config-aware variant: parses /proc/net/dev (skipping its two header
# lines), drops blank or unparseable lines, then applies the "network"
# section of the configuration through filter_network_interfaces/2.
# A read failure or raised exception yields [].
defp get_network_stats(config) do
  network_config = Systant.Config.get(config, ["network"]) || %{}

  try do
    with {:ok, raw} <- File.read("/proc/net/dev") do
      parsed =
        for row <- raw |> String.split("\n") |> Enum.drop(2),
            String.trim(row) != "",
            stats = parse_network_interface(row),
            do: stats

      filter_network_interfaces(parsed, network_config)
    else
      _ -> []
    end
  rescue
    _ -> []
  end
end
|
||||||
|
|
||||||
|
# Applies the network filters from `config` (string keys) to interface maps.
#
# An interface is dropped when any of the following holds:
#   * "include_interfaces" is non-empty and does not contain its exact name;
#   * its name starts with an entry of "exclude_interfaces";
#   * rx_bytes + tx_bytes is below "min_bytes_threshold" (default 0).
defp filter_network_interfaces(interfaces, config) do
  allow_list = config["include_interfaces"] || []
  deny_prefixes = config["exclude_interfaces"] || []
  traffic_floor = config["min_bytes_threshold"] || 0

  Enum.reject(interfaces, fn iface ->
    not_listed? =
      not Enum.empty?(allow_list) and not Enum.member?(allow_list, iface.interface)

    denied? = Enum.any?(deny_prefixes, &String.starts_with?(iface.interface, &1))
    too_quiet? = iface.rx_bytes + iface.tx_bytes < traffic_floor

    not_listed? or denied? or too_quiet?
  end)
end
|
||||||
|
|
||||||
|
# Config-aware variant: builds a map with :cpu and/or :sensors readings.
# A source is skipped only when its flag in the "temperature" section is
# explicitly `false`; a missing flag counts as enabled. Collectors run in
# the order listed, and only for enabled sources.
defp get_temperature_data(config) do
  temp_config = Systant.Config.get(config, ["temperature"]) || %{}

  sources = [
    {:cpu, "cpu_temp_enabled", &get_cpu_temperature/0},
    {:sensors, "sensors_enabled", &get_lm_sensors_data/0}
  ]

  Enum.reduce(sources, %{}, fn {key, flag, collect}, acc ->
    if temp_config[flag] == false do
      acc
    else
      Map.put(acc, key, collect.())
    end
  end)
end
|
||||||
|
|
||||||
|
# Config-aware variant: returns up to "max_processes" (default 10) process
# maps from `ps aux`, sorted by descending CPU or — when "sort_by" is
# "memory" — descending memory.
#
# Processes below "min_cpu_percent" or "min_memory_percent" are skipped.
# The filters run lazily over the whole listing and only then is the result
# truncated, so the limit is always filled when enough processes qualify.
# Truncating the raw output first could return fewer entries than asked for.
#
# Commands are cut to "max_command_length" (default 50) by
# parse_process_line/2. A non-zero exit status or raised exception yields [].
defp get_top_processes(config) do
  process_config = Systant.Config.get(config, ["processes"]) || %{}
  max_processes = process_config["max_processes"] || 10
  sort_by = process_config["sort_by"] || "cpu"
  min_cpu = process_config["min_cpu_percent"] || 0.0
  min_memory = process_config["min_memory_percent"] || 0.0
  max_cmd_len = process_config["max_command_length"] || 50

  sort_flag =
    case sort_by do
      "memory" -> "-pmem"
      _ -> "-pcpu"
    end

  try do
    case System.cmd("ps", ["aux", "--sort=#{sort_flag}", "--no-headers"]) do
      {output, 0} ->
        output
        |> String.split("\n")
        |> Stream.reject(&(String.trim(&1) == ""))
        |> Stream.map(&parse_process_line(&1, max_cmd_len))
        |> Stream.reject(&is_nil/1)
        |> Stream.filter(fn proc ->
          proc.cpu_percent >= min_cpu and proc.memory_percent >= min_memory
        end)
        # Streams stop as soon as enough matching processes were found.
        |> Enum.take(max_processes)

      _ ->
        []
    end
  rescue
    _ -> []
  end
end
|
||||||
|
|
||||||
|
# Turns one whitespace-separated `ps aux` line into a process map, cutting
# the command to `max_cmd_len` characters. The first ten tokens are the
# fixed columns; the remainder is re-joined with single spaces as the
# command. Lines with fewer than ten tokens give nil.
defp parse_process_line(line, max_cmd_len) do
  case line |> String.split() |> Enum.split(10) do
    {[user, pid, cpu, mem, _, _, _, _, _, _], argv} ->
      command = argv |> Enum.join(" ") |> String.slice(0, max_cmd_len)

      %{
        user: user,
        pid: parse_int(pid),
        cpu_percent: parse_float(cpu),
        memory_percent: parse_float(mem),
        command: command
      }

    _ ->
      nil
  end
end
|
||||||
|
|
||||||
end
|
end
|
||||||
@ -15,7 +15,7 @@ defmodule SystemStatsDaemon.MixProject do
|
|||||||
# Run "mix help compile.app" to learn about applications.
|
# Run "mix help compile.app" to learn about applications.
|
||||||
def application do
|
def application do
|
||||||
[
|
[
|
||||||
extra_applications: [:logger, :os_mon],
|
extra_applications: [:logger],
|
||||||
mod: {Systant.Application, []}
|
mod: {Systant.Application, []}
|
||||||
]
|
]
|
||||||
end
|
end
|
||||||
@ -24,7 +24,8 @@ defmodule SystemStatsDaemon.MixProject do
|
|||||||
defp deps do
|
defp deps do
|
||||||
[
|
[
|
||||||
{:tortoise, "~> 0.9.5"},
|
{:tortoise, "~> 0.9.5"},
|
||||||
{:jason, "~> 1.4"}
|
{:jason, "~> 1.4"},
|
||||||
|
{:toml, "~> 0.7"}
|
||||||
]
|
]
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|||||||
@ -6,5 +6,6 @@
|
|||||||
"gun": {:hex, :gun, "2.1.0", "b4e4cbbf3026d21981c447e9e7ca856766046eff693720ba43114d7f5de36e87", [:make, :rebar3], [{:cowlib, "2.13.0", [hex: :cowlib, repo: "hexpm", optional: false]}], "hexpm", "52fc7fc246bfc3b00e01aea1c2854c70a366348574ab50c57dfe796d24a0101d"},
|
"gun": {:hex, :gun, "2.1.0", "b4e4cbbf3026d21981c447e9e7ca856766046eff693720ba43114d7f5de36e87", [:make, :rebar3], [{:cowlib, "2.13.0", [hex: :cowlib, repo: "hexpm", optional: false]}], "hexpm", "52fc7fc246bfc3b00e01aea1c2854c70a366348574ab50c57dfe796d24a0101d"},
|
||||||
"jason": {:hex, :jason, "1.4.4", "b9226785a9aa77b6857ca22832cffa5d5011a667207eb2a0ad56adb5db443b8a", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "c5eb0cab91f094599f94d55bc63409236a8ec69a21a67814529e8d5f6cc90b3b"},
|
"jason": {:hex, :jason, "1.4.4", "b9226785a9aa77b6857ca22832cffa5d5011a667207eb2a0ad56adb5db443b8a", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "c5eb0cab91f094599f94d55bc63409236a8ec69a21a67814529e8d5f6cc90b3b"},
|
||||||
"syslog": {:hex, :syslog, "1.1.0", "6419a232bea84f07b56dc575225007ffe34d9fdc91abe6f1b2f254fd71d8efc2", [:rebar3], [], "hexpm", "4c6a41373c7e20587be33ef841d3de6f3beba08519809329ecc4d27b15b659e1"},
|
"syslog": {:hex, :syslog, "1.1.0", "6419a232bea84f07b56dc575225007ffe34d9fdc91abe6f1b2f254fd71d8efc2", [:rebar3], [], "hexpm", "4c6a41373c7e20587be33ef841d3de6f3beba08519809329ecc4d27b15b659e1"},
|
||||||
|
"toml": {:hex, :toml, "0.7.0", "fbcd773caa937d0c7a02c301a1feea25612720ac3fa1ccb8bfd9d30d822911de", [:mix], [], "hexpm", "0690246a2478c1defd100b0c9b89b4ea280a22be9a7b313a8a058a2408a2fa70"},
|
||||||
"tortoise": {:hex, :tortoise, "0.9.9", "2e467570ef1d342d4de8fdc6ba3861f841054ab524080ec3d7052ee07c04501d", [:mix], [{:gen_state_machine, "~> 2.0 or ~> 3.0", [hex: :gen_state_machine, repo: "hexpm", optional: false]}], "hexpm", "4a316220b4b443c2497f42702f0c0616af3e4b2cbc6c150ebebb51657a773797"},
|
"tortoise": {:hex, :tortoise, "0.9.9", "2e467570ef1d342d4de8fdc6ba3861f841054ab524080ec3d7052ee07c04501d", [:mix], [{:gen_state_machine, "~> 2.0 or ~> 3.0", [hex: :gen_state_machine, repo: "hexpm", optional: false]}], "hexpm", "4a316220b4b443c2497f42702f0c0616af3e4b2cbc6c150ebebb51657a773797"},
|
||||||
}
|
}
|
||||||
|
|||||||
96
server/systant.toml
Normal file
96
server/systant.toml
Normal file
@ -0,0 +1,96 @@
|
|||||||
|
# Systant Configuration File
|
||||||
|
# This file controls which metrics are collected and how they're reported
|
||||||
|
|
||||||
|
[general]
|
||||||
|
# Enable/disable entire metric categories
|
||||||
|
enabled_modules = ["cpu", "memory", "disk", "gpu", "network", "temperature", "processes", "system"]
|
||||||
|
|
||||||
|
# Collection intervals (in milliseconds)
|
||||||
|
collection_interval = 30000 # 30 seconds
|
||||||
|
startup_delay = 5000 # 5 seconds
|
||||||
|
|
||||||
|
[cpu]
|
||||||
|
# CPU metrics are always lightweight, no specific options needed
|
||||||
|
enabled = true
|
||||||
|
|
||||||
|
[memory]
|
||||||
|
enabled = true
|
||||||
|
# Show detailed breakdown (buffers, cached, etc.)
|
||||||
|
show_detailed = true
|
||||||
|
|
||||||
|
[disk]
|
||||||
|
enabled = true
|
||||||
|
# Specific mount points to monitor (empty = all)
|
||||||
|
include_mounts = []
|
||||||
|
# Mount points to exclude
|
||||||
|
exclude_mounts = ["/snap", "/boot", "/dev", "/sys", "/proc", "/run", "/tmp"]
|
||||||
|
# Filesystem types to exclude
|
||||||
|
exclude_types = ["tmpfs", "devtmpfs", "squashfs", "overlay"]
|
||||||
|
# Only show disks above this usage percentage
|
||||||
|
min_usage_percent = 1
|
||||||
|
|
||||||
|
[gpu]
|
||||||
|
enabled = true
|
||||||
|
# Enable NVIDIA GPU monitoring (requires nvidia-smi)
|
||||||
|
nvidia_enabled = true
|
||||||
|
# Enable AMD GPU monitoring (requires rocm-smi or sysfs)
|
||||||
|
amd_enabled = true
|
||||||
|
# Maximum number of GPUs to report
|
||||||
|
max_gpus = 8
|
||||||
|
|
||||||
|
[network]
|
||||||
|
enabled = true
|
||||||
|
# Specific interfaces to monitor (empty = all)
|
||||||
|
include_interfaces = []
|
||||||
|
# Interfaces to exclude (common virtual/loopback interfaces)
|
||||||
|
exclude_interfaces = ["lo", "docker0", "br-", "veth", "virbr"]
|
||||||
|
# Only show interfaces with traffic above this threshold (bytes)
|
||||||
|
min_bytes_threshold = 1024
|
||||||
|
|
||||||
|
[temperature]
|
||||||
|
enabled = true
|
||||||
|
# Enable CPU temperature monitoring
|
||||||
|
cpu_temp_enabled = true
|
||||||
|
# Enable lm-sensors integration (requires 'sensors' command)
|
||||||
|
sensors_enabled = true
|
||||||
|
# Temperature units: "celsius" or "fahrenheit"
|
||||||
|
temp_unit = "celsius"
|
||||||
|
|
||||||
|
[processes]
|
||||||
|
enabled = true
|
||||||
|
# Number of top processes to report
|
||||||
|
max_processes = 10
|
||||||
|
# Sort by: "cpu" or "memory"
|
||||||
|
sort_by = "cpu"
|
||||||
|
# Minimum CPU percentage to include process
|
||||||
|
min_cpu_percent = 0.1
|
||||||
|
# Minimum memory percentage to include process
|
||||||
|
min_memory_percent = 0.1
|
||||||
|
# Truncate command names to this length
|
||||||
|
max_command_length = 50
|
||||||
|
|
||||||
|
[system]
|
||||||
|
enabled = true
|
||||||
|
# Additional system info to collect
|
||||||
|
include_uptime = true
|
||||||
|
include_load_average = true
|
||||||
|
include_kernel_version = true
|
||||||
|
include_os_info = true
|
||||||
|
|
||||||
|
# MQTT Configuration (can be overridden by environment variables)
|
||||||
|
[mqtt]
|
||||||
|
host = "mqtt.home"
|
||||||
|
port = 1883
|
||||||
|
client_id_prefix = "systant"
|
||||||
|
username = ""
|
||||||
|
password = ""
|
||||||
|
# Topics are auto-generated as: systant/{hostname}/stats and systant/{hostname}/commands
|
||||||
|
# QoS level (0, 1, or 2)
|
||||||
|
qos = 0
|
||||||
|
|
||||||
|
[logging]
|
||||||
|
# Log level: "debug", "info", "warning", "error"
|
||||||
|
level = "info"
|
||||||
|
# Log configuration loading and metric collection details
|
||||||
|
log_config_changes = true
|
||||||
|
log_metric_collection = false
|
||||||
Loading…
Reference in New Issue
Block a user