System Monitoring with CRON and Heartbeat Sensors
Overview
Jido provides two included system monitoring sensors: CRON and Heartbeat. These sensors form the foundation of scheduled task execution and system health monitoring in your agent-based applications. This guide explores their implementation, configuration patterns, and best practices for production use.
Core Concepts
CRON Sensor
The CRON sensor integrates with Quantum to provide schedule-based signal emission. Key features:
- Schedule-based task execution
- Dynamic job management
- Configurable dispatch targets
- Built-in error handling
Heartbeat Sensor
The Heartbeat sensor provides continuous system health monitoring through:
- Configurable interval-based signals
- Customizable health messages
- Multi-target dispatch support
- Timestamp tracking
Implementation Guide
Setting Up CRON Monitoring
Basic Configuration
# The ~e sigil comes from the Crontab library used by Quantum
import Crontab.CronExpression

# Initialize a CRON sensor with basic jobs.
#
# The schedules below are standard five-field expressions
# (minute hour day month weekday). Do not append the `e` modifier to them:
# `~e"..."e` switches to the extended syntax whose FIRST field is seconds,
# which would turn "every minute" into "every second" and the daily job
# into an hourly one.
{:ok, cron_sensor} =
  Jido.Sensors.Cron.start_link(
    id: "system_tasks",
    target: {:bus, [target: :system_bus, stream: "scheduled_events"]},
    scheduler: Jido.Scheduler,
    jobs: [
      # Run every minute (job without an explicit name)
      {~e"* * * * *", :minute_task},
      # Run at specific times: daily at midnight, weekly on Sunday at midnight
      {:daily_backup, ~e"0 0 * * *", :backup_task},
      {:weekly_cleanup, ~e"0 0 * * 0", :cleanup_task}
    ]
  )
Dynamic Job Management
# Add a new job.
# Five-field expression without the `e` modifier: with `e` the first field
# would be parsed as seconds and the job would fire every 5 seconds.
:ok =
  Jido.Sensors.Cron.add_job(
    cron_sensor,
    :metrics_collection,
    # Every 5 minutes
    ~e"*/5 * * * *",
    :collect_metrics
  )

# Temporary deactivation
:ok = Jido.Sensors.Cron.deactivate_job(cron_sensor, :metrics_collection)

# Reactivation
:ok = Jido.Sensors.Cron.activate_job(cron_sensor, :metrics_collection)

# Manual execution
:ok = Jido.Sensors.Cron.run_job(cron_sensor, :metrics_collection)

# Remove job
:ok = Jido.Sensors.Cron.remove_job(cron_sensor, :metrics_collection)
Setting Up Health Monitoring
Basic Heartbeat Configuration
# Initialize a basic heartbeat monitor
heartbeat_opts = [
  id: "service_health",
  target: {:bus, [target: :monitoring_bus]},
  # 5 second interval
  interval: 5000,
  message: "service_health_check"
]

{:ok, heartbeat} = Jido.Sensors.Heartbeat.start_link(heartbeat_opts)
Advanced Multi-Target Setup
# Configure heartbeat with multiple dispatch targets:
# every heartbeat is sent to each entry in this list
dispatch_targets = [
  {:bus, [target: :monitoring_bus, stream: "health"]},
  {:logger, [level: :info]},
  {:pubsub, [topic: "system.health"]}
]

{:ok, critical_service_monitor} =
  Jido.Sensors.Heartbeat.start_link(
    id: "critical_service",
    target: dispatch_targets,
    # 1 second for critical services
    interval: 1000,
    message: "critical_service_status"
  )
Signal Structure and Handling
CRON Signals
CRON sensors emit signals with the following structure:
# Illustrative example of a signal emitted when a CRON job fires.
# The field values are sample data for a job named :daily_backup.
%Jido.Signal{
# NOTE(review): source looks like "cron_sensor:<sensor id>:<job name>" — confirm against the sensor implementation
source: "cron_sensor:system_tasks:daily_backup",
type: "cron_trigger",
data: %{
name: :daily_backup,
# The cron expression rendered as a plain string
schedule: "0 0 * * *",
task: :backup_task,
triggered_at: ~U[2024-02-11 00:00:00Z]
}
}
Heartbeat Signals
Heartbeat sensors emit signals structured as:
# Illustrative example of a signal emitted by a heartbeat sensor.
# The field values are sample data.
%Jido.Signal{
# NOTE(review): source looks like "heartbeat_sensor:<sensor id>" — confirm against the sensor implementation
source: "heartbeat_sensor:service_health",
type: "heartbeat",
data: %{
message: "service_health_check",
# In this sample the two timestamps are 5 seconds apart
timestamp: ~U[2024-02-11 10:00:00Z],
last_beat: ~U[2024-02-11 09:59:55Z]
}
}
Best Practices
CRON Job Management
Naming Conventions
- Use descriptive atom names (e.g., `:daily_user_cleanup`)
- Follow consistent naming patterns
- Add domain-specific prefixes for large systems
Schedule Planning
# Spread out resource-intensive jobs so they never start in the same hour.
# Both schedules are five-field expressions (no `e` modifier, which would make
# the first field seconds): the first runs at minute 0 of every 4th hour
# starting at 00:00, the second is offset by two hours (02:00, 06:00, ...).
jobs: [
  {:heavy_task_1, ~e"0 */4 * * *", :resource_intensive_1},
  {:heavy_task_2, ~e"0 2-23/4 * * *", :resource_intensive_2}
]
Error Handling
defmodule TaskHandler do
  @moduledoc """
  Example handler for `"cron_trigger"` signals that runs the task named in the
  signal and converts any failure into a tagged error tuple.
  """

  # Logger.error/1 is a macro, so Logger must be required before use.
  require Logger

  @doc """
  Executes the task carried in `signal.data.task`.

  Returns the task's `{:ok, _}` or `{:error, _}` result unchanged,
  `{:error, :invalid_result}` for any other return value,
  `{:error, :task_failed}` when the task raises, and
  `{:error, :unexpected_failure}` when it throws or exits.
  """
  def handle_signal(%{type: "cron_trigger"} = signal) do
    # NOTE: execute_task/1 is application-specific and is not defined in this
    # example; supply your own implementation.
    signal.data.task
    |> execute_task()
    |> handle_task_result()
  rescue
    error ->
      Logger.error("Task execution failed: #{inspect(error)}")
      {:error, :task_failed}
  catch
    kind, reason ->
      Logger.error("Unexpected error: #{inspect({kind, reason})}")
      {:error, :unexpected_failure}
  end

  # Pass well-formed results through; flag everything else as invalid.
  defp handle_task_result({:ok, _} = result), do: result
  defp handle_task_result({:error, _} = error), do: error
  defp handle_task_result(_), do: {:error, :invalid_result}
end
Heartbeat Monitoring
Interval Configuration
# Adjust intervals based on service criticality
defmodule MonitoringConfig do
  @moduledoc """
  Maps a service criticality level to a heartbeat interval in milliseconds.
  """

  @doc """
  Returns the heartbeat interval, in milliseconds, for a criticality level.

  Raises `FunctionClauseError` for any level other than `:critical`,
  `:important` or `:routine`.
  """
  @spec get_interval(:critical | :important | :routine) :: pos_integer()
  # 1 second
  def get_interval(:critical), do: 1_000
  # 5 seconds
  def get_interval(:important), do: 5_000
  # 30 seconds
  def get_interval(:routine), do: 30_000
end
Health Check Implementation
defmodule HealthMonitor do
  @moduledoc """
  Example agent that classifies heartbeat signals by the time elapsed between
  the previous beat and the current one.
  """

  use Jido.Agent, name: "health_monitor"

  # Logger.warning/1 and Logger.debug/1 are macros, so Logger must be required.
  require Logger

  # Maximum acceptable delay
  @max_delay_ms 10_000

  @doc """
  Handles a `"heartbeat"` signal.

  Triggers an alert when the gap between beats exceeds `@max_delay_ms`, logs a
  warning when it exceeds half of that, and logs a debug message otherwise.
  """
  def handle_signal(%{type: "heartbeat"} = signal) do
    delay = calculate_delay(signal.data)

    cond do
      delay > @max_delay_ms ->
        # NOTE: trigger_alert/2 is application-specific and is not defined in
        # this example; supply your own implementation.
        trigger_alert(signal.source, delay)

      delay > @max_delay_ms / 2 ->
        Logger.warning("Service #{signal.source} showing increased latency")

      true ->
        Logger.debug("Service #{signal.source} healthy")
    end
  end

  # Milliseconds between the previous heartbeat and the current one.
  defp calculate_delay(%{timestamp: current, last_beat: last}) do
    DateTime.diff(current, last, :millisecond)
  end
end
Testing Strategies
CRON Testing
defmodule CronSensorTest do
  use ExUnit.Case

  # Provides the ~e sigil used to build the cron expressions below.
  import Crontab.CronExpression

  # Starts a fresh CRON sensor that dispatches its signals to the test process.
  setup do
    {:ok, sensor} =
      Jido.Sensors.Cron.start_link(
        id: "test_cron",
        target: {:pid, target: self()},
        scheduler: Jido.Scheduler
      )

    {:ok, sensor: sensor}
  end

  test "schedules and executes jobs", %{sensor: sensor} do
    # Six fields plus the `e` modifier: extended syntax, fires every second.
    :ok =
      Jido.Sensors.Cron.add_job(
        sensor,
        :test_job,
        ~e"* * * * * *"e,
        :test_task
      )

    # The first tick can be up to a full second away, so allow extra time for
    # scheduling and dispatch instead of waiting exactly one second.
    assert_receive {:signal, {:ok, signal}}, 2_000
    assert signal.type == "cron_trigger"
    assert signal.data.name == :test_job
  end

  test "handles job deactivation", %{sensor: sensor} do
    :ok =
      Jido.Sensors.Cron.add_job(
        sensor,
        :inactive_job,
        ~e"* * * * * *"e,
        :test_task
      )

    :ok = Jido.Sensors.Cron.deactivate_job(sensor, :inactive_job)

    # Wait longer than one schedule period to prove nothing is emitted.
    refute_receive {:signal, _}, 1500
  end
end
Heartbeat Testing
defmodule HeartbeatSensorTest do
  use ExUnit.Case

  test "emits regular heartbeats with correct data" do
    # Dispatch straight to the test process so the signals land in its mailbox.
    sensor_opts = [
      id: "test_heartbeat",
      target: {:pid, target: self()},
      interval: 100,
      message: "test_heartbeat"
    ]

    {:ok, _sensor} = Jido.Sensors.Heartbeat.start_link(sensor_opts)

    # Verify multiple heartbeats
    Enum.each(1..3, fn _beat ->
      assert_receive {:signal, {:ok, signal}}, 200
      assert signal.type == "heartbeat"
      assert signal.data.message == "test_heartbeat"
      assert %DateTime{} = signal.data.timestamp
      assert %DateTime{} = signal.data.last_beat
    end)
  end
end
Production Patterns
System Maintenance Automation
defmodule MaintenanceOrchestrator do
  @moduledoc """
  Example module that starts the sensors used for routine system maintenance:
  a CRON sensor for scheduled tasks and a heartbeat sensor for health signals.
  """

  # Provides the ~e sigil used to build the cron expressions below.
  import Crontab.CronExpression

  @doc """
  Starts the daily-maintenance CRON sensor and the system-health heartbeat.

  Returns the result of starting the heartbeat sensor; crashes with a
  `MatchError` if either sensor fails to start.
  """
  def start_maintenance_sensors do
    # Daily maintenance tasks.
    # Five-field expressions (minute hour day month weekday) without the `e`
    # modifier; with `e` the first field would be parsed as seconds and these
    # jobs would run far more often than intended.
    {:ok, _daily} =
      Jido.Sensors.Cron.start_link(
        id: "daily_maintenance",
        target: {:bus, [target: :maintenance_bus]},
        # Same scheduler module as the other CRON examples in this guide
        scheduler: Jido.Scheduler,
        jobs: [
          # Midnight
          {:log_rotation, ~e"0 0 * * *", :rotate_logs},
          # 04:00
          {:temp_cleanup, ~e"0 4 * * *", :clean_temp},
          # Every 30 minutes
          {:metrics_rollup, ~e"*/30 * * * *", :rollup_metrics}
        ]
      )

    # Health monitoring
    {:ok, _health} =
      Jido.Sensors.Heartbeat.start_link(
        id: "system_health",
        target: [
          {:bus, [target: :health_bus]},
          {:logger, [level: :info]}
        ],
        interval: 10_000
      )
  end
end
Service Health Dashboard
defmodule HealthDashboard do
  @moduledoc """
  Example agent that tracks a health status per service based on the
  heartbeat signals it receives.
  """

  use Jido.Agent, name: "health_dashboard"

  # Seeds the state with an empty service-status map.
  def init(state), do: {:ok, Map.put(state, :service_status, %{})}

  # Classifies the heartbeat, records the status and alerts if needed.
  # NOTE: update_status/2 and maybe_trigger_alerts/2 are not defined in this
  # example.
  def handle_signal(%{type: "heartbeat"} = signal) do
    service_id = signal.source
    health = evaluate_health(signal.data)

    update_status(service_id, health)
    maybe_trigger_alerts(service_id, health)
  end

  # Maps the gap between beats to a status: above 30 s is critical,
  # above 10 s is a warning, anything else is healthy.
  defp evaluate_health(data) do
    gap_ms = calculate_delay(data)

    cond do
      gap_ms > 30_000 -> :critical
      gap_ms > 10_000 -> :warning
      true -> :healthy
    end
  end

  # Milliseconds between the previous heartbeat and the current one.
  defp calculate_delay(%{timestamp: now, last_beat: previous}) do
    DateTime.diff(now, previous, :millisecond)
  end
end
See Also
Contributing
Found an issue or have a suggestion? Please open an issue or submit a pull request on our GitHub repository.