ヤマハルーターの状態を監視し異常を通知する

Last updated at 2024-10-25 / Posted at 2024-10-25
概要

ヤマハ公式にて、ルーターの状態を監視して異常を通知するコードがサンプルとして公開されています。複数の CPU コアには対応していないようだったので、少し変更して複数コアのチェックを行うものにしました。
この変更により、CPU0 や CPU1 だけが突出して異常値になった場合も検出することができます。
他にも、異常の通知と正常の通知でメールサブジェクトを変える処理を入れました。
Lua スクリプト

--[[

  ●ルーターリソース監視スクリプト
　　一定の監視間隔毎にルーターの状態を監視し、設定した閾値を超えたら管理者に
　　メールを送信して知らせるスクリプトです。
　　監視する情報は以下の通りです。
　　・CPU 使用率
　　・メモリ使用率

　　各リソースの値が閾値を指定した回数だけ連続して超えた場合に、管理者にメール
　　を送信します。その後、値が閾値を指定した回数だけ連続して下回った場合には、
　　正常値に戻ったと判断します。設定値 down_mail を true に設定している場合には、
　　正常値に戻った際にもメールを送信します。

  ＜説明＞
  ・このファイルを RTFS か外部メモリに保存してください。
  ・本項目の config の設定では schedule at コマンドでルーター起動時に Lua スク
  　リプトが実行されるように設定しています。
  ・スクリプトを停止するときは terminate lua コマンドを実行してください。
  ・再度、Lua スクリプトを実行する場合は lua コマンドで実行してください。
  ・★マークの付いた設定値は変更が可能です。

  ＜ノート＞
　・メールの送信失敗時に出力する SYSLOG レベルを指定可能です。
　　SYSLOG のレベルを指定するには、log_level を設定してください。
　　debug レベル、notice レベルの SYSLOG を出力するためには、それぞれ以下の設定
　　が必要です。
　　　debug レベル ･･･ syslog debug on
　　　notice レベル･･･ syslog notice on
　・本スクリプトファイルを編集する場合、文字コードは必ず Shift-JIS を使用してく
　　ださい。

]]

--------------------------##  Settings  ##------------------------------
-- NOTE: the parenthesised Japanese text on the right-hand sides below is
-- template placeholder text; replace each one with a real value before
-- running the script. Values marked with ★ are the user-editable ones.

-- Monitoring interval (1 - 864000 seconds); passed to rt.sleep() by the
-- main loop.
idle_time = (監視間隔)			-- ★

-- Unit time over which CPU usage is monitored ("5sec" or "1min" or "5min")
cpu_time = "(単位時間)"			-- ★

-- Threshold for each resource (1 - 99).
--   cpu: CPU usage (%), applied to every CPU core
--   mem: memory usage (%)
th_tbl = {
  cpu = (CPU使用率（%）),
  mem = (メモリ使用率（%）)
}

-- Number of consecutive samples above the threshold before the state is
-- treated as abnormal, and likewise the number of consecutive samples at
-- or below it before the state is treated as recovered (1, 2 ..)
count = (回数)				-- ★

-- Whether to send a mail when a resource returns to normal as well
-- (send: true / do not send: false)
down_mail = (true / false)		-- ★

-- Mail settings: SMTP server address, sender address, recipient address
mail_tbl = {				-- ★
  smtp_address = "(SMTP サーバーのアドレス)",
  from = "(送信元メールアドレス)",
  to = "(宛先メールアドレス)"
}

-- SYSLOG level used when sending the mail fails (info, debug, notice)
log_level = "(SYSLOG レベル)"		-- ★

----------------------##  End of settings  ##---------------------------

------------------------------------------------------------
-- Detect the number of CPU cores                          --
------------------------------------------------------------
--- Count the CPU cores listed by "show environment".
-- Each output line that starts with "CPU<n>:" contributes its index <n>;
-- the result is the highest index seen plus one.
-- @treturn number core count; 1 when the command fails or when no
--   "CPU<n>:" line is present in the output.
function detect_cpu_cores()
  local ok, output = rt.command("show environment")
  if not ok or not output then
    return 1
  end

  local highest = 0
  for row in output:gmatch("[^\r\n]+") do
    local digits = row:match("^CPU(%d+):")
    if digits then
      highest = math.max(highest, tonumber(digits))
    end
  end

  return highest + 1
end

------------------------------------------------------------
-- Build the search patterns used to read the CPU usage of --
-- every core for the selected unit time                   --
------------------------------------------------------------
--- Build one Lua pattern per CPU core for the given unit time.
-- Each pattern captures the percentage that is followed by "(<key>)" on
-- the line that begins with "CPU<n>:".
-- @param key unit time: "5sec", "1min" or "5min"
-- @param num_cores number of CPU cores; patterns are built for 0..num_cores-1
-- @return table indexed from 0 (NOT 1) holding the pattern strings, or nil
--   when key is not one of the three supported unit times
function set_cpu_ptn(key, num_cores)
  if (key ~= "5sec") and (key ~= "1min") and (key ~= "5min") then
    return nil
  end

  local patterns = {}
  for i = 0, num_cores - 1 do
    -- "[^\r\n]-" lazily skips whatever precedes the wanted column on the
    -- same line. Without it the capture had to follow "CPU<n>:" directly,
    -- so only the first column on the line could ever match and a key
    -- naming a later column (e.g. "1min" after a "5sec" column) never did.
    -- Excluding CR/LF keeps the match from running into another core's line.
    patterns[i] = "CPU" .. i .. ":[^\r\n]-(%d+)%%%s*%(" .. key .. "%)"
  end
  return patterns
end

------------------------------------------------------------
-- Read the router's hardware resource usage               --
------------------------------------------------------------
--- Run "show environment" and store the parsed usage values in t.
-- @param t resource table: t.mem and every t.cpu[i] (0-based, contiguous)
--   carry a `ptn` pattern field; the parsed number is written to `val`.
--   `val` is set to nil when the pattern does not match, so that callers
--   skip the entry instead of evaluating a stale reading.
-- @return rtn first result of rt.command (truthy on success)
-- @return str the command output on success, otherwise an error text
function rt_res_status(t)
  local cmd = "show environment"
  local rtn, str = rt.command(cmd)

  if (rtn) and (str) then
    -- Memory usage
    t.mem.val = str:match(t.mem.ptn)
    if (t.mem.val) then
      t.mem.val = tonumber(t.mem.val)
    end

    -- Usage of each CPU core. The table is 0-based, so walk the indices
    -- explicitly rather than relying on the length operator.
    local i = 0
    while t.cpu[i] do
      local core = t.cpu[i]
      local cpu_val = str:match(core.ptn)
      if (cpu_val) then
        core.val = tonumber(cpu_val)
        -- Debug log output
        rt.syslog("debug", string.format("CPU%d: %d%% (%s)", i, core.val, cpu_time))
      else
        -- Drop the previous reading: keeping it would feed an outdated
        -- value into the threshold counters as if it were a new sample.
        core.val = nil
        rt.syslog("debug", string.format("Failed to match CPU%d pattern: %s", i, core.ptn))
      end
      i = i + 1
    end
  else
    str = cmd .. "コマンド実行失敗\r\n\r\n"
  end

  return rtn, str
end

------------------------------------------------------------
-- Build the notification text for a resource that has     --
-- just crossed its threshold or just returned to normal   --
------------------------------------------------------------
--- Update the counters for one resource and build its mail text.
-- @param t resource entry; `title` and `unit` are used in the text, and
--   the entry is handed to count_proc for counting
-- @param val current reading; nil/false means "no reading", in which case
--   nothing is counted
-- @param th threshold value
-- @param down truthy when a message should also be produced on recovery
-- @return str message text, or "" when there is nothing to report
-- @return is_recovered true only when the text is a recovery message
function make_msg(t, val, th, down)
  if not val then
    return "", false
  end

  local detail_fmt = "  %s: %d%s\r\n  閾値: %d%s\r\n\r\n"
  local state = count_proc(t, val, th)

  if state < 0 and down then
    -- recovery, and recovery notifications are enabled
    local headline = t.title .. "が閾値以下の値に下がりました。\r\n"
    return headline .. string.format(detail_fmt, t.title, val, t.unit, th, t.unit), true
  end

  if state > 0 then
    -- the threshold has just been exceeded for the configured number of times
    local headline = t.title .. "が閾値を超えました。\r\n"
    return headline .. string.format(detail_fmt, t.title, val, t.unit, th, t.unit), false
  end

  return "", false
end

------------------------------------------------------------
-- Count consecutive samples above / not above a threshold --
------------------------------------------------------------
--- Advance the streak counters of one resource by one sample.
-- Reads the global `count` as the streak length required to change state.
-- @param t resource entry with fields `over`, `down` (streak counters)
--   and `flag` (true while the resource is in the alarmed state)
-- @param val current reading
-- @param th threshold; a reading counts as "over" only when val > th
-- @return 1 when the alarmed state is entered by this sample, -1 when it
--   is left by this sample, 0 otherwise
function count_proc(t, val, th)
  local exceeded = val > th

  if exceeded and t.flag then
    -- already alarmed: a high sample interrupts any recovery streak
    if t.down > 0 then
      t.down = 0
    end
    return 0
  end

  if exceeded then
    -- not alarmed yet: extend the over-threshold streak
    t.over = t.over + 1
    if t.over ~= count then
      return 0
    end
    t.flag = true
    return 1
  end

  if not t.flag then
    -- normal state: a low sample interrupts any over-threshold streak
    if t.over > 0 then
      t.over = 0
    end
    return 0
  end

  -- alarmed and the sample is not above the threshold: extend the
  -- recovery streak
  t.down = t.down + 1
  if t.down ~= count then
    return 0
  end
  t.flag = false
  t.over = 0
  t.down = 0
  return -1
end

------------------------------------------------------------
-- Get the current date and time                           --
------------------------------------------------------------
--- Format the current local time as "YYYY/MM/DD hh:mm:ss".
-- @treturn string timestamp built from os.date("*t")
function time_stamp()
  local now = os.date("*t")
  local date_part = string.format("%d/%02d/%02d", now.year, now.month, now.day)
  local time_part = string.format("%02d:%02d:%02d", now.hour, now.min, now.sec)
  return date_part .. " " .. time_part
end

------------------------------------------------------------
-- Main routine                                            --
------------------------------------------------------------
-- Polls the router every idle_time seconds, feeds each CPU core and the
-- memory reading through make_msg, and mails the collected messages using
-- the settings in mail_tbl. Runs until the script is terminated.

-- Detect the number of CPU cores
local num_cpu_cores = detect_cpu_cores()
rt.syslog("info", string.format("Detected %d CPU cores", num_cpu_cores))

-- Hardware resource table
local res_tbl = {
  cpu = {}, -- per-core entries, indexed from 0 to match the CPU<n> numbering
  mem = {ptn = "(%d+)%% used", val = 0, over = 0, down = 0, flag = false, title = "メモリ使用率", unit = "%"},
}

-- Initialise the per-core monitoring entries. set_cpu_ptn returns nil for
-- an unsupported cpu_time, so stop here with a readable message.
local cpu_patterns = assert(set_cpu_ptn(cpu_time, num_cpu_cores),
  'cpu_time must be "5sec", "1min" or "5min"')
for i = 0, num_cpu_cores - 1 do
  res_tbl.cpu[i] = {
    ptn = cpu_patterns[i],
    val = 0,
    over = 0,
    down = 0,
    flag = false,
    title = string.format("CPU%d負荷率(%s)", i, cpu_time),
    unit = "%"
  }
end

while (true) do
  -- Compose the mail in the configured mail_tbl so that smtp_address,
  -- from and to are handed to rt.mail together with subject and text.
  -- (A separate, empty table would carry none of those settings.)
  mail_tbl.text = ""
  local any_recovered = false
  local any_exceeded = false

  -- Read the CPU and memory state.
  -- NOTE(review): when the command fails nothing is checked or reported
  -- for this cycle; the error text returned by rt_res_status is not used.
  local status_ok = rt_res_status(res_tbl)
  if (status_ok) then
    -- Check every CPU core. The table is 0-based, so use the known core
    -- count for the upper bound instead of the length operator.
    for i = 0, num_cpu_cores - 1 do
      local core = res_tbl.cpu[i]
      local msg, is_recovered = make_msg(core, core.val, th_tbl.cpu, down_mail)
      mail_tbl.text = mail_tbl.text .. msg
      if is_recovered then
        any_recovered = true
      elseif msg ~= "" then
        any_exceeded = true
      end
    end

    -- Check the memory
    local msg, is_recovered = make_msg(res_tbl.mem, res_tbl.mem.val, th_tbl.mem, down_mail)
    mail_tbl.text = mail_tbl.text .. msg
    if is_recovered then
      any_recovered = true
    elseif msg ~= "" then
      any_exceeded = true
    end
  end

  if (mail_tbl.text:len() > 0) then
    local subject = string.format("resource loadwatch (%s)", time_stamp())
    -- Tag the subject as resolved only when the mail consists solely of
    -- recovery messages; a mail that also reports a newly exceeded
    -- threshold keeps the plain alert subject.
    if any_recovered and not any_exceeded then
      subject = subject .. " (解消)"
    end
    mail_tbl.subject = subject

    local sent = rt.mail(mail_tbl)
    if (not sent) then
      rt.syslog(log_level, "failed to send mail. (Lua スクリプトファイル名)")
    end
  end

  rt.sleep(idle_time)
end
参考

ヤマハネットワーク:
You get articles that match your needs
You can efficiently read back useful information
You can use dark theme
What you can do with signing up