heartbeat

Simple server monitoring system using encrypted messages over UDP
git clone https://noulin.net/git/heartbeat.git
Log | Files | Refs | README

commit 063884af53ec02906c8e7726468b234276eef3ca
parent 78ce02cda96526ba8de587dfcfe7e69d73d5c320
Author: Remy Noulin <loader2x@gmail.com>
Date:   Mon, 10 Jul 2023 14:09:57 +0200

improve monitor and the logic for detecting reboots and network issues

heartbeat.c | 186 ++++++++++++++++++++++++++++++++++++++++++++++++++----------
1 file changed, 155 insertions(+), 31 deletions(-)

Diffstat:
Mheartbeat.c | 186++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
1 file changed, 155 insertions(+), 31 deletions(-)

diff --git a/heartbeat.c b/heartbeat.c @@ -18,6 +18,11 @@ config: generate configuration yml files for each machine copy heartbeat executable daemon: run as daemon monitor: display state + + +msgt is the packets sent with udp from the agents +dbt is the biggest possible packet, it is written in the db files when there is a state +change (event). */ @@ -25,7 +30,15 @@ monitor: display state // it used in the config command to generate agentCfgFile for each machine #define cfgFile "heartbeatConfig.yml" +// cleartime is used to ignore events that happened before cleartime +// press c to set cleartime to now +#define monitorCleartimeFile "cleartime.json" + +// when running as a daemon, expanding ~/ doesn't work #define uHome "/root/" +// monitor user home, monitor doesn't run as a daemon and +// can run as a normal user +#define muHome "~/" #define home ".heartbeat" // agentCfgFile is the default configuration file @@ -38,7 +51,7 @@ monitor: display state // TODO #define defaultPeriod 120 // 10mn timeout -#define agentTimeOut 5 +#define agentTimeOut 8 //TODO #define agentTimeOut 600 int argc; char **argv; @@ -48,9 +61,13 @@ int argc; char **argv; /* #define pLog(...) 
*/ void printHelp(void); +// generate configurations and setup script void config(smallJsont *cfg); +// run commands with ssh on agents void runcommand(void); +// show state void monitor(void); +// send state to logger, as logger collect packets from agents void probe(char *cfgfile); /* process arguments on command line @@ -261,11 +278,13 @@ void monitor(void) { // load config // find logger config // create monitor socket + // events older than clearTime are not displayed + // load cleartime // setup terminal // infinite loop cleanAllocateSmallJson(cfg); - cleanCharP(p) = expandHome(uHome home "/" cfgFile); + cleanCharP(p) = expandHome(muHome home "/" cfgFile); smallJsont *r = readFileG(cfg, p); if (!r) { logE("Missing %s", p); @@ -306,6 +325,17 @@ void monitor(void) { XFailure; } + // events older than clearTime are not displayed + time_t clearTime = 0; + + // load cleartime + cleanAllocateSmallJson(ct); + cleanCharP(ctPath) = expandHome(muHome home "/" monitorCleartimeFile); + if (isPath(ctPath)) { + readFileG(ct, ctPath); + clearTime = getTopG(ct, rtU64); + } + // setup terminal int R = tb_init(); if (R) { @@ -317,18 +347,12 @@ void monitor(void) { // infinite loop cleanAllocateSmallJson(state); - /* forever { */ - /* pError0(ZEROVAR(buf)); */ - /* socklen_t addr_size = sizeof(client); */ - /* ssize_t r = recvfrom(sock, buf, sizeof(buf), 0, (struct sockaddr *) &client, &addr_size); */ - /* if (r == -1) { */ - /* logE("recvfrom: %s", strerror(errno)); */ - /* continue; */ - /* } */ - /* parseG(state, buf); */ - /* cleanCharP(stateStr) = stringifyG(state, 2); */ - /* puts(stateStr); */ - /* } */ + + tb_stringf(0, 0, TB_WHITE, TB_DEFAULT, "%s", "No state received from logger yet..."); + tb_present(); + + // when the first udp packet arrives, the message above is erased + bool gotAMessage = no; struct tb_event ev; while (tb_poll_event(&ev, sock)) { @@ -338,14 +362,28 @@ void monitor(void) { case TB_KEY_ESC: goto done; } - if (ev.ch == 'q') goto done; - - 
tb_stringf(2, 30, TB_WHITE, TB_DEFAULT, "key code: %5d, char: %5d, mod: %5d", ev.key, ev.ch, ev.mod); + if (ev.ch == 'q') goto done; + elif (ev.ch == 'c') { + clearTime = getCurrentUnixTime(); + // bug in libsheepy, top can be set only once + freeG(ct); + setTopG(ct, clearTime); + /* // bug in libsheepy, top can be set only once */ + /* smallIntt *v = allocSmallInt(clearTime); */ + /* sFree((smallt *)ct->topI); */ + /* ct->topI = (sIntt *)toSmallt((baset*)v); */ + writeFileG(ct, ctPath); + } break; case TB_EVENT_RESIZE: //draw_all(); break; case TB_EVENT_SOCKET: + if (!gotAMessage) { + // erase default message + tb_stringf(0, 0, TB_WHITE, TB_DEFAULT, "%s", "Agent State Last change Boot Boot time Probes"); + gotAMessage = yes; + } pError0(ZEROVAR(buf)); socklen_t addr_size = sizeof(client); ssize_t r = recvfrom(sock, buf, sizeof(buf), 0, (struct sockaddr *) &client, &addr_size); @@ -354,7 +392,81 @@ void monitor(void) { continue; } parseG(state, buf); - tb_stringf(0, 0, TB_WHITE, TB_DEFAULT, "monitor"); + u16 line = 1; + iter(state, D) { + if (!isOSmallDict(D)) continue; + cast(smallDictt*, d, D); + u16 x = 0; + tb_stringf(x, line, TB_WHITE, TB_DEFAULT, "%20s", iK(state) /*agent name*/); + x += 21; + u32 color, bgcolor; + if (eqG($(d, "state"), "alive")) { + color = TB_WHITE; + bgcolor = TB_BLACK; + } + elif (eqG($(d, "state"), "init")) { + color = TB_GREEN; + bgcolor = TB_BLACK; + } + elif (eqG($(d, "state"), "down")) { + color = TB_WHITE; + bgcolor = TB_RED; + } + else { + color = TB_BLACK; + bgcolor = TB_RED; + } + tb_stringf(x, line, color, bgcolor, "%5s", $(d, "state")); + x += 6; + cleanCharP(lastStateChange) = timeToYMDS(u$(d, "last")); + tb_stringf(x, line, TB_WHITE, TB_DEFAULT, "%19s", lastStateChange); + x += 20; + //tb_stringf(47, line, TB_WHITE, TB_DEFAULT, "%d", u$(d, "mId")); + char *rebootorNet = ""; + cleanCharP(lastBoot) = null; + if (getG(d, rtBool, "rebooted") and u$(d, "lastBoot") > clearTime) { + color = TB_WHITE; + bgcolor = TB_RED; + 
rebootorNet = "rebooted"; + lastBoot = timeToYMDS(u$(d, "lastBoot")); + } + elif (getG(d, rtBool, "net") and u$(d, "lastNet") > clearTime) { + u64 t = u$(d, "lastNet"); + // check network issues + color = TB_WHITE; + bgcolor = TB_RED; + rebootorNet = "network"; + lastBoot = timeToYMDS(u$(d, "lastNet")); + } + else { + color = TB_WHITE; + bgcolor = TB_BLACK; + lastBoot = timeToYMDS(u$(d, "lastBoot")); + } + tb_stringf(x, line, color, bgcolor, "%8s", rebootorNet); + x += 19; + tb_stringf(x, line, TB_WHITE, TB_DEFAULT, "%19s", lastBoot); + x += 20; + cleanFinishSmallArrayP(probes) = getG(d, rtSmallArrayt, "probes"); + if (probes) { + iter(probes, P) { + cast(smallDictt*,p,P); + if (getG(p, rtBool, "state")) { + // service down + color = TB_WHITE; + bgcolor = TB_RED; + } + else { + // service up + color = TB_WHITE; + bgcolor = TB_BLACK; + } + tb_stringf(x, line, color, bgcolor, "port %5d", u$(p, "port")); + x += 11; + } + } + inc line; + } break; } tb_present(); @@ -768,6 +880,7 @@ void probe(char *cfgfile) { // timeout goto cont; } + // TODO check packet size is correct else { // got a message //lv(r); @@ -806,21 +919,9 @@ void probe(char *cfgfile) { /* logE("Packet from %s with ip %s came too early.", agents[m->id], ip); */ /* } */ /* setG(agent, "mono", time); */ + bool loggerRunning = hasG(agent, "time"); setG(agent, "time", getCurrentUnixTime()); char *newstate = m->messageId ? 
"alive" : "init"; - if (!eqG($(agent, "state"), newstate)) { - saveEvent = yes; - if (eqG($(agent, "state"), "alive")) { - setG(agent, "rebooted", TRUE); - setG(agent, "lastBoot", getCurrentUnixTime()); - // send mail when agent rebooted - cleanCharP(s) = formatS("%s rebooted", agents[m->id]); - pushG(mailMsg, s); - logW("%s", s); - } - setG(agent, "state", newstate); - setG(agent, "last", getCurrentUnixTime()); - } if (m->messageId < u$(agent,"mId")) { saveEvent = yes; setG(agent, "rebooted", TRUE); @@ -830,7 +931,30 @@ void probe(char *cfgfile) { pushG(mailMsg, s); logW("%s", s); } + elif ( eqG($(agent, "state"), "down") + and eqG(newstate, "alive") + and loggerRunning) { + // there is a network issue when the agent was down + // and messageId is higher than the last received messageId + // (it didn't reboot) + // when starting logger, agent state is down + // if the agents started running before logger, the agent states are alive + // in that case mails are not sent because loggerRunning is false + saveEvent = yes; + setG(agent, "net", TRUE); + setG(agent, "lastNet", getCurrentUnixTime()); + // send mail when agent has network issues + cleanCharP(s) = formatS("%s has network issues", agents[m->id]); + pushG(mailMsg, s); + logW("%s", s); + } setG(agent, "mId", m->messageId); + // update state + if (!eqG($(agent, "state"), newstate)) { + saveEvent = yes; + setG(agent, "state", newstate); + setG(agent, "last", getCurrentUnixTime()); + } cleanFinishSmallArrayP(probes) = getG(agent, rtSmallArrayt, "probes"); if (probes) { iter(probes, P) {