commit 063884af53ec02906c8e7726468b234276eef3ca
parent 78ce02cda96526ba8de587dfcfe7e69d73d5c320
Author: Remy Noulin <loader2x@gmail.com>
Date: Mon, 10 Jul 2023 14:09:57 +0200
improve monitor and the logic for detecting reboots and network issues
heartbeat.c | 186 ++++++++++++++++++++++++++++++++++++++++++++++++++----------
1 file changed, 155 insertions(+), 31 deletions(-)
Diffstat:
| M | heartbeat.c | | | 186 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------- |
1 file changed, 155 insertions(+), 31 deletions(-)
diff --git a/heartbeat.c b/heartbeat.c
@@ -18,6 +18,11 @@ config: generate configuration yml files for each machine
copy heartbeat executable
daemon: run as daemon
monitor: display state
+
+
+msgt is the packet sent over udp by the agents
+dbt is the biggest possible packet; it is written to the db files when there is a state
+change (event).
*/
@@ -25,7 +30,15 @@ monitor: display state
// it is used in the config command to generate agentCfgFile for each machine
#define cfgFile "heartbeatConfig.yml"
+// cleartime is used to ignore events that happened before cleartime
+// press c to set cleartime to now
+#define monitorCleartimeFile "cleartime.json"
+
+// when running as a daemon, expanding ~/ doesn't work
#define uHome "/root/"
+// monitor user home, monitor doesn't run as a daemon and
+// can run as a normal user
+#define muHome "~/"
#define home ".heartbeat"
// agentCfgFile is the default configuration file
@@ -38,7 +51,7 @@ monitor: display state
// TODO #define defaultPeriod 120
// 10mn timeout
-#define agentTimeOut 5
+#define agentTimeOut 8
//TODO #define agentTimeOut 600
int argc; char **argv;
@@ -48,9 +61,13 @@ int argc; char **argv;
/* #define pLog(...) */
void printHelp(void);
+// generate configurations and setup script
void config(smallJsont *cfg);
+// run commands with ssh on agents
void runcommand(void);
+// show state
void monitor(void);
+// send state to logger, as logger collects packets from agents
void probe(char *cfgfile);
/* process arguments on command line
@@ -261,11 +278,13 @@ void monitor(void) {
// load config
// find logger config
// create monitor socket
+ // events older than clearTime are not displayed
+ // load cleartime
// setup terminal
// infinite loop
cleanAllocateSmallJson(cfg);
- cleanCharP(p) = expandHome(uHome home "/" cfgFile);
+ cleanCharP(p) = expandHome(muHome home "/" cfgFile);
smallJsont *r = readFileG(cfg, p);
if (!r) {
logE("Missing %s", p);
@@ -306,6 +325,17 @@ void monitor(void) {
XFailure;
}
+ // events older than clearTime are not displayed
+ time_t clearTime = 0;
+
+ // load cleartime
+ cleanAllocateSmallJson(ct);
+ cleanCharP(ctPath) = expandHome(muHome home "/" monitorCleartimeFile);
+ if (isPath(ctPath)) {
+ readFileG(ct, ctPath);
+ clearTime = getTopG(ct, rtU64);
+ }
+
// setup terminal
int R = tb_init();
if (R) {
@@ -317,18 +347,12 @@ void monitor(void) {
// infinite loop
cleanAllocateSmallJson(state);
- /* forever { */
- /* pError0(ZEROVAR(buf)); */
- /* socklen_t addr_size = sizeof(client); */
- /* ssize_t r = recvfrom(sock, buf, sizeof(buf), 0, (struct sockaddr *) &client, &addr_size); */
- /* if (r == -1) { */
- /* logE("recvfrom: %s", strerror(errno)); */
- /* continue; */
- /* } */
- /* parseG(state, buf); */
- /* cleanCharP(stateStr) = stringifyG(state, 2); */
- /* puts(stateStr); */
- /* } */
+
+ tb_stringf(0, 0, TB_WHITE, TB_DEFAULT, "%s", "No state received from logger yet...");
+ tb_present();
+
+ // when the first udp packet arrives, the message above is erased
+ bool gotAMessage = no;
struct tb_event ev;
while (tb_poll_event(&ev, sock)) {
@@ -338,14 +362,28 @@ void monitor(void) {
case TB_KEY_ESC:
goto done;
}
- if (ev.ch == 'q') goto done;
-
- tb_stringf(2, 30, TB_WHITE, TB_DEFAULT, "key code: %5d, char: %5d, mod: %5d", ev.key, ev.ch, ev.mod);
+ if (ev.ch == 'q') goto done;
+ elif (ev.ch == 'c') {
+ clearTime = getCurrentUnixTime();
+ // bug in libsheepy, top can be set only once
+ freeG(ct);
+ setTopG(ct, clearTime);
+ /* // bug in libsheepy, top can be set only once */
+ /* smallIntt *v = allocSmallInt(clearTime); */
+ /* sFree((smallt *)ct->topI); */
+ /* ct->topI = (sIntt *)toSmallt((baset*)v); */
+ writeFileG(ct, ctPath);
+ }
break;
case TB_EVENT_RESIZE:
//draw_all();
break;
case TB_EVENT_SOCKET:
+ if (!gotAMessage) {
+ // erase default message
+ tb_stringf(0, 0, TB_WHITE, TB_DEFAULT, "%s", "Agent State Last change Boot Boot time Probes");
+ gotAMessage = yes;
+ }
pError0(ZEROVAR(buf));
socklen_t addr_size = sizeof(client);
ssize_t r = recvfrom(sock, buf, sizeof(buf), 0, (struct sockaddr *) &client, &addr_size);
@@ -354,7 +392,81 @@ void monitor(void) {
continue;
}
parseG(state, buf);
- tb_stringf(0, 0, TB_WHITE, TB_DEFAULT, "monitor");
+ u16 line = 1;
+ iter(state, D) {
+ if (!isOSmallDict(D)) continue;
+ cast(smallDictt*, d, D);
+ u16 x = 0;
+ tb_stringf(x, line, TB_WHITE, TB_DEFAULT, "%20s", iK(state) /*agent name*/);
+ x += 21;
+ u32 color, bgcolor;
+ if (eqG($(d, "state"), "alive")) {
+ color = TB_WHITE;
+ bgcolor = TB_BLACK;
+ }
+ elif (eqG($(d, "state"), "init")) {
+ color = TB_GREEN;
+ bgcolor = TB_BLACK;
+ }
+ elif (eqG($(d, "state"), "down")) {
+ color = TB_WHITE;
+ bgcolor = TB_RED;
+ }
+ else {
+ color = TB_BLACK;
+ bgcolor = TB_RED;
+ }
+ tb_stringf(x, line, color, bgcolor, "%5s", $(d, "state"));
+ x += 6;
+ cleanCharP(lastStateChange) = timeToYMDS(u$(d, "last"));
+ tb_stringf(x, line, TB_WHITE, TB_DEFAULT, "%19s", lastStateChange);
+ x += 20;
+ //tb_stringf(47, line, TB_WHITE, TB_DEFAULT, "%d", u$(d, "mId"));
+ char *rebootorNet = "";
+ cleanCharP(lastBoot) = null;
+ if (getG(d, rtBool, "rebooted") and u$(d, "lastBoot") > clearTime) {
+ color = TB_WHITE;
+ bgcolor = TB_RED;
+ rebootorNet = "rebooted";
+ lastBoot = timeToYMDS(u$(d, "lastBoot"));
+ }
+ elif (getG(d, rtBool, "net") and u$(d, "lastNet") > clearTime) {
+ u64 t = u$(d, "lastNet");
+ // check network issues
+ color = TB_WHITE;
+ bgcolor = TB_RED;
+ rebootorNet = "network";
+ lastBoot = timeToYMDS(u$(d, "lastNet"));
+ }
+ else {
+ color = TB_WHITE;
+ bgcolor = TB_BLACK;
+ lastBoot = timeToYMDS(u$(d, "lastBoot"));
+ }
+ tb_stringf(x, line, color, bgcolor, "%8s", rebootorNet);
+ x += 19;
+ tb_stringf(x, line, TB_WHITE, TB_DEFAULT, "%19s", lastBoot);
+ x += 20;
+ cleanFinishSmallArrayP(probes) = getG(d, rtSmallArrayt, "probes");
+ if (probes) {
+ iter(probes, P) {
+ cast(smallDictt*,p,P);
+ if (getG(p, rtBool, "state")) {
+ // service down
+ color = TB_WHITE;
+ bgcolor = TB_RED;
+ }
+ else {
+ // service up
+ color = TB_WHITE;
+ bgcolor = TB_BLACK;
+ }
+ tb_stringf(x, line, color, bgcolor, "port %5d", u$(p, "port"));
+ x += 11;
+ }
+ }
+ inc line;
+ }
break;
}
tb_present();
@@ -768,6 +880,7 @@ void probe(char *cfgfile) {
// timeout
goto cont;
}
+ // TODO check packet size is correct
else {
// got a message
//lv(r);
@@ -806,21 +919,9 @@ void probe(char *cfgfile) {
/* logE("Packet from %s with ip %s came too early.", agents[m->id], ip); */
/* } */
/* setG(agent, "mono", time); */
+ bool loggerRunning = hasG(agent, "time");
setG(agent, "time", getCurrentUnixTime());
char *newstate = m->messageId ? "alive" : "init";
- if (!eqG($(agent, "state"), newstate)) {
- saveEvent = yes;
- if (eqG($(agent, "state"), "alive")) {
- setG(agent, "rebooted", TRUE);
- setG(agent, "lastBoot", getCurrentUnixTime());
- // send mail when agent rebooted
- cleanCharP(s) = formatS("%s rebooted", agents[m->id]);
- pushG(mailMsg, s);
- logW("%s", s);
- }
- setG(agent, "state", newstate);
- setG(agent, "last", getCurrentUnixTime());
- }
if (m->messageId < u$(agent,"mId")) {
saveEvent = yes;
setG(agent, "rebooted", TRUE);
@@ -830,7 +931,30 @@ void probe(char *cfgfile) {
pushG(mailMsg, s);
logW("%s", s);
}
+ elif ( eqG($(agent, "state"), "down")
+ and eqG(newstate, "alive")
+ and loggerRunning) {
+ // there is a network issue when the agent was down
+ // and messageId is higher than the last received messageId
+ // (it didn't reboot)
+ // when starting logger, agent state is down
+ // if the agents started running before logger, the agent states are alive
+ // in that case mails are not sent because loggerRunning is false
+ saveEvent = yes;
+ setG(agent, "net", TRUE);
+ setG(agent, "lastNet", getCurrentUnixTime());
+ // send mail when agent has network issues
+ cleanCharP(s) = formatS("%s has network issues", agents[m->id]);
+ pushG(mailMsg, s);
+ logW("%s", s);
+ }
setG(agent, "mId", m->messageId);
+ // update state
+ if (!eqG($(agent, "state"), newstate)) {
+ saveEvent = yes;
+ setG(agent, "state", newstate);
+ setG(agent, "last", getCurrentUnixTime());
+ }
cleanFinishSmallArrayP(probes) = getG(agent, rtSmallArrayt, "probes");
if (probes) {
iter(probes, P) {