From 139ac87245fd1ca18d60a0efca32b50e4c1d8730 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 23 Jul 2012 14:00:42 -0400 Subject: [PATCH] Fix: Multiple health monitoring fixes * Fix modulo operation bug on #define HEALTH_IS_IN_CODE(x) (x % HEALTH_POLL_VALUE) which is causing the check to think it is never within code. (x % 1 always equals 0). Simplify this by using a simple & on the poll value, and remove the IS_IN_CODE, using ! on IS_IN_POLL instead (which removes nothing to clarity). * Atomic operations should apply to at most "unsigned long" (32-bit on 32-bit arch) rather than uint64_t. * Separate the "error" condition from the counters. We clearly cannot use the "0" value as an error on 32-bit counters anymore, because they can easily wrap. * Introduce "exit" condition, will be useful for state tracking in the future. Error and exit conditions implemented as flags. * Add "APP_MANAGE" in addition to "APP_REG" health check, to monitor the app registration thread (which was missing, only the app manager thread was checked, under the name "APP_REG", which was misleading). * Remove bogus usage of uatomic_xchg() in health_check_state(): It is not needed to update the "last" value, since the last value is read and written to by a single thread. Moreover, this specific use of xchg was not exchanging anything: it was just setting the last value to the "current" one, and doing nothing with the return value. Whatever was expected to be achieved by using uatomic_xchg() clearly wasn't. * Because the health check thread could still be answering a request concurrently sessiond teardown, we need to ensure that all threads only set the "error" condition if they reach teardown paths due to an actual error, not on "normal" teardown condition (thread quit pipe being closed). Flagging threads as being in error condition upon all exit paths would lead to false "errors" sent to the client, which we want to avoid, since the client could then think it needs to kill a sessiond when the sessiond might be in the process of gracefully restarting. Signed-off-by: Mathieu Desnoyers Signed-off-by: David Goulet --- include/lttng/lttng.h | 1 + src/bin/lttng-sessiond/health.c | 30 ++++---- src/bin/lttng-sessiond/health.h | 56 ++++++++++----- src/bin/lttng-sessiond/main.c | 122 ++++++++++++++++++++++---------- 4 files changed, 139 insertions(+), 70 deletions(-) diff --git a/include/lttng/lttng.h b/include/lttng/lttng.h index 6823579a9..7754bcdbc 100644 --- a/include/lttng/lttng.h +++ b/include/lttng/lttng.h @@ -134,6 +134,7 @@ enum lttng_calibrate_type { /* Health component for the health check function. */ enum lttng_health_component { LTTNG_HEALTH_CMD, + LTTNG_HEALTH_APP_MANAGE, LTTNG_HEALTH_APP_REG, LTTNG_HEALTH_KERNEL, LTTNG_HEALTH_CONSUMER, diff --git a/src/bin/lttng-sessiond/health.c b/src/bin/lttng-sessiond/health.c index 58f804eb9..b9a3ba56a 100644 --- a/src/bin/lttng-sessiond/health.c +++ b/src/bin/lttng-sessiond/health.c @@ -32,33 +32,33 @@ */ int health_check_state(struct health_state *state) { - int ret; - uint64_t current; - uint64_t last; + unsigned long current, last; + int ret = 1; assert(state); + last = state->last; current = uatomic_read(&state->current); - last = uatomic_read(&state->last); /* - * Here are the conditions for a bad health. Current state set to 0 or the - * current state is the same as the last one and we are NOT waiting for a - * poll() call. + * Here are the conditions for a bad health. Either flag HEALTH_ERROR is + * set, or the progress counter is the same as the last one and we are NOT + * waiting for a poll() call. */ - if (current == 0 || (current == last && HEALTH_IS_IN_CODE(current))) { + if ((uatomic_read(&state->flags) & HEALTH_ERROR) || + (current == last && !HEALTH_IS_IN_POLL(current))) { + /* error */ ret = 0; - goto error; } - /* All good */ - ret = 1; - -error: DBG("Health state current %" PRIu64 ", last %" PRIu64 ", ret %d", current, last, ret); - /* Exchange current state counter into last one */ - uatomic_xchg(&state->last, state->current); + /* + * Update last counter. This value is and MUST be access only in this + * function. + */ + state->last = current; + return ret; } diff --git a/src/bin/lttng-sessiond/health.h b/src/bin/lttng-sessiond/health.h index 9a1ef391a..b26886015 100644 --- a/src/bin/lttng-sessiond/health.h +++ b/src/bin/lttng-sessiond/health.h @@ -25,20 +25,35 @@ * These are the value added to the current state depending of the position in * the thread where is either waiting on a poll() or running in the code. */ -#define HEALTH_POLL_VALUE 1 -#define HEALTH_CODE_VALUE 2 +#define HEALTH_POLL_VALUE (1UL << 0) +#define HEALTH_CODE_VALUE (1UL << 1) -#define HEALTH_IS_IN_POLL(x) (x % HEALTH_CODE_VALUE) -#define HEALTH_IS_IN_CODE(x) (x % HEALTH_POLL_VALUE) +#define HEALTH_IS_IN_POLL(x) ((x) & HEALTH_POLL_VALUE) + +enum health_flags { + HEALTH_EXIT = (1U << 0), + HEALTH_ERROR = (1U << 1), +}; struct health_state { - uint64_t last; - uint64_t current; + /* + * last counter is only read and updated by the health_check + * thread (single updater). + */ + unsigned long last; + /* + * current and flags are updated by multiple threads concurrently. + */ + unsigned long current; /* progress counter, updated atomically */ + enum health_flags flags; /* other flags, updated atomically */ }; /* Health state counters for the client command thread */ extern struct health_state health_thread_cmd; +/* Health state counters for the application management thread */ +extern struct health_state health_thread_app_manage; + /* Health state counters for the application registration thread */ extern struct health_state health_thread_app_reg; @@ -46,36 +61,41 @@ extern struct health_state health_thread_app_reg; extern struct health_state health_thread_kernel; /* - * Update current counter by 1 to indicate that the thread is in a blocking - * state cause by a poll(). + * Update current counter by 1 to indicate that the thread entered or + * left a blocking state caused by a poll(). */ static inline void health_poll_update(struct health_state *state) { assert(state); - uatomic_add(&state->current, HEALTH_POLL_VALUE); } /* - * Update current counter by 2 which indicates that we are currently running in - * a thread and NOT blocked at a poll(). + * Update current counter by 2 indicates progress in execution of a + * thread. */ static inline void health_code_update(struct health_state *state) { assert(state); - uatomic_add(&state->current, HEALTH_CODE_VALUE); } /* - * Reset health state. A value of zero indicate a bad health state. + * Set health "exit" flag. */ -static inline void health_reset(struct health_state *state) +static inline void health_exit(struct health_state *state) { assert(state); + uatomic_or(&state->flags, HEALTH_EXIT); +} - uatomic_set(&state->current, 0); - uatomic_set(&state->last, 0); +/* + * Set health "error" flag. + */ +static inline void health_error(struct health_state *state) +{ + assert(state); + uatomic_or(&state->flags, HEALTH_ERROR); } /* @@ -84,9 +104,9 @@ static inline void health_reset(struct health_state *state) static inline void health_init(struct health_state *state) { assert(state); - uatomic_set(&state->last, 0); - uatomic_set(&state->current, HEALTH_CODE_VALUE); + uatomic_set(&state->current, 0); + uatomic_set(&state->flags, 0); } int health_check_state(struct health_state *state); diff --git a/src/bin/lttng-sessiond/main.c b/src/bin/lttng-sessiond/main.c index 5664b0be1..f6d560c33 100644 --- a/src/bin/lttng-sessiond/main.c +++ b/src/bin/lttng-sessiond/main.c @@ -216,6 +216,7 @@ static unsigned int relayd_net_seq_idx; /* Used for the health monitoring of the session daemon. See health.h */ struct health_state health_thread_cmd; +struct health_state health_thread_app_manage; struct health_state health_thread_app_reg; struct health_state health_thread_kernel; @@ -716,7 +717,7 @@ static void update_ust_app(int app_sock) */ static void *thread_manage_kernel(void *data) { - int ret, i, pollfd, update_poll_flag = 1; + int ret, i, pollfd, update_poll_flag = 1, err = -1; uint32_t revents, nb_fd; char tmp; struct lttng_poll_event events; @@ -789,7 +790,8 @@ static void *thread_manage_kernel(void *data) /* Thread quit pipe has been closed. Killing thread. */ ret = check_thread_quit_pipe(pollfd, revents); if (ret) { - goto error; + err = 0; + goto exit; } /* Check for data on kernel pipe */ @@ -817,10 +819,15 @@ static void *thread_manage_kernel(void *data) } } +exit: error: lttng_poll_clean(&events); error_poll_create: - health_reset(&health_thread_kernel); + if (err) { + health_error(&health_thread_kernel); + ERR("Health error occurred in %s", __func__); + } + health_exit(&health_thread_kernel); DBG("Kernel thread dying"); return NULL; } @@ -830,7 +837,7 @@ error_poll_create: */ static void *thread_manage_consumer(void *data) { - int sock = -1, i, ret, pollfd; + int sock = -1, i, ret, pollfd, err = -1; uint32_t revents, nb_fd; enum lttcomm_return_code code; struct lttng_poll_event events; @@ -888,7 +895,8 @@ restart: /* Thread quit pipe has been closed. Killing thread. */ ret = check_thread_quit_pipe(pollfd, revents); if (ret) { - goto error; + err = 0; + goto exit; } /* Event on the registration socket */ @@ -976,7 +984,8 @@ restart_poll: /* Thread quit pipe has been closed. Killing thread. */ ret = check_thread_quit_pipe(pollfd, revents); if (ret) { - goto error; + err = 0; + goto exit; } /* Event on the kconsumerd socket */ @@ -1000,6 +1009,7 @@ restart_poll: ERR("consumer return code : %s", lttcomm_get_readable_code(-code)); +exit: error: /* Immediately set the consumerd state to stopped */ if (consumer_data->type == LTTNG_CONSUMER_KERNEL) { @@ -1038,7 +1048,11 @@ error: lttng_poll_clean(&events); error_poll: error_listen: - health_reset(&consumer_data->health); + if (err) { + health_error(&consumer_data->health); + ERR("Health error occurred in %s", __func__); + } + health_exit(&consumer_data->health); DBG("consumer thread cleanup completed"); return NULL; @@ -1049,7 +1063,7 @@ error_listen: */ static void *thread_manage_apps(void *data) { - int i, ret, pollfd; + int i, ret, pollfd, err = -1; uint32_t revents, nb_fd; struct ust_command ust_cmd; struct lttng_poll_event events; @@ -1059,7 +1073,7 @@ static void *thread_manage_apps(void *data) rcu_register_thread(); rcu_thread_online(); - health_code_update(&health_thread_app_reg); + health_code_update(&health_thread_app_manage); ret = create_thread_poll_set(&events, 2); if (ret < 0) { @@ -1071,7 +1085,7 @@ static void *thread_manage_apps(void *data) goto error; } - health_code_update(&health_thread_app_reg); + health_code_update(&health_thread_app_manage); while (1) { /* Zeroed the events structure */ @@ -1083,9 +1097,9 @@ static void *thread_manage_apps(void *data) /* Inifinite blocking call, waiting for transmission */ restart: - health_poll_update(&health_thread_app_reg); + health_poll_update(&health_thread_app_manage); ret = lttng_poll_wait(&events, -1); - health_poll_update(&health_thread_app_reg); + health_poll_update(&health_thread_app_manage); if (ret < 0) { /* * Restart interrupted system call. @@ -1101,12 +1115,13 @@ static void *thread_manage_apps(void *data) revents = LTTNG_POLL_GETEV(&events, i); pollfd = LTTNG_POLL_GETFD(&events, i); - health_code_update(&health_thread_app_reg); + health_code_update(&health_thread_app_manage); /* Thread quit pipe has been closed. Killing thread. */ ret = check_thread_quit_pipe(pollfd, revents); if (ret) { - goto error; + err = 0; + goto exit; } /* Inspect the apps cmd pipe */ @@ -1122,7 +1137,7 @@ static void *thread_manage_apps(void *data) goto error; } - health_code_update(&health_thread_app_reg); + health_code_update(&health_thread_app_manage); /* Register applicaton to the session daemon */ ret = ust_app_register(&ust_cmd.reg_msg, @@ -1133,7 +1148,7 @@ static void *thread_manage_apps(void *data) break; } - health_code_update(&health_thread_app_reg); + health_code_update(&health_thread_app_manage); /* * Validate UST version compatibility. @@ -1147,7 +1162,7 @@ static void *thread_manage_apps(void *data) update_ust_app(ust_cmd.sock); } - health_code_update(&health_thread_app_reg); + health_code_update(&health_thread_app_manage); ret = ust_app_register_done(ust_cmd.sock); if (ret < 0) { @@ -1172,7 +1187,7 @@ static void *thread_manage_apps(void *data) ust_cmd.sock); } - health_code_update(&health_thread_app_reg); + health_code_update(&health_thread_app_manage); break; } @@ -1194,14 +1209,19 @@ static void *thread_manage_apps(void *data) } } - health_code_update(&health_thread_app_reg); + health_code_update(&health_thread_app_manage); } } +exit: error: lttng_poll_clean(&events); error_poll_create: - health_reset(&health_thread_app_reg); + if (err) { + health_error(&health_thread_app_manage); + ERR("Health error occurred in %s", __func__); + } + health_exit(&health_thread_app_manage); DBG("Application communication apps thread cleanup complete"); rcu_thread_offline(); rcu_unregister_thread(); @@ -1277,7 +1297,7 @@ error: */ static void *thread_registration_apps(void *data) { - int sock = -1, i, ret, pollfd; + int sock = -1, i, ret, pollfd, err = -1; uint32_t revents, nb_fd; struct lttng_poll_event events; /* @@ -1323,7 +1343,9 @@ static void *thread_registration_apps(void *data) /* Inifinite blocking call, waiting for transmission */ restart: + health_poll_update(&health_thread_app_reg); ret = lttng_poll_wait(&events, -1); + health_poll_update(&health_thread_app_reg); if (ret < 0) { /* * Restart interrupted system call. @@ -1335,6 +1357,8 @@ static void *thread_registration_apps(void *data) } for (i = 0; i < nb_fd; i++) { + health_code_update(&health_thread_app_reg); + /* Fetch once the poll data */ revents = LTTNG_POLL_GETEV(&events, i); pollfd = LTTNG_POLL_GETFD(&events, i); @@ -1342,7 +1366,8 @@ static void *thread_registration_apps(void *data) /* Thread quit pipe has been closed. Killing thread. */ ret = check_thread_quit_pipe(pollfd, revents); if (ret) { - goto error; + err = 0; + goto exit; } /* Event on the registration socket */ @@ -1378,6 +1403,7 @@ static void *thread_registration_apps(void *data) sock = -1; continue; } + health_code_update(&health_thread_app_reg); ret = lttcomm_recv_unix_sock(sock, &ust_cmd->reg_msg, sizeof(struct ust_register_msg)); if (ret < 0 || ret < sizeof(struct ust_register_msg)) { @@ -1395,6 +1421,7 @@ static void *thread_registration_apps(void *data) sock = -1; continue; } + health_code_update(&health_thread_app_reg); ust_cmd->sock = sock; sock = -1; @@ -1422,7 +1449,14 @@ static void *thread_registration_apps(void *data) } } +exit: error: + if (err) { + health_error(&health_thread_app_reg); + ERR("Health error occurred in %s", __func__); + } + health_exit(&health_thread_app_reg); + /* Notify that the registration thread is gone */ notify_ust_apps(0); @@ -1742,15 +1776,15 @@ error: } /* - * Compute health status of each consumer. + * Compute health status of each consumer. If one of them is zero (bad + * state), we return 0. */ static int check_consumer_health(void) { int ret; - ret = - health_check_state(&kconsumer_data.health) & - health_check_state(&ustconsumer32_data.health) & + ret = health_check_state(&kconsumer_data.health) && + health_check_state(&ustconsumer32_data.health) && health_check_state(&ustconsumer64_data.health); DBG3("Health consumer check %d", ret); @@ -4627,7 +4661,7 @@ init_setup_error: */ static void *thread_manage_health(void *data) { - int sock = -1, new_sock, ret, i, pollfd; + int sock = -1, new_sock, ret, i, pollfd, err = -1; uint32_t revents, nb_fd; struct lttng_poll_event events; struct lttcomm_health_msg msg; @@ -4691,7 +4725,8 @@ restart: /* Thread quit pipe has been closed. Killing thread. */ ret = check_thread_quit_pipe(pollfd, revents); if (ret) { - goto error; + err = 0; + goto exit; } /* Event on the registration socket */ @@ -4726,6 +4761,9 @@ restart: case LTTNG_HEALTH_CMD: reply.ret_code = health_check_state(&health_thread_cmd); break; + case LTTNG_HEALTH_APP_MANAGE: + reply.ret_code = health_check_state(&health_thread_app_manage); + break; case LTTNG_HEALTH_APP_REG: reply.ret_code = health_check_state(&health_thread_app_reg); break; @@ -4736,13 +4774,12 @@ restart: reply.ret_code = check_consumer_health(); break; case LTTNG_HEALTH_ALL: - ret = check_consumer_health(); - reply.ret_code = - health_check_state(&health_thread_app_reg) & - health_check_state(&health_thread_cmd) & - health_check_state(&health_thread_kernel) & - ret; + health_check_state(&health_thread_app_manage) && + health_check_state(&health_thread_app_reg) && + health_check_state(&health_thread_cmd) && + health_check_state(&health_thread_kernel) && + check_consumer_health(); break; default: reply.ret_code = LTTCOMM_UND; @@ -4774,7 +4811,11 @@ restart: new_sock = -1; } +exit: error: + if (err) { + ERR("Health error occurred in %s", __func__); + } DBG("Health check thread dying"); unlink(health_unix_sock_path); if (sock >= 0) { @@ -4802,7 +4843,7 @@ error: */ static void *thread_manage_clients(void *data) { - int sock = -1, ret, i, pollfd; + int sock = -1, ret, i, pollfd, err = -1; int sock_error; uint32_t revents, nb_fd; struct command_ctx *cmd_ctx = NULL; @@ -4873,7 +4914,8 @@ static void *thread_manage_clients(void *data) /* Thread quit pipe has been closed. Killing thread. */ ret = check_thread_quit_pipe(pollfd, revents); if (ret) { - goto error; + err = 0; + goto exit; } /* Event on the registration socket */ @@ -4993,8 +5035,13 @@ static void *thread_manage_clients(void *data) health_code_update(&health_thread_cmd); } +exit: error: - health_reset(&health_thread_cmd); + if (err) { + health_error(&health_thread_cmd); + ERR("Health error occurred in %s", __func__); + } + health_exit(&health_thread_cmd); DBG("Client thread dying"); unlink(client_unix_sock_path); @@ -5740,6 +5787,7 @@ int main(int argc, char **argv) /* Init all health thread counters. */ health_init(&health_thread_cmd); health_init(&health_thread_kernel); + health_init(&health_thread_app_manage); health_init(&health_thread_app_reg); /* -- 2.34.1