Fix: health subsystem issues with shared code
diff --git a/src/bin/lttng-sessiond/health.c b/src/bin/lttng-sessiond/health.c
index b9a3ba56aa49f25b4605bcdab70c256db208394f..b36ddc4a337b6bc6c5aa389d45c004a0c3be9580 100644
 #include <inttypes.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <time.h>
 
+#include <common/defaults.h>
 #include <common/error.h>
 
 #include "health.h"
 
+static const struct timespec time_delta = {
+       .tv_sec = DEFAULT_HEALTH_CHECK_DELTA_S,
+       .tv_nsec = DEFAULT_HEALTH_CHECK_DELTA_NS,
+};
+
+/* Define TLS health state. */
+DEFINE_URCU_TLS(struct health_state, health_state);
+
+/*
+ * Mutex protecting the global health state list. It also ensures that the
+ * TLS memory used for a node and its container structure is not reclaimed
+ * after the TLS owner thread exits, until we have finished using it.
+ */
+static pthread_mutex_t health_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+static struct health_tls_state_list health_state_list = {
+       .head = CDS_LIST_HEAD_INIT(health_state_list.head),
+};
+
+/*
+ * This keeps track of the error state for unregistered threads. A thread
+ * reporting a health error normally unregisters and quits, which makes its
+ * TLS health state unavailable to health_check_state(). On unregister, we
+ * therefore update this global error array so we can still tell which
+ * threads were in error after their TLS health state has been removed.
+ */
+static enum health_flags global_error_state[HEALTH_NUM_TYPE];
+
+/*
+ * Lock health state global list mutex.
+ */
+static void state_lock(void)
+{
+       pthread_mutex_lock(&health_mutex);
+}
+
+/*
+ * Unlock health state global list mutex.
+ */
+static void state_unlock(void)
+{
+       pthread_mutex_unlock(&health_mutex);
+}
+
+/*
+ * Set res to the time difference time_a - time_b, normalizing tv_nsec.
+ */
+static void time_diff(const struct timespec *time_a,
+               const struct timespec *time_b, struct timespec *res)
+{
+       if (time_a->tv_nsec - time_b->tv_nsec < 0) {
+               res->tv_sec = time_a->tv_sec - time_b->tv_sec - 1;
+               res->tv_nsec = 1000000000L + time_a->tv_nsec - time_b->tv_nsec;
+       } else {
+               res->tv_sec = time_a->tv_sec - time_b->tv_sec;
+               res->tv_nsec = time_a->tv_nsec - time_b->tv_nsec;
+       }
+}
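
As a quick sanity check of the borrow logic above, here is a minimal, illustrative example (not part of this commit) with made-up time values:

    /* Illustrative only: exercises the nanosecond borrow path of time_diff(). */
    struct timespec a = { .tv_sec = 5, .tv_nsec = 100000000L };     /* 5.1 s */
    struct timespec b = { .tv_sec = 3, .tv_nsec = 900000000L };     /* 3.9 s */
    struct timespec res;

    time_diff(&a, &b, &res);
    /* Expected: res.tv_sec == 1, res.tv_nsec == 200000000L, i.e. 1.2 s. */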
+
+/*
+ * Return true if time_a - time_b > diff, else false.
+ */
+static int time_diff_gt(const struct timespec *time_a,
+               const struct timespec *time_b, const struct timespec *diff)
+{
+       struct timespec res;
+
+       time_diff(time_a, time_b, &res);
+       time_diff(&res, diff, &res);
+
+       if (res.tv_sec > 0) {
+               return 1;
+       } else if (res.tv_sec == 0 && res.tv_nsec > 0) {
+               return 1;
+       }
+
+       return 0;
+}
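
A hedged usage sketch of time_diff_gt() against the module's time_delta, mirroring how health_check_state() below decides whether enough time has elapsed to judge progress (error handling of clock_gettime() is omitted for brevity):

    /* Illustrative only: has more than the configured delta elapsed? */
    struct timespec now, last_sample;

    clock_gettime(CLOCK_MONOTONIC, &last_sample);
    /* ... thread does some work ... */
    clock_gettime(CLOCK_MONOTONIC, &now);

    if (time_diff_gt(&now, &last_sample, &time_delta)) {
            /* More than DEFAULT_HEALTH_CHECK_DELTA_S/NS has passed. */
    }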
+
+/*
+ * Health mutex MUST be held across use of the returned struct health_state to
+ * provide existence guarantee.
+ *
+ * Return the health_state object or NULL if not found.
+ */
+static struct health_state *find_health_state(enum health_type type)
+{
+       struct health_state *state;
+
+       /* Find the right health state in the global TLS list. */
+       cds_list_for_each_entry(state, &health_state_list.head, node) {
+               if (state->type == type) {
+                       return state;
+               }
+       }
+
+       return NULL;
+}
+
 /*
- * Check health of a specific health state counter.
+ * Check health of a specific health type. Note that a thread which has not
+ * yet initialized its health subsystem, or which has already quit, is
+ * considered to be in a good state.
  *
  * Return 0 if health is bad or else 1.
  */
-int health_check_state(struct health_state *state)
+int health_check_state(enum health_type type)
 {
+       int retval = 1, ret;
        unsigned long current, last;
-       int ret = 1;
+       struct timespec current_time;
+       struct health_state *state;
+
+       assert(type < HEALTH_NUM_TYPE);
 
-       assert(state);
+       state_lock();
+
+       state = find_health_state(type);
+       if (!state) {
+               /* Check the global state since the state is not visible anymore. */
+               if (global_error_state[type] & HEALTH_ERROR) {
+                       retval = 0;
+               }
+               goto not_found;
+       }
 
        last = state->last;
        current = uatomic_read(&state->current);
 
+       ret = clock_gettime(CLOCK_MONOTONIC, &current_time);
+       if (ret < 0) {
+               PERROR("Error reading time\n");
+               /* error */
+               retval = 0;
+               goto end;
+       }
+
        /*
-        * Here are the conditions for a bad health. Either flag HEALTH_ERROR is
-        * set, or the progress counter is the same as the last one and we are NOT
-        * waiting for a poll() call.
+        * Thread is in bad health if flag HEALTH_ERROR is set. It is also in
+        * bad health if, after the delta delay has passed, its progress
+        * counter has not moved and it has NOT been waiting for a poll() call.
         */
-       if ((uatomic_read(&state->flags) & HEALTH_ERROR) ||
-                       (current == last && !HEALTH_IS_IN_POLL(current))) {
-               /* error */
-               ret = 0;
+       if (uatomic_read(&state->flags) & HEALTH_ERROR) {
+               retval = 0;
+               goto end;
        }
 
-       DBG("Health state current %" PRIu64 ", last %" PRIu64 ", ret %d",
+       /*
+        * The initial condition only updates the last counter and sample time;
+        * it must not check health, because we do not yet know how much time
+        * has passed.
+        */
+       if (state->last_time.tv_sec == 0 && state->last_time.tv_nsec == 0) {
+               /* update last counter and last sample time */
+               state->last = current;
+               memcpy(&state->last_time, &current_time, sizeof(current_time));
+       } else {
+               if (time_diff_gt(&current_time, &state->last_time, &time_delta)) {
+                       if (current == last && !HEALTH_IS_IN_POLL(current)) {
+                               /* error */
+                               retval = 0;
+                       }
+                       /* update last counter and last sample time */
+                       state->last = current;
+                       memcpy(&state->last_time, &current_time, sizeof(current_time));
+               }
+       }
+
+end:
+       DBG("Health state current %lu, last %lu, ret %d",
                        current, last, ret);
+not_found:
+       state_unlock();
+
+       return retval;
+}
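
For context, a minimal sketch (not part of this commit) of how a monitor might poll every registered type; the ERR() reporting shown is a simplified assumption based on common/error.h:

    /* Illustrative only: poll the health state of every known type. */
    int i;

    for (i = 0; i < HEALTH_NUM_TYPE; i++) {
            if (!health_check_state((enum health_type) i)) {
                    ERR("Thread of health type %d is in error", i);
            }
    }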
+
+/*
+ * Init health state.
+ */
+void health_register(enum health_type type)
+{
+       struct health_state *state;
+
+       assert(type < HEALTH_NUM_TYPE);
+
+       /* Init TLS state. */
+       uatomic_set(&URCU_TLS(health_state).last, 0);
+       uatomic_set(&URCU_TLS(health_state).last_time.tv_sec, 0);
+       uatomic_set(&URCU_TLS(health_state).last_time.tv_nsec, 0);
+       uatomic_set(&URCU_TLS(health_state).current, 0);
+       uatomic_set(&URCU_TLS(health_state).flags, 0);
+       uatomic_set(&URCU_TLS(health_state).type, type);
 
+       /* Add it to the global TLS state list. */
+       state_lock();
+       state = find_health_state(type);
        /*
-        * Update last counter. This value is and MUST be access only in this
-        * function.
+        * Duplicates are not accepted, since lookups don't handle them at the
+        * moment.
         */
-       state->last = current;
+       assert(!state);
 
-       return ret;
+       cds_list_add(&URCU_TLS(health_state).node, &health_state_list.head);
+       state_unlock();
+}
+
+/*
+ * Remove node from global list.
+ */
+void health_unregister(void)
+{
+       state_lock();
+       /*
+        * On error, set the global_error_state since we are about to remove
+        * the node from the global list.
+        */
+       if (uatomic_read(&URCU_TLS(health_state).flags) & HEALTH_ERROR) {
+               uatomic_set(&global_error_state[URCU_TLS(health_state).type],
+                               HEALTH_ERROR);
+       }
+       cds_list_del(&URCU_TLS(health_state).node);
+       state_unlock();
 }
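
Putting register/unregister together, a hedged lifecycle sketch; thread_work() and HEALTH_TYPE_CMD are illustrative placeholders (HEALTH_TYPE_CMD is assumed to be one of the enum health_type values in health.h). The raw counter increment is a simplification: the real code should use the progress-update helpers from health.h, since the counter also encodes the poll flag:

    /* Illustrative only: typical thread lifecycle around the health API. */
    static void *thread_sketch(void *data)
    {
            health_register(HEALTH_TYPE_CMD);

            while (thread_work()) {
                    /*
                     * Advance the TLS progress counter so that
                     * health_check_state() sees forward progress
                     * between samples.
                     */
                    uatomic_inc(&URCU_TLS(health_state).current);
            }

            health_unregister();
            return NULL;
    }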