Fix: support duplicate health type
[lttng-tools.git] / src / bin / lttng-sessiond / health.c
index 6c4de9430793fa1d8508c06dab17e88af0b6bcae..dcf3b96e2b48c7fc4030005818d2bb92b4dda71a 100644 (file)
@@ -32,6 +32,45 @@ static const struct timespec time_delta = {
        .tv_nsec = DEFAULT_HEALTH_CHECK_DELTA_NS,
 };
 
+/* Define TLS health state. */
+DEFINE_URCU_TLS(struct health_state, health_state);
+
+/*
+ * This mutex ensures that the TLS memory used for the node and its container
+ * structure is not reclaimed after the TLS owner thread exits until we have
+ * finished using it.
+ */
+static pthread_mutex_t health_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+static struct health_tls_state_list health_state_list = {
+       .head = CDS_LIST_HEAD_INIT(health_state_list.head),
+};
+
+/*
+ * This keeps track of the error state of unregistered threads. A thread
+ * reporting a health error normally unregisters and quits, which makes its TLS
+ * health state unavailable to health_check_state(). On unregister, we
+ * therefore update this global error array so that we can still tell which
+ * health type was in error after the TLS health state has been removed.
+ */
+static enum health_flags global_error_state[HEALTH_NUM_TYPE];
+
+/*
+ * Lock health state global list mutex.
+ */
+static void state_lock(void)
+{
+       pthread_mutex_lock(&health_mutex);
+}
+
+/*
+ * Unlock health state global list mutex.
+ */
+static void state_unlock(void)
+{
+       pthread_mutex_unlock(&health_mutex);
+}
+
 /*
  * Set time difference in res from time_a and time_b.
  */
@@ -43,7 +82,7 @@ static void time_diff(const struct timespec *time_a,
                res->tv_nsec = 1000000000L + time_a->tv_sec - time_b->tv_sec;
        } else {
                res->tv_sec = time_a->tv_sec - time_b->tv_sec;
-               res->tv_nsec = time_a->tv_sec - time_b->tv_sec;
+               res->tv_nsec = time_a->tv_nsec - time_b->tv_nsec;
        }
 }
 
@@ -68,11 +107,11 @@ static int time_diff_gt(const struct timespec *time_a,
 }
 
 /*
- * Check health of a specific health state counter.
+ * Validate health state. Checks for the error flag or health conditions.
  *
  * Return 0 if health is bad or else 1.
  */
-int health_check_state(struct health_state *state)
+static int validate_state(struct health_state *state)
 {
        int retval = 1, ret;
        unsigned long current, last;
@@ -84,7 +123,7 @@ int health_check_state(struct health_state *state)
        current = uatomic_read(&state->current);
 
        ret = clock_gettime(CLOCK_MONOTONIC, &current_time);
-       if (ret) {
+       if (ret < 0) {
                PERROR("Error reading time\n");
                /* error */
                retval = 0;
@@ -119,12 +158,98 @@ int health_check_state(struct health_state *state)
                        /* update last counter and last sample time */
                        state->last = current;
                        memcpy(&state->last_time, &current_time, sizeof(current_time));
+
+                       /* On error, stop right now and notify caller. */
+                       if (retval == 0) {
+                               goto end;
+                       }
                }
        }
 
 end:
-       DBG("Health state current %" PRIu64 ", last %" PRIu64 ", ret %d",
+       DBG("Health state current %lu, last %lu, ret %d",
                        current, last, ret);
+       return retval;
+}
 
+/*
+ * Check health of a specific health type. Note that if a thread has not yet
+ * initialized its health subsystem or has quit, it's considered in a good
+ * state.
+ *
+ * Return 0 if health is bad or else 1.
+ */
+int health_check_state(enum health_type type)
+{
+       int retval = 1;
+       struct health_state *state;
+
+       assert(type < HEALTH_NUM_TYPE);
+
+       state_lock();
+
+       cds_list_for_each_entry(state, &health_state_list.head, node) {
+               int ret;
+
+               if (state->type != type) {
+                       continue;
+               }
+
+               ret = validate_state(state);
+               if (!ret) {
+                       retval = 0;
+                       goto end;
+               }
+       }
+
+       /* Check the global state since some state might not be visible anymore. */
+       if (global_error_state[type] & HEALTH_ERROR) {
+               retval = 0;
+       }
+
+end:
+       state_unlock();
+
+       DBG("Health check for type %d is %s", (int) type,
+                       (retval == 0) ? "BAD" : "GOOD");
        return retval;
 }
+
+/*
+ * Init health state.
+ */
+void health_register(enum health_type type)
+{
+       assert(type < HEALTH_NUM_TYPE);
+
+       /* Init TLS state. */
+       uatomic_set(&URCU_TLS(health_state).last, 0);
+       uatomic_set(&URCU_TLS(health_state).last_time.tv_sec, 0);
+       uatomic_set(&URCU_TLS(health_state).last_time.tv_nsec, 0);
+       uatomic_set(&URCU_TLS(health_state).current, 0);
+       uatomic_set(&URCU_TLS(health_state).flags, 0);
+       uatomic_set(&URCU_TLS(health_state).type, type);
+
+       /* Add it to the global TLS state list. */
+       state_lock();
+       cds_list_add(&URCU_TLS(health_state).node, &health_state_list.head);
+       state_unlock();
+}
+
+/*
+ * Remove node from global list.
+ */
+void health_unregister(void)
+{
+       state_lock();
+       /*
+        * On error, set the global_error_state since we are about to remove
+        * the node from the global list.
+        */
+       if (uatomic_read(&URCU_TLS(health_state).flags) & HEALTH_ERROR) {
+               uatomic_set(&global_error_state[URCU_TLS(health_state).type],
+                               HEALTH_ERROR);
+       }
+       cds_list_del(&URCU_TLS(health_state).node);
+       state_unlock();
+}
This page took 0.025409 seconds and 4 git commands to generate.