Fix: health subsystem issues with shared code
authorDavid Goulet <dgoulet@efficios.com>
Thu, 24 Jan 2013 19:20:23 +0000 (14:20 -0500)
committerDavid Goulet <dgoulet@efficios.com>
Thu, 24 Jan 2013 21:32:52 +0000 (16:32 -0500)
TLS memory is not used for health state of each thread. This commit
probably fixes bug428 as well.

The health_init/exit are renamed to health_register/unregister.

Fixes #411

Signed-off-by: David Goulet <dgoulet@efficios.com>
src/bin/lttng-sessiond/health.c
src/bin/lttng-sessiond/health.h
src/bin/lttng-sessiond/main.c

index d382c04b536cc060fba5cb181e5e156696264924..b36ddc4a337b6bc6c5aa389d45c004a0c3be9580 100644 (file)
@@ -32,6 +32,45 @@ static const struct timespec time_delta = {
        .tv_nsec = DEFAULT_HEALTH_CHECK_DELTA_NS,
 };
 
+/* Define TLS health state. */
+DEFINE_URCU_TLS(struct health_state, health_state);
+
+/*
+ * It ensures that TLS memory used for the node and its container structure
+ * don't get reclaimed after the TLS owner thread exits until we have finished
+ * using it.
+ */
+static pthread_mutex_t health_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+static struct health_tls_state_list health_state_list = {
+       .head = CDS_LIST_HEAD_INIT(health_state_list.head),
+};
+
+/*
+ * This keeps track of the error state for unregistered thread. A thread
+ * reporting a health error, normally unregisters and quits. This makes the TLS
+ * health state not available to the health_check_state() call so on unregister
+ * we update this global error array so we can keep track of which thread was
+ * on error if the TLS health state has been removed.
+ */
+static enum health_flags global_error_state[HEALTH_NUM_TYPE];
+
+/*
+ * Lock health state global list mutex.
+ */
+static void state_lock(void)
+{
+       pthread_mutex_lock(&health_mutex);
+}
+
+/*
+ * Unlock health state global list mutex.
+ */
+static void state_unlock(void)
+{
+       pthread_mutex_unlock(&health_mutex);
+}
+
 /*
  * Set time difference in res from time_a and time_b.
  */
@@ -68,17 +107,51 @@ static int time_diff_gt(const struct timespec *time_a,
 }
 
 /*
- * Check health of a specific health state counter.
+ * Health mutex MUST be held across use of the returned struct health_state to
+ * provide existence guarantee.
+ *
+ * Return the health_state object or NULL if not found.
+ */
+static struct health_state *find_health_state(enum health_type type)
+{
+       struct health_state *state;
+
+       /* Find the right health state in the global TLS list. */
+       cds_list_for_each_entry(state, &health_state_list.head, node) {
+               if (state->type == type) {
+                       return state;
+               }
+       }
+
+       return NULL;
+}
+
+/*
+ * Check health of a specific health type. Note that if a thread has not yet
+ * initialize its health subsystem or has quit, it's considered in a good
+ * state.
  *
  * Return 0 if health is bad or else 1.
  */
-int health_check_state(struct health_state *state)
+int health_check_state(enum health_type type)
 {
        int retval = 1, ret;
        unsigned long current, last;
        struct timespec current_time;
+       struct health_state *state;
+
+       assert(type < HEALTH_NUM_TYPE);
 
-       assert(state);
+       state_lock();
+
+       state = find_health_state(type);
+       if (!state) {
+               /* Check the global state since the state is not visiable anymore. */
+               if (global_error_state[type] & HEALTH_ERROR) {
+                       retval = 0;
+               }
+               goto not_found;
+       }
 
        last = state->last;
        current = uatomic_read(&state->current);
@@ -125,6 +198,56 @@ int health_check_state(struct health_state *state)
 end:
        DBG("Health state current %lu, last %lu, ret %d",
                        current, last, ret);
+not_found:
+       state_unlock();
 
        return retval;
 }
+
+/*
+ * Init health state.
+ */
+void health_register(enum health_type type)
+{
+       struct health_state *state;
+
+       assert(type < HEALTH_NUM_TYPE);
+
+       /* Init TLS state. */
+       uatomic_set(&URCU_TLS(health_state).last, 0);
+       uatomic_set(&URCU_TLS(health_state).last_time.tv_sec, 0);
+       uatomic_set(&URCU_TLS(health_state).last_time.tv_nsec, 0);
+       uatomic_set(&URCU_TLS(health_state).current, 0);
+       uatomic_set(&URCU_TLS(health_state).flags, 0);
+       uatomic_set(&URCU_TLS(health_state).type, type);
+
+       /* Add it to the global TLS state list. */
+       state_lock();
+       state = find_health_state(type);
+       /*
+        * Duplicates are not accepted, since lookups don't handle them at the
+        * moment.
+        */
+       assert(!state);
+
+       cds_list_add(&URCU_TLS(health_state).node, &health_state_list.head);
+       state_unlock();
+}
+
+/*
+ * Remove node from global list.
+ */
+void health_unregister(void)
+{
+       state_lock();
+       /*
+        * On error, set the global_error_state since we are about to remove
+        * the node from the global list.
+        */
+       if (uatomic_read(&URCU_TLS(health_state).flags) & HEALTH_ERROR) {
+               uatomic_set(&global_error_state[URCU_TLS(health_state).type],
+                               HEALTH_ERROR);
+       }
+       cds_list_del(&URCU_TLS(health_state).node);
+       state_unlock();
+}
index 19880fa377d0d61e192853a52c6154e301fdeaac..91a90706a77928acbcaa2fdd83bf8b958f69e7c1 100644 (file)
 
 #include <assert.h>
 #include <time.h>
+#include <pthread.h>
+#include <urcu/tls-compat.h>
 #include <urcu/uatomic.h>
+#include <urcu/list.h>
 
 /*
  * These are the value added to the current state depending of the position in
 #define HEALTH_IS_IN_POLL(x)   ((x) & HEALTH_POLL_VALUE)
 
 enum health_flags {
-       HEALTH_EXIT =  (1U << 0),
-       HEALTH_ERROR = (1U << 1),
+       HEALTH_ERROR = (1U << 0),
+};
+
+enum health_type {
+       HEALTH_TYPE_CMD                 = 0,
+       HEALTH_TYPE_APP_MANAGE  = 1,
+       HEALTH_TYPE_APP_REG             = 2,
+       HEALTH_TYPE_KERNEL              = 3,
+       HEALTH_TYPE_CONSUMER    = 4,
+
+       HEALTH_NUM_TYPE,
+};
+
+struct health_tls_state_list {
+       struct cds_list_head head;
 };
 
 struct health_state {
@@ -49,8 +65,14 @@ struct health_state {
         */
        unsigned long current;          /* progress counter, updated atomically */
        enum health_flags flags;        /* other flags, updated atomically */
+       enum health_type type;          /* Indicates the nature of the thread. */
+       /* Node of the global TLS state list. */
+       struct cds_list_head node;
 };
 
+/* Declare TLS health state. */
+extern DECLARE_URCU_TLS(struct health_state, health_state);
+
 /* Health state counters for the client command thread */
 extern struct health_state health_thread_cmd;
 
@@ -69,8 +91,7 @@ extern struct health_state health_thread_kernel;
  */
 static inline void health_poll_update(struct health_state *state)
 {
-       assert(state);
-       uatomic_add(&state->current, HEALTH_POLL_VALUE);
+       uatomic_add(&URCU_TLS(health_state).current, HEALTH_POLL_VALUE);
 }
 
 /*
@@ -79,17 +100,7 @@ static inline void health_poll_update(struct health_state *state)
  */
 static inline void health_code_update(struct health_state *state)
 {
-       assert(state);
-       uatomic_add(&state->current, HEALTH_CODE_VALUE);
-}
-
-/*
- * Set health "exit" flag.
- */
-static inline void health_exit(struct health_state *state)
-{
-       assert(state);
-       uatomic_or(&state->flags, HEALTH_EXIT);
+       uatomic_add(&URCU_TLS(health_state).current, HEALTH_CODE_VALUE);
 }
 
 /*
@@ -97,23 +108,11 @@ static inline void health_exit(struct health_state *state)
  */
 static inline void health_error(struct health_state *state)
 {
-       assert(state);
-       uatomic_or(&state->flags, HEALTH_ERROR);
-}
-
-/*
- * Init health state.
- */
-static inline void health_init(struct health_state *state)
-{
-       assert(state);
-       state->last = 0;
-       state->last_time.tv_sec = 0;
-       state->last_time.tv_nsec = 0;
-       uatomic_set(&state->current, 0);
-       uatomic_set(&state->flags, 0);
+       uatomic_or(&URCU_TLS(health_state).flags, HEALTH_ERROR);
 }
 
-int health_check_state(struct health_state *state);
+int health_check_state(enum health_type type);
+void health_register(enum health_type type);
+void health_unregister(void);
 
 #endif /* _HEALTH_H */
index a63593d18030c0c1f88c34f4f331c6de2fdf2f47..d999928feb8a0f06bfe24857fc3182f56c4f903f 100644 (file)
@@ -705,6 +705,8 @@ static void *thread_manage_kernel(void *data)
 
        DBG("[thread] Thread manage kernel started");
 
+       health_register(HEALTH_TYPE_KERNEL);
+
        /*
         * This first step of the while is to clean this structure which could free
         * non NULL pointers so zero it before the loop.
@@ -828,7 +830,7 @@ error_testpoint:
                WARN("Kernel thread died unexpectedly. "
                                "Kernel tracing can continue but CPU hotplug is disabled.");
        }
-       health_exit(&health_thread_kernel);
+       health_unregister();
        DBG("Kernel thread dying");
        return NULL;
 }
@@ -869,6 +871,8 @@ static void *thread_manage_consumer(void *data)
 
        DBG("[thread] Manage consumer started");
 
+       health_register(HEALTH_TYPE_CONSUMER);
+
        /*
         * Since the consumer thread can be spawned at any moment in time, we init
         * the health to a poll status (1, which is a valid health over time).
@@ -1102,7 +1106,7 @@ error_poll:
                health_error(&consumer_data->health);
                ERR("Health error occurred in %s", __func__);
        }
-       health_exit(&consumer_data->health);
+       health_unregister();
        DBG("consumer thread cleanup completed");
 
        return NULL;
@@ -1123,6 +1127,8 @@ static void *thread_manage_apps(void *data)
        rcu_register_thread();
        rcu_thread_online();
 
+       health_register(HEALTH_TYPE_APP_MANAGE);
+
        if (testpoint(thread_manage_apps)) {
                goto error_testpoint;
        }
@@ -1294,7 +1300,7 @@ error_testpoint:
                health_error(&health_thread_app_manage);
                ERR("Health error occurred in %s", __func__);
        }
-       health_exit(&health_thread_app_manage);
+       health_unregister();
        DBG("Application communication apps thread cleanup complete");
        rcu_thread_offline();
        rcu_unregister_thread();
@@ -1391,6 +1397,8 @@ static void *thread_registration_apps(void *data)
 
        DBG("[thread] Manage application registration started");
 
+       health_register(HEALTH_TYPE_APP_REG);
+
        if (testpoint(thread_registration_apps)) {
                goto error_testpoint;
        }
@@ -1573,7 +1581,7 @@ error_listen:
 error_create_poll:
 error_testpoint:
        DBG("UST Registration thread cleanup complete");
-       health_exit(&health_thread_app_reg);
+       health_unregister();
 
        return NULL;
 }
@@ -1948,9 +1956,7 @@ static int check_consumer_health(void)
 {
        int ret;
 
-       ret = health_check_state(&kconsumer_data.health) &&
-               health_check_state(&ustconsumer32_data.health) &&
-               health_check_state(&ustconsumer64_data.health);
+       ret = health_check_state(HEALTH_TYPE_CONSUMER);
 
        DBG3("Health consumer check %d", ret);
 
@@ -3068,26 +3074,26 @@ restart:
 
                switch (msg.component) {
                case LTTNG_HEALTH_CMD:
-                       reply.ret_code = health_check_state(&health_thread_cmd);
+                       reply.ret_code = health_check_state(HEALTH_TYPE_CMD);
                        break;
                case LTTNG_HEALTH_APP_MANAGE:
-                       reply.ret_code = health_check_state(&health_thread_app_manage);
+                       reply.ret_code = health_check_state(HEALTH_TYPE_APP_MANAGE);
                        break;
                case LTTNG_HEALTH_APP_REG:
-                       reply.ret_code = health_check_state(&health_thread_app_reg);
+                       reply.ret_code = health_check_state(HEALTH_TYPE_APP_REG);
                        break;
                case LTTNG_HEALTH_KERNEL:
-                       reply.ret_code = health_check_state(&health_thread_kernel);
+                       reply.ret_code = health_check_state(HEALTH_TYPE_KERNEL);
                        break;
                case LTTNG_HEALTH_CONSUMER:
                        reply.ret_code = check_consumer_health();
                        break;
                case LTTNG_HEALTH_ALL:
                        reply.ret_code =
-                               health_check_state(&health_thread_app_manage) &&
-                               health_check_state(&health_thread_app_reg) &&
-                               health_check_state(&health_thread_cmd) &&
-                               health_check_state(&health_thread_kernel) &&
+                               health_check_state(HEALTH_TYPE_APP_MANAGE) &&
+                               health_check_state(HEALTH_TYPE_APP_REG) &&
+                               health_check_state(HEALTH_TYPE_CMD) &&
+                               health_check_state(HEALTH_TYPE_KERNEL) &&
                                check_consumer_health();
                        break;
                default:
@@ -3162,6 +3168,8 @@ static void *thread_manage_clients(void *data)
 
        rcu_register_thread();
 
+       health_register(HEALTH_TYPE_CMD);
+
        if (testpoint(thread_manage_clients)) {
                goto error_testpoint;
        }
@@ -3386,7 +3394,7 @@ error_testpoint:
                ERR("Health error occurred in %s", __func__);
        }
 
-       health_exit(&health_thread_cmd);
+       health_unregister();
 
        DBG("Client thread dying");
 
@@ -4167,26 +4175,6 @@ int main(int argc, char **argv)
 
        cmd_init();
 
-       /* Init all health thread counters. */
-       health_init(&health_thread_cmd);
-       health_init(&health_thread_kernel);
-       health_init(&health_thread_app_manage);
-       health_init(&health_thread_app_reg);
-
-       /*
-        * Init health counters of the consumer thread. We do a quick hack here to
-        * the state of the consumer health is fine even if the thread is not
-        * started. Once the thread starts, the health state is updated with a poll
-        * value to set a health code path. This is simply to ease our life and has
-        * no cost what so ever.
-        */
-       health_init(&kconsumer_data.health);
-       health_poll_update(&kconsumer_data.health);
-       health_init(&ustconsumer32_data.health);
-       health_poll_update(&ustconsumer32_data.health);
-       health_init(&ustconsumer64_data.health);
-       health_poll_update(&ustconsumer64_data.health);
-
        /* Check for the application socket timeout env variable. */
        env_app_timeout = getenv(DEFAULT_APP_SOCKET_TIMEOUT_ENV);
        if (env_app_timeout) {
This page took 0.030703 seconds and 4 git commands to generate.