X-Git-Url: https://git.lttng.org/?p=lttng-tools.git;a=blobdiff_plain;f=src%2Fbin%2Flttng-sessiond%2Fhealth.c;h=b36ddc4a337b6bc6c5aa389d45c004a0c3be9580;hp=6c4de9430793fa1d8508c06dab17e88af0b6bcae;hb=927ca06aed61ff6dd3f64ae71854f2d7f9acebe5;hpb=8809eec0bb55b03862cb1eb128eb39d50104c258 diff --git a/src/bin/lttng-sessiond/health.c b/src/bin/lttng-sessiond/health.c index 6c4de9430..b36ddc4a3 100644 --- a/src/bin/lttng-sessiond/health.c +++ b/src/bin/lttng-sessiond/health.c @@ -32,6 +32,45 @@ static const struct timespec time_delta = { .tv_nsec = DEFAULT_HEALTH_CHECK_DELTA_NS, }; +/* Define TLS health state. */ +DEFINE_URCU_TLS(struct health_state, health_state); + +/* + * It ensures that TLS memory used for the node and its container structure + * don't get reclaimed after the TLS owner thread exits until we have finished + * using it. + */ +static pthread_mutex_t health_mutex = PTHREAD_MUTEX_INITIALIZER; + +static struct health_tls_state_list health_state_list = { + .head = CDS_LIST_HEAD_INIT(health_state_list.head), +}; + +/* + * This keeps track of the error state for unregistered thread. A thread + * reporting a health error, normally unregisters and quits. This makes the TLS + * health state not available to the health_check_state() call so on unregister + * we update this global error array so we can keep track of which thread was + * on error if the TLS health state has been removed. + */ +static enum health_flags global_error_state[HEALTH_NUM_TYPE]; + +/* + * Lock health state global list mutex. + */ +static void state_lock(void) +{ + pthread_mutex_lock(&health_mutex); +} + +/* + * Unlock health state global list mutex. + */ +static void state_unlock(void) +{ + pthread_mutex_unlock(&health_mutex); +} + /* * Set time difference in res from time_a and time_b. 
*/ @@ -43,7 +82,7 @@ static void time_diff(const struct timespec *time_a, res->tv_nsec = 1000000000L + time_a->tv_sec - time_b->tv_sec; } else { res->tv_sec = time_a->tv_sec - time_b->tv_sec; - res->tv_nsec = time_a->tv_sec - time_b->tv_sec; + res->tv_nsec = time_a->tv_nsec - time_b->tv_nsec; } } @@ -68,23 +107,57 @@ static int time_diff_gt(const struct timespec *time_a, } /* - * Check health of a specific health state counter. + * Health mutex MUST be held across use of the returned struct health_state to + * provide existence guarantee. + * + * Return the health_state object or NULL if not found. + */ +static struct health_state *find_health_state(enum health_type type) +{ + struct health_state *state; + + /* Find the right health state in the global TLS list. */ + cds_list_for_each_entry(state, &health_state_list.head, node) { + if (state->type == type) { + return state; + } + } + + return NULL; +} + +/* + * Check health of a specific health type. Note that if a thread has not yet + * initialized its health subsystem or has quit, it's considered in a good + * state. * * Return 0 if health is bad or else 1. */ -int health_check_state(struct health_state *state) +int health_check_state(enum health_type type) { int retval = 1, ret; unsigned long current, last; struct timespec current_time; + struct health_state *state; + + assert(type < HEALTH_NUM_TYPE); - assert(state); + state_lock(); + + state = find_health_state(type); + if (!state) { + /* Check the global state since the state is not visible anymore. 
*/ + if (global_error_state[type] & HEALTH_ERROR) { + retval = 0; + } + goto not_found; + } last = state->last; current = uatomic_read(&state->current); ret = clock_gettime(CLOCK_MONOTONIC, &current_time); - if (ret) { + if (ret < 0) { PERROR("Error reading time\n"); /* error */ retval = 0; @@ -123,8 +196,58 @@ int health_check_state(struct health_state *state) } end: - DBG("Health state current %" PRIu64 ", last %" PRIu64 ", ret %d", + DBG("Health state current %lu, last %lu, ret %d", current, last, ret); +not_found: + state_unlock(); return retval; } + +/* + * Init health state. + */ +void health_register(enum health_type type) +{ + struct health_state *state; + + assert(type < HEALTH_NUM_TYPE); + + /* Init TLS state. */ + uatomic_set(&URCU_TLS(health_state).last, 0); + uatomic_set(&URCU_TLS(health_state).last_time.tv_sec, 0); + uatomic_set(&URCU_TLS(health_state).last_time.tv_nsec, 0); + uatomic_set(&URCU_TLS(health_state).current, 0); + uatomic_set(&URCU_TLS(health_state).flags, 0); + uatomic_set(&URCU_TLS(health_state).type, type); + + /* Add it to the global TLS state list. */ + state_lock(); + state = find_health_state(type); + /* + * Duplicates are not accepted, since lookups don't handle them at the + * moment. + */ + assert(!state); + + cds_list_add(&URCU_TLS(health_state).node, &health_state_list.head); + state_unlock(); +} + +/* + * Remove node from global list. + */ +void health_unregister(void) +{ + state_lock(); + /* + * On error, set the global_error_state since we are about to remove + * the node from the global list. + */ + if (uatomic_read(&URCU_TLS(health_state).flags) & HEALTH_ERROR) { + uatomic_set(&global_error_state[URCU_TLS(health_state).type], + HEALTH_ERROR); + } + cds_list_del(&URCU_TLS(health_state).node); + state_unlock(); +}