/*
* Copyright (C) 2012 - David Goulet <dgoulet@efficios.com>
+ * Copyright (C) 2013 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License, version 2 only, as
#include <common/defaults.h>
#include <common/error.h>
+#include <common/macros.h>
+#include <common/sessiond-comm/inet.h>
#include "health.h"
-static const struct timespec time_delta = {
- .tv_sec = DEFAULT_HEALTH_CHECK_DELTA_S,
- .tv_nsec = DEFAULT_HEALTH_CHECK_DELTA_NS,
+/*
+ * Per-application health context: the list of registered thread health
+ * states, the lock protecting it, the check delay and the error flags.
+ *
+ * A thread reporting a health error normally unregisters and quits. This
+ * makes its TLS health state unavailable to the health_check_state() call,
+ * so on unregister the error is copied into the flags array below. That
+ * way the thread type that was in error can still be reported once the
+ * TLS health state has been removed.
+ */
+struct health_app {
+ /* List of health states, one node per registered application thread. */
+ struct cds_list_head list;
+ /*
+ * This lock ensures that TLS memory used for the node and its
+ * container structure doesn't get reclaimed after the TLS owner
+ * thread exits until we have finished using it.
+ */
+ pthread_mutex_t lock;
+ int nr_types; /* Number of thread types; types are asserted to be below it. */
+ struct timespec time_delta; /* Delay handed to time_diff_gt() by validate_state(). */
+ /* Health flags containing thread type error state, indexed by type. */
+ enum health_flags *flags;
};
/* Define TLS health state. */
DEFINE_URCU_TLS(struct health_state, health_state);
-/*
- * It ensures that TLS memory used for the node and its container structure
- * don't get reclaimed after the TLS owner thread exits until we have finished
- * using it.
- */
-static pthread_mutex_t health_mutex = PTHREAD_MUTEX_INITIALIZER;
+/*
+ * Allocate and initialize a health application context.
+ *
+ * nr_types is the number of thread types the application uses. The error
+ * flags array gets one zeroed entry per type since it is indexed by type.
+ *
+ * Return the new context, or NULL if nr_types is negative or an allocation
+ * fails. The context is released with health_app_destroy().
+ */
+struct health_app *health_app_create(int nr_types)
+{
+	struct health_app *ha;
-static struct health_tls_state_list health_state_list = {
- .head = CDS_LIST_HEAD_INIT(health_state_list.head),
-};
+
+	if (nr_types < 0) {
+		return NULL;
+	}
+
+	ha = zmalloc(sizeof(*ha));
+	if (!ha) {
+		return NULL;
+	}
+	/*
+	 * Size the array for every type, not for a single entry. At least
+	 * one entry is requested so that a context with zero types keeps
+	 * getting a valid array.
+	 */
+	ha->flags = zmalloc(sizeof(*ha->flags) *
+			(nr_types > 0 ? nr_types : 1));
+	if (!ha->flags) {
+		goto error_flags;
+	}
+	CDS_INIT_LIST_HEAD(&ha->list);
+	pthread_mutex_init(&ha->lock, NULL);
+	ha->nr_types = nr_types;
+	ha->time_delta.tv_sec = DEFAULT_HEALTH_CHECK_DELTA_S;
+	ha->time_delta.tv_nsec = DEFAULT_HEALTH_CHECK_DELTA_NS;
+	return ha;
+
+error_flags:
+	free(ha);
+	return NULL;
+}
-/*
- * This keeps track of the error state for unregistered thread. A thread
- * reporting a health error, normally unregisters and quits. This makes the TLS
- * health state not available to the health_check_state() call so on unregister
- * we update this global error array so we can keep track of which thread was
- * on error if the TLS health state has been removed.
- */
-static enum health_flags global_error_state[HEALTH_NUM_TYPE];
+/*
+ * Release a health application context and everything it owns.
+ *
+ * Passing NULL is a no-op.
+ * NOTE(review): presumably every thread has unregistered and nobody holds
+ * the lock by the time this runs -- confirm against callers.
+ */
+void health_app_destroy(struct health_app *ha)
+{
+	if (!ha) {
+		return;
+	}
+	/* Pairs with the pthread_mutex_init() done when the context is created. */
+	pthread_mutex_destroy(&ha->lock);
+	free(ha->flags);
+	free(ha);
+}
/*
* Lock health state global list mutex.
+ *
+ * The mutex taken is ha->lock. health_check_state(), health_register() and
+ * health_unregister() hold it around their accesses to ha->list.
*/
-static void state_lock(void)
+static void state_lock(struct health_app *ha)
{
- pthread_mutex_lock(&health_mutex);
+ pthread_mutex_lock(&ha->lock);
}
/*
* Unlock health state global list mutex.
+ *
+ * Releases ha->lock, the mutex acquired by state_lock().
*/
-static void state_unlock(void)
+static void state_unlock(struct health_app *ha)
{
- pthread_mutex_unlock(&health_mutex);
+ pthread_mutex_unlock(&ha->lock);
}
/*
}
/*
- * Health mutex MUST be held across use of the returned struct health_state to
- * provide existence guarantee.
- *
- * Return the health_state object or NULL if not found.
- */
-static struct health_state *find_health_state(enum health_type type)
-{
- struct health_state *state;
-
- /* Find the right health state in the global TLS list. */
- cds_list_for_each_entry(state, &health_state_list.head, node) {
- if (state->type == type) {
- return state;
- }
- }
-
- return NULL;
-}
-
-/*
- * Check health of a specific health type. Note that if a thread has not yet
- * initialize its health subsystem or has quit, it's considered in a good
- * state.
+ * Validate health state. Checks for the error flag or health conditions.
*
* Return 0 if health is bad or else 1.
*/
-int health_check_state(enum health_type type)
+static int validate_state(struct health_app *ha, struct health_state *state)
{
int retval = 1, ret;
unsigned long current, last;
struct timespec current_time;
- struct health_state *state;
-
- assert(type < HEALTH_NUM_TYPE);
- state_lock();
-
- state = find_health_state(type);
- if (!state) {
- /* Check the global state since the state is not visiable anymore. */
- if (global_error_state[type] & HEALTH_ERROR) {
- retval = 0;
- }
- goto not_found;
- }
+ assert(state);
last = state->last;
current = uatomic_read(&state->current);
state->last = current;
memcpy(&state->last_time, &current_time, sizeof(current_time));
} else {
- if (time_diff_gt(&current_time, &state->last_time, &time_delta)) {
+ if (time_diff_gt(&current_time, &state->last_time,
+ &ha->time_delta)) {
if (current == last && !HEALTH_IS_IN_POLL(current)) {
/* error */
retval = 0;
/* update last counter and last sample time */
state->last = current;
memcpy(&state->last_time, &current_time, sizeof(current_time));
+
+ /* On error, stop right now and notify caller. */
+ if (retval == 0) {
+ goto end;
+ }
}
}
end:
DBG("Health state current %lu, last %lu, ret %d",
current, last, ret);
-not_found:
- state_unlock();
-
return retval;
}
/*
- * Init health state.
+ * Check health of a specific health type. Note that if a thread has not yet
+ * initialized its health subsystem or has quit without reporting an error,
+ * it's considered in a good state.
+ *
+ * Every registered thread of the given type is validated under the
+ * application lock; the first bad one ends the walk. If none is bad, the
+ * application error flags are consulted for threads of that type that
+ * unregistered while in error.
+ *
+ * Return 0 if health is bad or else 1.
*/
-void health_register(enum health_type type)
+int health_check_state(struct health_app *ha, int type)
{
+ int retval = 1;
struct health_state *state;
- assert(type < HEALTH_NUM_TYPE);
+ assert(type < ha->nr_types);
+
+ state_lock(ha);
+
+ cds_list_for_each_entry(state, &ha->list, node) {
+ int ret;
+
+ if (state->type != type) {
+ continue;
+ }
+
+ ret = validate_state(ha, state);
+ if (!ret) {
+ retval = 0;
+ goto end;
+ }
+ }
+
+ /* Check the global state since some state might not be visible anymore. */
+ if (ha->flags[type] & HEALTH_ERROR) {
+ retval = 0;
+ }
+
+end:
+ state_unlock(ha);
+
+ DBG("Health check for type %d is %s", (int) type,
+ (retval == 0) ? "BAD" : "GOOD");
+ return retval;
+}
+
+
+/*
+ * Init health state.
+ *
+ * Initializes the calling thread's TLS health state for the given type and
+ * links it into the application list under the application lock. No
+ * duplicate check is made here: several threads may register with the same
+ * type, and health_check_state() validates every list entry of that type.
+ */
+void health_register(struct health_app *ha, int type)
+{
+ assert(type < ha->nr_types);
/* Init TLS state. */
uatomic_set(&URCU_TLS(health_state).last, 0);
uatomic_set(&URCU_TLS(health_state).type, type);
/* Add it to the global TLS state list. */
- state_lock();
- state = find_health_state(type);
- /*
- * Duplicates are not accepted, since lookups don't handle them at the
- * moment.
- */
- assert(!state);
-
- cds_list_add(&URCU_TLS(health_state).node, &health_state_list.head);
- state_unlock();
+ state_lock(ha);
+ cds_list_add(&URCU_TLS(health_state).node, &ha->list);
+ state_unlock(ha);
}
/*
* Remove node from global list.
+ *
+ * The calling thread's TLS health state is unlinked from ha->list under the
+ * application lock. If that state carries HEALTH_ERROR, the error is first
+ * stored in ha->flags at the thread's type, where health_check_state() reads
+ * it back. Nothing visible in this file clears that flag afterwards.
*/
-void health_unregister(void)
+void health_unregister(struct health_app *ha)
{
- state_lock();
+ state_lock(ha);
/*
* On error, set the global_error_state since we are about to remove
* the node from the global list.
*/
if (uatomic_read(&URCU_TLS(health_state).flags) & HEALTH_ERROR) {
- uatomic_set(&global_error_state[URCU_TLS(health_state).type],
+ uatomic_set(&ha->flags[URCU_TLS(health_state).type],
HEALTH_ERROR);
}
cds_list_del(&URCU_TLS(health_state).node);
- state_unlock();
+ state_unlock(ha);
+}
+
+
+/*
+ * Initialize health check subsystem. This should be called before any health
+ * register occurs.
+ *
+ * Raises the seconds part of the application's check delay to the TCP
+ * timeout plus the default health check delta when that sum is larger than
+ * the current value. The nanoseconds part is left untouched.
+ */
+void health_init(struct health_app *ha)
+{
+	/*
+	 * Get the maximum value between the default delta value and the TCP
+	 * timeout with a safety net of the default health check delta.
+	 */
+	ha->time_delta.tv_sec = max_t(unsigned long,
+			lttcomm_inet_tcp_timeout + DEFAULT_HEALTH_CHECK_DELTA_S,
+			ha->time_delta.tv_sec);
+	/* tv_sec is a time_t; cast it so the argument matches the %lu specifier. */
+	DBG("Health check time delta in seconds set to %lu",
+			(unsigned long) ha->time_delta.tv_sec);
}