Port: Remove _GNU_SOURCE, defined in config.h
[lttng-tools.git] / src / common / health / health.c
CommitLineData
44a5e5eb
DG
1/*
2 * Copyright (C) 2012 - David Goulet <dgoulet@efficios.com>
8782cc74 3 * Copyright (C) 2013 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
44a5e5eb
DG
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License, version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 51
16 * Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
17 */
18
6c1c0768 19#define _LGPL_SOURCE
44a5e5eb
DG
20#include <assert.h>
21#include <inttypes.h>
22#include <stdio.h>
23#include <stdlib.h>
8809eec0 24#include <time.h>
44a5e5eb 25
8809eec0 26#include <common/defaults.h>
44a5e5eb 27#include <common/error.h>
67e05644
DG
28#include <common/macros.h>
29#include <common/sessiond-comm/inet.h>
44a5e5eb 30
55d09795 31#include <lttng/health-internal.h>
44a5e5eb 32
8782cc74
MD
33/*
34 * An application-specific error state for unregistered thread keeps
35 * track of thread errors. A thread reporting a health error, normally
36 * unregisters and quits. This makes the TLS health state not available
37 * to the health_check_state() call so on unregister we update this
38 * global error array so we can keep track of which thread was on error
39 * if the TLS health state has been removed.
40 */
41struct health_app {
42 /* List of health state, for each application thread */
43 struct cds_list_head list;
44 /*
45 * This lock ensures that TLS memory used for the node and its
46 * container structure don't get reclaimed after the TLS owner
47 * thread exits until we have finished using it.
48 */
49 pthread_mutex_t lock;
50 int nr_types;
51 struct timespec time_delta;
52 /* Health flags containing thread type error state */
53 enum health_flags *flags;
8809eec0
MD
54};
55
927ca06a
DG
56/*
 * Define TLS health state: the thread-local struct health_state instance
 * that health_register() initializes and links into a health_app list, and
 * that health_unregister() removes from it.
 */
57DEFINE_URCU_TLS(struct health_state, health_state);
58
55d09795
MD
59/*
60 * Initialize health check subsytem.
61 */
62static
63void health_init(struct health_app *ha)
64{
65 /*
66 * Get the maximum value between the default delta value and the TCP
67 * timeout with a safety net of the default health check delta.
68 */
69 ha->time_delta.tv_sec = max_t(unsigned long,
70 lttcomm_inet_tcp_timeout + DEFAULT_HEALTH_CHECK_DELTA_S,
71 ha->time_delta.tv_sec);
72 DBG("Health check time delta in seconds set to %lu",
73 ha->time_delta.tv_sec);
74}
75
8782cc74
MD
76struct health_app *health_app_create(int nr_types)
77{
78 struct health_app *ha;
927ca06a 79
8782cc74
MD
80 ha = zmalloc(sizeof(*ha));
81 if (!ha) {
82 return NULL;
83 }
6c71277b 84 ha->flags = zmalloc(sizeof(*ha->flags) * nr_types);
8782cc74
MD
85 if (!ha->flags) {
86 goto error_flags;
87 }
88 CDS_INIT_LIST_HEAD(&ha->list);
89 pthread_mutex_init(&ha->lock, NULL);
90 ha->nr_types = nr_types;
91 ha->time_delta.tv_sec = DEFAULT_HEALTH_CHECK_DELTA_S;
92 ha->time_delta.tv_nsec = DEFAULT_HEALTH_CHECK_DELTA_NS;
55d09795 93 health_init(ha);
8782cc74
MD
94 return ha;
95
96error_flags:
97 free(ha);
98 return NULL;
99}
927ca06a 100
8782cc74
MD
101void health_app_destroy(struct health_app *ha)
102{
103 free(ha->flags);
104 free(ha);
105}
927ca06a
DG
106
107/*
108 * Lock health state global list mutex.
109 */
8782cc74 110static void state_lock(struct health_app *ha)
927ca06a 111{
8782cc74 112 pthread_mutex_lock(&ha->lock);
927ca06a
DG
113}
114
115/*
116 * Unlock health state global list mutex.
117 */
8782cc74 118static void state_unlock(struct health_app *ha)
927ca06a 119{
8782cc74 120 pthread_mutex_unlock(&ha->lock);
927ca06a
DG
121}
122
8809eec0
MD
/*
 * Store time_a - time_b in res.
 *
 * When the nanoseconds subtraction is negative, one second is borrowed so
 * that res->tv_nsec ends up in the [0, 1000000000) range (given operands
 * that are themselves in that range). res may point to the same object as
 * time_a or time_b.
 */
static void time_diff(const struct timespec *time_a,
		const struct timespec *time_b, struct timespec *res)
{
	/* Compute into locals first so that res can alias an operand. */
	time_t sec = time_a->tv_sec - time_b->tv_sec;
	long nsec = time_a->tv_nsec - time_b->tv_nsec;

	if (nsec < 0) {
		/* Borrow one second from the seconds field. */
		sec -= 1;
		nsec += 1000000000L;
	}

	res->tv_sec = sec;
	res->tv_nsec = nsec;
}
137
/*
 * Tell whether the time elapsed from time_b to time_a exceeds diff.
 *
 * Return 1 if time_a - time_b > diff, else 0.
 */
static int time_diff_gt(const struct timespec *time_a,
		const struct timespec *time_b, const struct timespec *diff)
{
	struct timespec excess;

	/* excess = (time_a - time_b) - diff */
	time_diff(time_a, time_b, &excess);
	time_diff(&excess, diff, &excess);

	/* Strictly positive remainder means the delay was exceeded. */
	return excess.tv_sec > 0 ||
		(excess.tv_sec == 0 && excess.tv_nsec > 0);
}
157
44a5e5eb 158/*
c89add41 159 * Validate health state. Checks for the error flag or health conditions.
44a5e5eb
DG
160 *
161 * Return 0 if health is bad or else 1.
162 */
8782cc74 163static int validate_state(struct health_app *ha, struct health_state *state)
44a5e5eb 164{
8809eec0 165 int retval = 1, ret;
139ac872 166 unsigned long current, last;
8809eec0 167 struct timespec current_time;
927ca06a 168
c89add41 169 assert(state);
44a5e5eb 170
139ac872 171 last = state->last;
44a5e5eb 172 current = uatomic_read(&state->current);
44a5e5eb 173
8809eec0 174 ret = clock_gettime(CLOCK_MONOTONIC, &current_time);
931a97e5 175 if (ret < 0) {
8809eec0 176 PERROR("Error reading time\n");
139ac872 177 /* error */
8809eec0
MD
178 retval = 0;
179 goto end;
44a5e5eb
DG
180 }
181
8809eec0
MD
182 /*
183 * Thread is in bad health if flag HEALTH_ERROR is set. It is also in bad
184 * health if, after the delta delay has passed, its the progress counter
185 * has not moved and it has NOT been waiting for a poll() call.
186 */
187 if (uatomic_read(&state->flags) & HEALTH_ERROR) {
188 retval = 0;
189 goto end;
190 }
44a5e5eb 191
139ac872 192 /*
8809eec0
MD
193 * Initial condition need to update the last counter and sample time, but
194 * should not check health in this initial case, because we don't know how
195 * much time has passed.
139ac872 196 */
8809eec0
MD
197 if (state->last_time.tv_sec == 0 && state->last_time.tv_nsec == 0) {
198 /* update last counter and last sample time */
199 state->last = current;
200 memcpy(&state->last_time, &current_time, sizeof(current_time));
201 } else {
8782cc74
MD
202 if (time_diff_gt(&current_time, &state->last_time,
203 &ha->time_delta)) {
8809eec0
MD
204 if (current == last && !HEALTH_IS_IN_POLL(current)) {
205 /* error */
206 retval = 0;
207 }
208 /* update last counter and last sample time */
209 state->last = current;
210 memcpy(&state->last_time, &current_time, sizeof(current_time));
c89add41
DG
211
212 /* On error, stop right now and notify caller. */
213 if (retval == 0) {
214 goto end;
215 }
8809eec0
MD
216 }
217 }
218
219end:
77c7c900 220 DBG("Health state current %lu, last %lu, ret %d",
8809eec0 221 current, last, ret);
c89add41
DG
222 return retval;
223}
224
225/*
226 * Check health of a specific health type. Note that if a thread has not yet
227 * initialize its health subsystem or has quit, it's considered in a good
228 * state.
229 *
230 * Return 0 if health is bad or else 1.
231 */
8782cc74 232int health_check_state(struct health_app *ha, int type)
c89add41
DG
233{
234 int retval = 1;
235 struct health_state *state;
236
8782cc74 237 assert(type < ha->nr_types);
c89add41 238
8782cc74 239 state_lock(ha);
c89add41 240
8782cc74 241 cds_list_for_each_entry(state, &ha->list, node) {
c89add41
DG
242 int ret;
243
244 if (state->type != type) {
245 continue;
246 }
247
8782cc74 248 ret = validate_state(ha, state);
c89add41
DG
249 if (!ret) {
250 retval = 0;
251 goto end;
252 }
253 }
254
255 /* Check the global state since some state might not be visible anymore. */
8782cc74 256 if (ha->flags[type] & HEALTH_ERROR) {
c89add41
DG
257 retval = 0;
258 }
259
260end:
8782cc74 261 state_unlock(ha);
139ac872 262
c89add41
DG
263 DBG("Health check for type %d is %s", (int) type,
264 (retval == 0) ? "BAD" : "GOOD");
8809eec0 265 return retval;
44a5e5eb 266}
927ca06a
DG
267
268/*
269 * Init health state.
270 */
8782cc74 271void health_register(struct health_app *ha, int type)
927ca06a 272{
8782cc74 273 assert(type < ha->nr_types);
927ca06a
DG
274
275 /* Init TLS state. */
276 uatomic_set(&URCU_TLS(health_state).last, 0);
277 uatomic_set(&URCU_TLS(health_state).last_time.tv_sec, 0);
278 uatomic_set(&URCU_TLS(health_state).last_time.tv_nsec, 0);
279 uatomic_set(&URCU_TLS(health_state).current, 0);
280 uatomic_set(&URCU_TLS(health_state).flags, 0);
281 uatomic_set(&URCU_TLS(health_state).type, type);
282
283 /* Add it to the global TLS state list. */
8782cc74
MD
284 state_lock(ha);
285 cds_list_add(&URCU_TLS(health_state).node, &ha->list);
286 state_unlock(ha);
927ca06a
DG
287}
288
289/*
290 * Remove node from global list.
291 */
8782cc74 292void health_unregister(struct health_app *ha)
927ca06a 293{
8782cc74 294 state_lock(ha);
927ca06a
DG
295 /*
296 * On error, set the global_error_state since we are about to remove
297 * the node from the global list.
298 */
299 if (uatomic_read(&URCU_TLS(health_state).flags) & HEALTH_ERROR) {
8782cc74 300 uatomic_set(&ha->flags[URCU_TLS(health_state).type],
927ca06a
DG
301 HEALTH_ERROR);
302 }
303 cds_list_del(&URCU_TLS(health_state).node);
8782cc74 304 state_unlock(ha);
927ca06a 305}
This page took 0.047709 seconds and 4 git commands to generate.