3eea3adc0c34f5b6438736f9462b74ea6606bba6
[lttng-tools.git] / src / common / health / health.c
1 /*
2 * Copyright (C) 2012 - David Goulet <dgoulet@efficios.com>
3 * Copyright (C) 2013 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License, version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 51
16 * Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
17 */
18
19 #define _GNU_SOURCE
20 #define _LGPL_SOURCE
21 #include <assert.h>
22 #include <inttypes.h>
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <time.h>
26
27 #include <common/defaults.h>
28 #include <common/error.h>
29 #include <common/macros.h>
30 #include <common/sessiond-comm/inet.h>
31
32 #include <lttng/health-internal.h>
33
34 /*
35 * An application-specific error state for unregistered thread keeps
36 * track of thread errors. A thread reporting a health error, normally
37 * unregisters and quits. This makes the TLS health state not available
38 * to the health_check_state() call so on unregister we update this
39 * global error array so we can keep track of which thread was on error
40 * if the TLS health state has been removed.
41 */
42 struct health_app {
43 /* List of health state, for each application thread */
44 struct cds_list_head list;
45 /*
46 * This lock ensures that TLS memory used for the node and its
47 * container structure don't get reclaimed after the TLS owner
48 * thread exits until we have finished using it.
49 */
50 pthread_mutex_t lock;
51 int nr_types;
52 struct timespec time_delta;
53 /* Health flags containing thread type error state */
54 enum health_flags *flags;
55 };
56
57 /* Define TLS health state. */
58 DEFINE_URCU_TLS(struct health_state, health_state);
59
60 /*
61 * Initialize health check subsytem.
62 */
63 static
64 void health_init(struct health_app *ha)
65 {
66 /*
67 * Get the maximum value between the default delta value and the TCP
68 * timeout with a safety net of the default health check delta.
69 */
70 ha->time_delta.tv_sec = max_t(unsigned long,
71 lttcomm_inet_tcp_timeout + DEFAULT_HEALTH_CHECK_DELTA_S,
72 ha->time_delta.tv_sec);
73 DBG("Health check time delta in seconds set to %lu",
74 ha->time_delta.tv_sec);
75 }
76
77 struct health_app *health_app_create(int nr_types)
78 {
79 struct health_app *ha;
80
81 ha = zmalloc(sizeof(*ha));
82 if (!ha) {
83 return NULL;
84 }
85 ha->flags = zmalloc(sizeof(*ha->flags) * nr_types);
86 if (!ha->flags) {
87 goto error_flags;
88 }
89 CDS_INIT_LIST_HEAD(&ha->list);
90 pthread_mutex_init(&ha->lock, NULL);
91 ha->nr_types = nr_types;
92 ha->time_delta.tv_sec = DEFAULT_HEALTH_CHECK_DELTA_S;
93 ha->time_delta.tv_nsec = DEFAULT_HEALTH_CHECK_DELTA_NS;
94 health_init(ha);
95 return ha;
96
97 error_flags:
98 free(ha);
99 return NULL;
100 }
101
102 void health_app_destroy(struct health_app *ha)
103 {
104 free(ha->flags);
105 free(ha);
106 }
107
108 /*
109 * Lock health state global list mutex.
110 */
111 static void state_lock(struct health_app *ha)
112 {
113 pthread_mutex_lock(&ha->lock);
114 }
115
116 /*
117 * Unlock health state global list mutex.
118 */
119 static void state_unlock(struct health_app *ha)
120 {
121 pthread_mutex_unlock(&ha->lock);
122 }
123
/*
 * Store time_a - time_b in res.
 *
 * When the nanosecond subtraction is negative, one second is borrowed so
 * that res->tv_nsec always ends up in [0, 1000000000) for normalized
 * inputs; a negative difference is therefore reported through a negative
 * res->tv_sec.
 *
 * The difference is computed into locals before res is written, so res
 * may point to the same object as time_a or time_b.
 */
static void time_diff(const struct timespec *time_a,
		const struct timespec *time_b, struct timespec *res)
{
	time_t sec = time_a->tv_sec - time_b->tv_sec;
	long nsec = time_a->tv_nsec - time_b->tv_nsec;

	if (nsec < 0) {
		/* Borrow one second to bring the nanoseconds back in range. */
		sec -= 1;
		nsec += 1000000000L;
	}

	res->tv_sec = sec;
	res->tv_nsec = nsec;
}
138
/*
 * Tell whether the time elapsed between time_b and time_a exceeds diff.
 *
 * Return 1 if (time_a - time_b) - diff, as computed by time_diff(), is
 * strictly positive, else 0.
 */
static int time_diff_gt(const struct timespec *time_a,
		const struct timespec *time_b, const struct timespec *diff)
{
	struct timespec excess;

	/* excess = (time_a - time_b) - diff, with the second step in place. */
	time_diff(time_a, time_b, &excess);
	time_diff(&excess, diff, &excess);

	return excess.tv_sec > 0 ||
		(excess.tv_sec == 0 && excess.tv_nsec > 0);
}
158
159 /*
160 * Validate health state. Checks for the error flag or health conditions.
161 *
162 * Return 0 if health is bad or else 1.
163 */
164 static int validate_state(struct health_app *ha, struct health_state *state)
165 {
166 int retval = 1, ret;
167 unsigned long current, last;
168 struct timespec current_time;
169
170 assert(state);
171
172 last = state->last;
173 current = uatomic_read(&state->current);
174
175 ret = clock_gettime(CLOCK_MONOTONIC, &current_time);
176 if (ret < 0) {
177 PERROR("Error reading time\n");
178 /* error */
179 retval = 0;
180 goto end;
181 }
182
183 /*
184 * Thread is in bad health if flag HEALTH_ERROR is set. It is also in bad
185 * health if, after the delta delay has passed, its the progress counter
186 * has not moved and it has NOT been waiting for a poll() call.
187 */
188 if (uatomic_read(&state->flags) & HEALTH_ERROR) {
189 retval = 0;
190 goto end;
191 }
192
193 /*
194 * Initial condition need to update the last counter and sample time, but
195 * should not check health in this initial case, because we don't know how
196 * much time has passed.
197 */
198 if (state->last_time.tv_sec == 0 && state->last_time.tv_nsec == 0) {
199 /* update last counter and last sample time */
200 state->last = current;
201 memcpy(&state->last_time, &current_time, sizeof(current_time));
202 } else {
203 if (time_diff_gt(&current_time, &state->last_time,
204 &ha->time_delta)) {
205 if (current == last && !HEALTH_IS_IN_POLL(current)) {
206 /* error */
207 retval = 0;
208 }
209 /* update last counter and last sample time */
210 state->last = current;
211 memcpy(&state->last_time, &current_time, sizeof(current_time));
212
213 /* On error, stop right now and notify caller. */
214 if (retval == 0) {
215 goto end;
216 }
217 }
218 }
219
220 end:
221 DBG("Health state current %lu, last %lu, ret %d",
222 current, last, ret);
223 return retval;
224 }
225
226 /*
227 * Check health of a specific health type. Note that if a thread has not yet
228 * initialize its health subsystem or has quit, it's considered in a good
229 * state.
230 *
231 * Return 0 if health is bad or else 1.
232 */
233 int health_check_state(struct health_app *ha, int type)
234 {
235 int retval = 1;
236 struct health_state *state;
237
238 assert(type < ha->nr_types);
239
240 state_lock(ha);
241
242 cds_list_for_each_entry(state, &ha->list, node) {
243 int ret;
244
245 if (state->type != type) {
246 continue;
247 }
248
249 ret = validate_state(ha, state);
250 if (!ret) {
251 retval = 0;
252 goto end;
253 }
254 }
255
256 /* Check the global state since some state might not be visible anymore. */
257 if (ha->flags[type] & HEALTH_ERROR) {
258 retval = 0;
259 }
260
261 end:
262 state_unlock(ha);
263
264 DBG("Health check for type %d is %s", (int) type,
265 (retval == 0) ? "BAD" : "GOOD");
266 return retval;
267 }
268
269 /*
270 * Init health state.
271 */
272 void health_register(struct health_app *ha, int type)
273 {
274 assert(type < ha->nr_types);
275
276 /* Init TLS state. */
277 uatomic_set(&URCU_TLS(health_state).last, 0);
278 uatomic_set(&URCU_TLS(health_state).last_time.tv_sec, 0);
279 uatomic_set(&URCU_TLS(health_state).last_time.tv_nsec, 0);
280 uatomic_set(&URCU_TLS(health_state).current, 0);
281 uatomic_set(&URCU_TLS(health_state).flags, 0);
282 uatomic_set(&URCU_TLS(health_state).type, type);
283
284 /* Add it to the global TLS state list. */
285 state_lock(ha);
286 cds_list_add(&URCU_TLS(health_state).node, &ha->list);
287 state_unlock(ha);
288 }
289
290 /*
291 * Remove node from global list.
292 */
293 void health_unregister(struct health_app *ha)
294 {
295 state_lock(ha);
296 /*
297 * On error, set the global_error_state since we are about to remove
298 * the node from the global list.
299 */
300 if (uatomic_read(&URCU_TLS(health_state).flags) & HEALTH_ERROR) {
301 uatomic_set(&ha->flags[URCU_TLS(health_state).type],
302 HEALTH_ERROR);
303 }
304 cds_list_del(&URCU_TLS(health_state).node);
305 state_unlock(ha);
306 }
This page took 0.0342 seconds and 3 git commands to generate.