Force usage of assert() condition when NDEBUG is defined
[lttng-tools.git] / src / common / health / health.c
CommitLineData
44a5e5eb 1/*
ab5be9fa
MJ
2 * Copyright (C) 2012 David Goulet <dgoulet@efficios.com>
3 * Copyright (C) 2013 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
44a5e5eb 4 *
ab5be9fa 5 * SPDX-License-Identifier: GPL-2.0-only
44a5e5eb 6 *
44a5e5eb
DG
7 */
8
6c1c0768 9#define _LGPL_SOURCE
44a5e5eb
DG
10#include <inttypes.h>
11#include <stdio.h>
12#include <stdlib.h>
8809eec0 13#include <time.h>
44a5e5eb 14
8809eec0 15#include <common/defaults.h>
44a5e5eb 16#include <common/error.h>
67e05644
DG
17#include <common/macros.h>
18#include <common/sessiond-comm/inet.h>
44a5e5eb 19
55d09795 20#include <lttng/health-internal.h>
44a5e5eb 21
8782cc74
MD
/*
 * Per-application health bookkeeping.
 *
 * An application-specific error state for unregistered threads keeps
 * track of thread errors. A thread reporting a health error normally
 * unregisters and quits. This makes its TLS health state unavailable
 * to the health_check_state() call, so on unregister we update the
 * per-type error flags below so we can keep track of which thread type
 * was on error once the TLS health state has been removed.
 */
struct health_app {
	/* List of health state, for each application thread */
	struct cds_list_head list;
	/*
	 * This lock ensures that TLS memory used for the node and its
	 * container structure don't get reclaimed after the TLS owner
	 * thread exits until we have finished using it.
	 */
	pthread_mutex_t lock;
	/* Number of thread types; also the number of entries in flags. */
	int nr_types;
	/*
	 * Delay that must elapse between two samples before an unchanged
	 * progress counter is reported as bad health.
	 */
	struct timespec time_delta;
	/* Health flags containing thread type error state */
	enum health_flags *flags;
};
44
927ca06a
DG
/*
 * Define TLS health state. Code in this file accesses it through
 * URCU_TLS(health_state).
 */
DEFINE_URCU_TLS(struct health_state, health_state);
47
55d09795
MD
48/*
49 * Initialize health check subsytem.
50 */
51static
52void health_init(struct health_app *ha)
53{
54 /*
55 * Get the maximum value between the default delta value and the TCP
56 * timeout with a safety net of the default health check delta.
57 */
58 ha->time_delta.tv_sec = max_t(unsigned long,
59 lttcomm_inet_tcp_timeout + DEFAULT_HEALTH_CHECK_DELTA_S,
60 ha->time_delta.tv_sec);
61 DBG("Health check time delta in seconds set to %lu",
62 ha->time_delta.tv_sec);
63}
64
8782cc74
MD
65struct health_app *health_app_create(int nr_types)
66{
67 struct health_app *ha;
927ca06a 68
8782cc74
MD
69 ha = zmalloc(sizeof(*ha));
70 if (!ha) {
71 return NULL;
72 }
6c71277b 73 ha->flags = zmalloc(sizeof(*ha->flags) * nr_types);
8782cc74
MD
74 if (!ha->flags) {
75 goto error_flags;
76 }
77 CDS_INIT_LIST_HEAD(&ha->list);
78 pthread_mutex_init(&ha->lock, NULL);
79 ha->nr_types = nr_types;
80 ha->time_delta.tv_sec = DEFAULT_HEALTH_CHECK_DELTA_S;
81 ha->time_delta.tv_nsec = DEFAULT_HEALTH_CHECK_DELTA_NS;
55d09795 82 health_init(ha);
8782cc74
MD
83 return ha;
84
85error_flags:
86 free(ha);
87 return NULL;
88}
927ca06a 89
8782cc74
MD
90void health_app_destroy(struct health_app *ha)
91{
92 free(ha->flags);
93 free(ha);
94}
927ca06a
DG
95
96/*
97 * Lock health state global list mutex.
98 */
8782cc74 99static void state_lock(struct health_app *ha)
927ca06a 100{
8782cc74 101 pthread_mutex_lock(&ha->lock);
927ca06a
DG
102}
103
104/*
105 * Unlock health state global list mutex.
106 */
8782cc74 107static void state_unlock(struct health_app *ha)
927ca06a 108{
8782cc74 109 pthread_mutex_unlock(&ha->lock);
927ca06a
DG
110}
111
8809eec0
MD
/*
 * Set res to the time difference time_a - time_b.
 *
 * Both inputs are expected to be normalized (0 <= tv_nsec < 1000000000);
 * the result is then normalized as well, with a negative tv_sec when
 * time_a is earlier than time_b.
 *
 * res may alias time_a or time_b: both differences are computed before
 * anything is written to res.
 */
static void time_diff(const struct timespec *time_a,
		const struct timespec *time_b, struct timespec *res)
{
	const time_t sec = time_a->tv_sec - time_b->tv_sec;
	const long nsec = time_a->tv_nsec - time_b->tv_nsec;

	if (nsec < 0) {
		/* Borrow one second so the nanosecond field stays non-negative. */
		res->tv_sec = sec - 1;
		res->tv_nsec = 1000000000L + nsec;
	} else {
		res->tv_sec = sec;
		res->tv_nsec = nsec;
	}
}
126
/*
 * Tell whether the elapsed time (time_a - time_b) is strictly greater
 * than diff.
 *
 * Return 1 if it is, 0 otherwise.
 */
static int time_diff_gt(const struct timespec *time_a,
		const struct timespec *time_b, const struct timespec *diff)
{
	struct timespec remainder;

	/* remainder = (time_a - time_b) - diff, computed in two steps. */
	time_diff(time_a, time_b, &remainder);
	time_diff(&remainder, diff, &remainder);

	/* A strictly positive remainder means the elapsed time exceeds diff. */
	return remainder.tv_sec > 0 ||
			(remainder.tv_sec == 0 && remainder.tv_nsec > 0);
}
146
44a5e5eb 147/*
c89add41 148 * Validate health state. Checks for the error flag or health conditions.
44a5e5eb
DG
149 *
150 * Return 0 if health is bad or else 1.
151 */
8782cc74 152static int validate_state(struct health_app *ha, struct health_state *state)
44a5e5eb 153{
8809eec0 154 int retval = 1, ret;
139ac872 155 unsigned long current, last;
8809eec0 156 struct timespec current_time;
927ca06a 157
a0377dfe 158 LTTNG_ASSERT(state);
44a5e5eb 159
139ac872 160 last = state->last;
44a5e5eb 161 current = uatomic_read(&state->current);
44a5e5eb 162
389fbf04 163 ret = lttng_clock_gettime(CLOCK_MONOTONIC, &current_time);
931a97e5 164 if (ret < 0) {
8809eec0 165 PERROR("Error reading time\n");
139ac872 166 /* error */
8809eec0
MD
167 retval = 0;
168 goto end;
44a5e5eb
DG
169 }
170
8809eec0
MD
171 /*
172 * Thread is in bad health if flag HEALTH_ERROR is set. It is also in bad
173 * health if, after the delta delay has passed, its the progress counter
174 * has not moved and it has NOT been waiting for a poll() call.
175 */
176 if (uatomic_read(&state->flags) & HEALTH_ERROR) {
177 retval = 0;
178 goto end;
179 }
44a5e5eb 180
139ac872 181 /*
8809eec0
MD
182 * Initial condition need to update the last counter and sample time, but
183 * should not check health in this initial case, because we don't know how
184 * much time has passed.
139ac872 185 */
8809eec0
MD
186 if (state->last_time.tv_sec == 0 && state->last_time.tv_nsec == 0) {
187 /* update last counter and last sample time */
188 state->last = current;
189 memcpy(&state->last_time, &current_time, sizeof(current_time));
190 } else {
8782cc74
MD
191 if (time_diff_gt(&current_time, &state->last_time,
192 &ha->time_delta)) {
8809eec0
MD
193 if (current == last && !HEALTH_IS_IN_POLL(current)) {
194 /* error */
195 retval = 0;
196 }
197 /* update last counter and last sample time */
198 state->last = current;
199 memcpy(&state->last_time, &current_time, sizeof(current_time));
c89add41
DG
200
201 /* On error, stop right now and notify caller. */
202 if (retval == 0) {
203 goto end;
204 }
8809eec0
MD
205 }
206 }
207
208end:
77c7c900 209 DBG("Health state current %lu, last %lu, ret %d",
8809eec0 210 current, last, ret);
c89add41
DG
211 return retval;
212}
213
214/*
215 * Check health of a specific health type. Note that if a thread has not yet
216 * initialize its health subsystem or has quit, it's considered in a good
217 * state.
218 *
219 * Return 0 if health is bad or else 1.
220 */
8782cc74 221int health_check_state(struct health_app *ha, int type)
c89add41
DG
222{
223 int retval = 1;
224 struct health_state *state;
225
a0377dfe 226 LTTNG_ASSERT(type < ha->nr_types);
c89add41 227
8782cc74 228 state_lock(ha);
c89add41 229
8782cc74 230 cds_list_for_each_entry(state, &ha->list, node) {
c89add41
DG
231 int ret;
232
233 if (state->type != type) {
234 continue;
235 }
236
8782cc74 237 ret = validate_state(ha, state);
c89add41
DG
238 if (!ret) {
239 retval = 0;
240 goto end;
241 }
242 }
243
244 /* Check the global state since some state might not be visible anymore. */
8782cc74 245 if (ha->flags[type] & HEALTH_ERROR) {
c89add41
DG
246 retval = 0;
247 }
248
249end:
8782cc74 250 state_unlock(ha);
139ac872 251
c89add41
DG
252 DBG("Health check for type %d is %s", (int) type,
253 (retval == 0) ? "BAD" : "GOOD");
8809eec0 254 return retval;
44a5e5eb 255}
927ca06a
DG
256
257/*
258 * Init health state.
259 */
8782cc74 260void health_register(struct health_app *ha, int type)
927ca06a 261{
a0377dfe 262 LTTNG_ASSERT(type < ha->nr_types);
927ca06a
DG
263
264 /* Init TLS state. */
265 uatomic_set(&URCU_TLS(health_state).last, 0);
266 uatomic_set(&URCU_TLS(health_state).last_time.tv_sec, 0);
267 uatomic_set(&URCU_TLS(health_state).last_time.tv_nsec, 0);
268 uatomic_set(&URCU_TLS(health_state).current, 0);
269 uatomic_set(&URCU_TLS(health_state).flags, 0);
270 uatomic_set(&URCU_TLS(health_state).type, type);
271
272 /* Add it to the global TLS state list. */
8782cc74
MD
273 state_lock(ha);
274 cds_list_add(&URCU_TLS(health_state).node, &ha->list);
275 state_unlock(ha);
927ca06a
DG
276}
277
278/*
279 * Remove node from global list.
280 */
8782cc74 281void health_unregister(struct health_app *ha)
927ca06a 282{
8782cc74 283 state_lock(ha);
927ca06a
DG
284 /*
285 * On error, set the global_error_state since we are about to remove
286 * the node from the global list.
287 */
288 if (uatomic_read(&URCU_TLS(health_state).flags) & HEALTH_ERROR) {
8782cc74 289 uatomic_set(&ha->flags[URCU_TLS(health_state).type],
927ca06a
DG
290 HEALTH_ERROR);
291 }
292 cds_list_del(&URCU_TLS(health_state).node);
8782cc74 293 state_unlock(ha);
927ca06a 294}
This page took 0.066453 seconds and 4 git commands to generate.