2 * Copyright (C) 2012 - David Goulet <dgoulet@efficios.com>
3 * Copyright (C) 2013 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License, version 2 only, as
7 * published by the Free Software Foundation.
9 * This program is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 51
16 * Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 #include <common/defaults.h>
27 #include <common/error.h>
28 #include <common/macros.h>
29 #include <common/sessiond-comm/inet.h>
31 #include <lttng/health-internal.h>
34 * An application-specific error state for unregistered threads keeps
35 * track of thread errors. A thread reporting a health error normally
36 * unregisters and quits. This makes the TLS health state not available
37 * to the health_check_state() call so on unregister we update this
38 * global error array so we can keep track of which thread was on error
39 * if the TLS health state has been removed.
42 /* List of health state, for each application thread */
43 struct cds_list_head list
;
45 * This lock ensures that TLS memory used for the node and its
46 * container structure don't get reclaimed after the TLS owner
47 * thread exits until we have finished using it.
51 struct timespec time_delta
;
52 /* Health flags containing thread type error state */
53 enum health_flags
*flags
;
56 /* Define TLS health state. */
57 DEFINE_URCU_TLS(struct health_state
, health_state
);
60 * Initialize health check subsytem.
63 void health_init(struct health_app
*ha
)
66 * Get the maximum value between the default delta value and the TCP
67 * timeout with a safety net of the default health check delta.
69 ha
->time_delta
.tv_sec
= max_t(unsigned long,
70 lttcomm_inet_tcp_timeout
+ DEFAULT_HEALTH_CHECK_DELTA_S
,
71 ha
->time_delta
.tv_sec
);
72 DBG("Health check time delta in seconds set to %lu",
73 ha
->time_delta
.tv_sec
);
76 struct health_app
*health_app_create(int nr_types
)
78 struct health_app
*ha
;
80 ha
= zmalloc(sizeof(*ha
));
84 ha
->flags
= zmalloc(sizeof(*ha
->flags
) * nr_types
);
88 CDS_INIT_LIST_HEAD(&ha
->list
);
89 pthread_mutex_init(&ha
->lock
, NULL
);
90 ha
->nr_types
= nr_types
;
91 ha
->time_delta
.tv_sec
= DEFAULT_HEALTH_CHECK_DELTA_S
;
92 ha
->time_delta
.tv_nsec
= DEFAULT_HEALTH_CHECK_DELTA_NS
;
101 void health_app_destroy(struct health_app
*ha
)
108 * Lock health state global list mutex.
110 static void state_lock(struct health_app
*ha
)
112 pthread_mutex_lock(&ha
->lock
);
116 * Unlock health state global list mutex.
118 static void state_unlock(struct health_app
*ha
)
120 pthread_mutex_unlock(&ha
->lock
);
/*
 * Set time difference in res from time_a and time_b.
 *
 * Stores time_a - time_b in res. When the nanosecond subtraction goes
 * negative, one second is borrowed so that res->tv_nsec stays within
 * [0, 1000000000).
 *
 * res may point to the same object as time_a or time_b (time_diff_gt()
 * passes its intermediate result as both input and output), so the result
 * is computed into locals before anything is written through res.
 */
static void time_diff(const struct timespec *time_a,
		const struct timespec *time_b, struct timespec *res)
{
	time_t sec = time_a->tv_sec - time_b->tv_sec;
	long nsec = time_a->tv_nsec - time_b->tv_nsec;

	if (nsec < 0) {
		/* Borrow one second to bring the nanoseconds back in range. */
		sec -= 1;
		nsec += 1000000000L;
	}

	res->tv_sec = sec;
	res->tv_nsec = nsec;
}
/*
 * Return true if time_a - time_b > diff, else false.
 *
 * The check is done by computing (time_a - time_b) - diff with
 * time_diff() and testing whether the remainder is strictly positive.
 */
static int time_diff_gt(const struct timespec *time_a,
		const struct timespec *time_b, const struct timespec *diff)
{
	struct timespec remainder;

	/* remainder = time_a - time_b */
	time_diff(time_a, time_b, &remainder);
	/* remainder = remainder - diff (input and output are the same object) */
	time_diff(&remainder, diff, &remainder);

	return remainder.tv_sec > 0 ||
		(remainder.tv_sec == 0 && remainder.tv_nsec > 0);
}
159 * Validate health state. Checks for the error flag or health conditions.
161 * Return 0 if health is bad or else 1.
163 static int validate_state(struct health_app
*ha
, struct health_state
*state
)
166 unsigned long current
, last
;
167 struct timespec current_time
;
172 current
= uatomic_read(&state
->current
);
174 ret
= clock_gettime(CLOCK_MONOTONIC
, ¤t_time
);
176 PERROR("Error reading time\n");
183 * Thread is in bad health if flag HEALTH_ERROR is set. It is also in bad
184 * health if, after the delta delay has passed, its progress counter
185 * has not moved and it has NOT been waiting for a poll() call.
187 if (uatomic_read(&state
->flags
) & HEALTH_ERROR
) {
193 * Initial condition needs to update the last counter and sample time, but
194 * should not check health in this initial case, because we don't know how
195 * much time has passed.
197 if (state
->last_time
.tv_sec
== 0 && state
->last_time
.tv_nsec
== 0) {
198 /* update last counter and last sample time */
199 state
->last
= current
;
200 memcpy(&state
->last_time
, ¤t_time
, sizeof(current_time
));
202 if (time_diff_gt(¤t_time
, &state
->last_time
,
204 if (current
== last
&& !HEALTH_IS_IN_POLL(current
)) {
208 /* update last counter and last sample time */
209 state
->last
= current
;
210 memcpy(&state
->last_time
, ¤t_time
, sizeof(current_time
));
212 /* On error, stop right now and notify caller. */
220 DBG("Health state current %lu, last %lu, ret %d",
226 * Check health of a specific health type. Note that if a thread has not yet
227 * initialized its health subsystem or has quit, it's considered in a good
230 * Return 0 if health is bad or else 1.
232 int health_check_state(struct health_app
*ha
, int type
)
235 struct health_state
*state
;
237 assert(type
< ha
->nr_types
);
241 cds_list_for_each_entry(state
, &ha
->list
, node
) {
244 if (state
->type
!= type
) {
248 ret
= validate_state(ha
, state
);
255 /* Check the global state since some state might not be visible anymore. */
256 if (ha
->flags
[type
] & HEALTH_ERROR
) {
263 DBG("Health check for type %d is %s", (int) type
,
264 (retval
== 0) ? "BAD" : "GOOD");
271 void health_register(struct health_app
*ha
, int type
)
273 assert(type
< ha
->nr_types
);
275 /* Init TLS state. */
276 uatomic_set(&URCU_TLS(health_state
).last
, 0);
277 uatomic_set(&URCU_TLS(health_state
).last_time
.tv_sec
, 0);
278 uatomic_set(&URCU_TLS(health_state
).last_time
.tv_nsec
, 0);
279 uatomic_set(&URCU_TLS(health_state
).current
, 0);
280 uatomic_set(&URCU_TLS(health_state
).flags
, 0);
281 uatomic_set(&URCU_TLS(health_state
).type
, type
);
283 /* Add it to the global TLS state list. */
285 cds_list_add(&URCU_TLS(health_state
).node
, &ha
->list
);
290 * Remove node from global list.
292 void health_unregister(struct health_app
*ha
)
296 * On error, set the global_error_state since we are about to remove
297 * the node from the global list.
299 if (uatomic_read(&URCU_TLS(health_state
).flags
) & HEALTH_ERROR
) {
300 uatomic_set(&ha
->flags
[URCU_TLS(health_state
).type
],
303 cds_list_del(&URCU_TLS(health_state
).node
);