consumerd: send a buffer static sample on flush command
[lttng-tools.git] / src / common / health / health.cpp
CommitLineData
44a5e5eb 1/*
ab5be9fa
MJ
2 * Copyright (C) 2012 David Goulet <dgoulet@efficios.com>
3 * Copyright (C) 2013 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
44a5e5eb 4 *
ab5be9fa 5 * SPDX-License-Identifier: GPL-2.0-only
44a5e5eb 6 *
44a5e5eb
DG
7 */
8
6c1c0768 9#define _LGPL_SOURCE
8784a4d0 10#include <algorithm>
44a5e5eb
DG
11#include <inttypes.h>
12#include <stdio.h>
13#include <stdlib.h>
8809eec0 14#include <time.h>
44a5e5eb 15
c9e313bc
SM
16#include <common/defaults.hpp>
17#include <common/error.hpp>
18#include <common/macros.hpp>
19#include <common/sessiond-comm/inet.hpp>
44a5e5eb 20
c9e313bc 21#include <lttng/health-internal.hpp>
44a5e5eb 22
8782cc74
MD
23/*
24 * An application-specific error state for unregistered thread keeps
25 * track of thread errors. A thread reporting a health error, normally
26 * unregisters and quits. This makes the TLS health state not available
27 * to the health_check_state() call so on unregister we update this
28 * global error array so we can keep track of which thread was on error
29 * if the TLS health state has been removed.
30 */
31struct health_app {
32 /* List of health state, for each application thread */
33 struct cds_list_head list;
34 /*
35 * This lock ensures that TLS memory used for the node and its
36 * container structure don't get reclaimed after the TLS owner
37 * thread exits until we have finished using it.
38 */
39 pthread_mutex_t lock;
40 int nr_types;
41 struct timespec time_delta;
42 /* Health flags containing thread type error state */
43 enum health_flags *flags;
8809eec0
MD
44};
45
927ca06a
DG
46/* Define TLS health state. */
47DEFINE_URCU_TLS(struct health_state, health_state);
48
55d09795
MD
49/*
50 * Initialize health check subsytem.
51 */
52static
53void health_init(struct health_app *ha)
54{
55 /*
56 * Get the maximum value between the default delta value and the TCP
57 * timeout with a safety net of the default health check delta.
58 */
8784a4d0 59 ha->time_delta.tv_sec = std::max<unsigned long>(
55d09795
MD
60 lttcomm_inet_tcp_timeout + DEFAULT_HEALTH_CHECK_DELTA_S,
61 ha->time_delta.tv_sec);
62 DBG("Health check time delta in seconds set to %lu",
8784a4d0 63 ha->time_delta.tv_sec);
55d09795
MD
64}
65
8782cc74
MD
66struct health_app *health_app_create(int nr_types)
67{
68 struct health_app *ha;
927ca06a 69
64803277 70 ha = zmalloc<health_app>();
8782cc74
MD
71 if (!ha) {
72 return NULL;
73 }
64803277 74 ha->flags = calloc<health_flags>(nr_types);
8782cc74
MD
75 if (!ha->flags) {
76 goto error_flags;
77 }
78 CDS_INIT_LIST_HEAD(&ha->list);
79 pthread_mutex_init(&ha->lock, NULL);
80 ha->nr_types = nr_types;
81 ha->time_delta.tv_sec = DEFAULT_HEALTH_CHECK_DELTA_S;
82 ha->time_delta.tv_nsec = DEFAULT_HEALTH_CHECK_DELTA_NS;
55d09795 83 health_init(ha);
8782cc74
MD
84 return ha;
85
86error_flags:
87 free(ha);
88 return NULL;
89}
927ca06a 90
8782cc74
MD
91void health_app_destroy(struct health_app *ha)
92{
93 free(ha->flags);
94 free(ha);
95}
927ca06a
DG
96
97/*
98 * Lock health state global list mutex.
99 */
8782cc74 100static void state_lock(struct health_app *ha)
927ca06a 101{
8782cc74 102 pthread_mutex_lock(&ha->lock);
927ca06a
DG
103}
104
105/*
106 * Unlock health state global list mutex.
107 */
8782cc74 108static void state_unlock(struct health_app *ha)
927ca06a 109{
8782cc74 110 pthread_mutex_unlock(&ha->lock);
927ca06a
DG
111}
112
8809eec0
MD
/*
 * Store time_a - time_b in res.
 *
 * When the nanosecond part of the subtraction is negative, one second is
 * borrowed so that, for normalized inputs, res->tv_nsec ends up in
 * [0, 1000000000). A negative difference is thus expressed with a negative
 * tv_sec and a non-negative tv_nsec.
 *
 * res may point to the same object as time_a or time_b: both fields are
 * computed before anything is written.
 */
static void time_diff(const struct timespec *time_a,
		const struct timespec *time_b, struct timespec *res)
{
	time_t sec = time_a->tv_sec - time_b->tv_sec;
	long nsec = time_a->tv_nsec - time_b->tv_nsec;

	if (nsec < 0) {
		/* Borrow one second to bring the nanoseconds back in range. */
		sec -= 1;
		nsec += 1000000000L;
	}

	res->tv_sec = sec;
	res->tv_nsec = nsec;
}
127
128/*
129 * Return true if time_a - time_b > diff, else false.
130 */
131static int time_diff_gt(const struct timespec *time_a,
132 const struct timespec *time_b, const struct timespec *diff)
133{
134 struct timespec res;
135
136 time_diff(time_a, time_b, &res);
137 time_diff(&res, diff, &res);
138
139 if (res.tv_sec > 0) {
140 return 1;
141 } else if (res.tv_sec == 0 && res.tv_nsec > 0) {
142 return 1;
143 }
144
145 return 0;
146}
147
44a5e5eb 148/*
c89add41 149 * Validate health state. Checks for the error flag or health conditions.
44a5e5eb
DG
150 *
151 * Return 0 if health is bad or else 1.
152 */
8782cc74 153static int validate_state(struct health_app *ha, struct health_state *state)
44a5e5eb 154{
8809eec0 155 int retval = 1, ret;
139ac872 156 unsigned long current, last;
8809eec0 157 struct timespec current_time;
927ca06a 158
a0377dfe 159 LTTNG_ASSERT(state);
44a5e5eb 160
139ac872 161 last = state->last;
44a5e5eb 162 current = uatomic_read(&state->current);
44a5e5eb 163
389fbf04 164 ret = lttng_clock_gettime(CLOCK_MONOTONIC, &current_time);
931a97e5 165 if (ret < 0) {
8809eec0 166 PERROR("Error reading time\n");
139ac872 167 /* error */
8809eec0
MD
168 retval = 0;
169 goto end;
44a5e5eb
DG
170 }
171
8809eec0
MD
172 /*
173 * Thread is in bad health if flag HEALTH_ERROR is set. It is also in bad
174 * health if, after the delta delay has passed, its the progress counter
175 * has not moved and it has NOT been waiting for a poll() call.
176 */
177 if (uatomic_read(&state->flags) & HEALTH_ERROR) {
178 retval = 0;
179 goto end;
180 }
44a5e5eb 181
139ac872 182 /*
8809eec0
MD
183 * Initial condition need to update the last counter and sample time, but
184 * should not check health in this initial case, because we don't know how
185 * much time has passed.
139ac872 186 */
8809eec0
MD
187 if (state->last_time.tv_sec == 0 && state->last_time.tv_nsec == 0) {
188 /* update last counter and last sample time */
189 state->last = current;
190 memcpy(&state->last_time, &current_time, sizeof(current_time));
191 } else {
8782cc74
MD
192 if (time_diff_gt(&current_time, &state->last_time,
193 &ha->time_delta)) {
8809eec0
MD
194 if (current == last && !HEALTH_IS_IN_POLL(current)) {
195 /* error */
196 retval = 0;
197 }
198 /* update last counter and last sample time */
199 state->last = current;
200 memcpy(&state->last_time, &current_time, sizeof(current_time));
c89add41
DG
201
202 /* On error, stop right now and notify caller. */
203 if (retval == 0) {
204 goto end;
205 }
8809eec0
MD
206 }
207 }
208
209end:
77c7c900 210 DBG("Health state current %lu, last %lu, ret %d",
8809eec0 211 current, last, ret);
c89add41
DG
212 return retval;
213}
214
215/*
216 * Check health of a specific health type. Note that if a thread has not yet
217 * initialize its health subsystem or has quit, it's considered in a good
218 * state.
219 *
220 * Return 0 if health is bad or else 1.
221 */
8782cc74 222int health_check_state(struct health_app *ha, int type)
c89add41
DG
223{
224 int retval = 1;
225 struct health_state *state;
226
a0377dfe 227 LTTNG_ASSERT(type < ha->nr_types);
c89add41 228
8782cc74 229 state_lock(ha);
c89add41 230
8782cc74 231 cds_list_for_each_entry(state, &ha->list, node) {
c89add41
DG
232 int ret;
233
234 if (state->type != type) {
235 continue;
236 }
237
8782cc74 238 ret = validate_state(ha, state);
c89add41
DG
239 if (!ret) {
240 retval = 0;
241 goto end;
242 }
243 }
244
245 /* Check the global state since some state might not be visible anymore. */
8782cc74 246 if (ha->flags[type] & HEALTH_ERROR) {
c89add41
DG
247 retval = 0;
248 }
249
250end:
8782cc74 251 state_unlock(ha);
139ac872 252
c89add41
DG
253 DBG("Health check for type %d is %s", (int) type,
254 (retval == 0) ? "BAD" : "GOOD");
8809eec0 255 return retval;
44a5e5eb 256}
927ca06a
DG
257
258/*
259 * Init health state.
260 */
8782cc74 261void health_register(struct health_app *ha, int type)
927ca06a 262{
a0377dfe 263 LTTNG_ASSERT(type < ha->nr_types);
927ca06a
DG
264
265 /* Init TLS state. */
266 uatomic_set(&URCU_TLS(health_state).last, 0);
267 uatomic_set(&URCU_TLS(health_state).last_time.tv_sec, 0);
268 uatomic_set(&URCU_TLS(health_state).last_time.tv_nsec, 0);
269 uatomic_set(&URCU_TLS(health_state).current, 0);
8784a4d0 270 uatomic_set(&URCU_TLS(health_state).flags, (health_flags) 0);
927ca06a
DG
271 uatomic_set(&URCU_TLS(health_state).type, type);
272
273 /* Add it to the global TLS state list. */
8782cc74
MD
274 state_lock(ha);
275 cds_list_add(&URCU_TLS(health_state).node, &ha->list);
276 state_unlock(ha);
927ca06a
DG
277}
278
279/*
280 * Remove node from global list.
281 */
8782cc74 282void health_unregister(struct health_app *ha)
927ca06a 283{
8782cc74 284 state_lock(ha);
927ca06a
DG
285 /*
286 * On error, set the global_error_state since we are about to remove
287 * the node from the global list.
288 */
289 if (uatomic_read(&URCU_TLS(health_state).flags) & HEALTH_ERROR) {
8782cc74 290 uatomic_set(&ha->flags[URCU_TLS(health_state).type],
927ca06a
DG
291 HEALTH_ERROR);
292 }
293 cds_list_del(&URCU_TLS(health_state).node);
8782cc74 294 state_unlock(ha);
927ca06a 295}
This page took 0.078456 seconds and 4 git commands to generate.