clang-tidy: add Chrome-inspired checks
[lttng-tools.git] / src / common / health / health.cpp
... / ...
CommitLineData
1/*
2 * Copyright (C) 2012 David Goulet <dgoulet@efficios.com>
3 * Copyright (C) 2013 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
4 *
5 * SPDX-License-Identifier: GPL-2.0-only
6 *
7 */
8
9#define _LGPL_SOURCE
10#include <common/defaults.hpp>
11#include <common/error.hpp>
12#include <common/macros.hpp>
13#include <common/sessiond-comm/inet.hpp>
14
15#include <lttng/health-internal.hpp>
16
17#include <algorithm>
18#include <inttypes.h>
19#include <stdio.h>
20#include <stdlib.h>
21#include <time.h>
22
23/*
24 * An application-specific error state for unregistered thread keeps
25 * track of thread errors. A thread reporting a health error, normally
26 * unregisters and quits. This makes the TLS health state not available
27 * to the health_check_state() call so on unregister we update this
28 * global error array so we can keep track of which thread was on error
29 * if the TLS health state has been removed.
30 */
31struct health_app {
32 /* List of health state, for each application thread */
33 struct cds_list_head list;
34 /*
35 * This lock ensures that TLS memory used for the node and its
36 * container structure don't get reclaimed after the TLS owner
37 * thread exits until we have finished using it.
38 */
39 pthread_mutex_t lock;
40 int nr_types;
41 struct timespec time_delta;
42 /* Health flags containing thread type error state */
43 enum health_flags *flags;
44};
45
46/*
 * Define the TLS health state of the calling thread. It is accessed through
 * URCU_TLS(health_state); health_register() initializes its fields and links
 * its node into the health_app list, health_unregister() unlinks it.
 */
47DEFINE_URCU_TLS(struct health_state, health_state);
48
49/*
50 * Initialize health check subsytem.
51 */
52static void health_init(struct health_app *ha)
53{
54 /*
55 * Get the maximum value between the default delta value and the TCP
56 * timeout with a safety net of the default health check delta.
57 */
58 ha->time_delta.tv_sec = std::max<unsigned long>(
59 lttcomm_inet_tcp_timeout + DEFAULT_HEALTH_CHECK_DELTA_S, ha->time_delta.tv_sec);
60 DBG("Health check time delta in seconds set to %lu", ha->time_delta.tv_sec);
61}
62
63struct health_app *health_app_create(int nr_types)
64{
65 struct health_app *ha;
66
67 ha = zmalloc<health_app>();
68 if (!ha) {
69 return nullptr;
70 }
71 ha->flags = calloc<health_flags>(nr_types);
72 if (!ha->flags) {
73 goto error_flags;
74 }
75 CDS_INIT_LIST_HEAD(&ha->list);
76 pthread_mutex_init(&ha->lock, nullptr);
77 ha->nr_types = nr_types;
78 ha->time_delta.tv_sec = DEFAULT_HEALTH_CHECK_DELTA_S;
79 ha->time_delta.tv_nsec = DEFAULT_HEALTH_CHECK_DELTA_NS;
80 health_init(ha);
81 return ha;
82
83error_flags:
84 free(ha);
85 return nullptr;
86}
87
88void health_app_destroy(struct health_app *ha)
89{
90 free(ha->flags);
91 free(ha);
92}
93
94/*
95 * Lock health state global list mutex.
96 */
97static void state_lock(struct health_app *ha)
98{
99 pthread_mutex_lock(&ha->lock);
100}
101
102/*
103 * Unlock health state global list mutex.
104 */
105static void state_unlock(struct health_app *ha)
106{
107 pthread_mutex_unlock(&ha->lock);
108}
109
/*
 * Set res to the time difference time_a - time_b.
 *
 * The nanosecond field of the result is kept in [0, 1000000000) by borrowing
 * one second when time_a's nanoseconds are smaller than time_b's. Both
 * operands are fully read before res is written, so res may point to the
 * same object as time_a or time_b.
 */
static void
time_diff(const struct timespec *time_a, const struct timespec *time_b, struct timespec *res)
{
	time_t sec = time_a->tv_sec - time_b->tv_sec;
	long nsec = time_a->tv_nsec - time_b->tv_nsec;

	if (nsec < 0) {
		/* Borrow one second to bring the nanoseconds back in range. */
		sec -= 1;
		nsec += 1000000000L;
	}

	res->tv_sec = sec;
	res->tv_nsec = nsec;
}
124
125/*
126 * Return true if time_a - time_b > diff, else false.
127 */
128static int time_diff_gt(const struct timespec *time_a,
129 const struct timespec *time_b,
130 const struct timespec *diff)
131{
132 struct timespec res;
133
134 time_diff(time_a, time_b, &res);
135 time_diff(&res, diff, &res);
136
137 if (res.tv_sec > 0) {
138 return 1;
139 } else if (res.tv_sec == 0 && res.tv_nsec > 0) {
140 return 1;
141 }
142
143 return 0;
144}
145
146/*
147 * Validate health state. Checks for the error flag or health conditions.
148 *
149 * Return 0 if health is bad or else 1.
150 */
151static int validate_state(struct health_app *ha, struct health_state *state)
152{
153 int retval = 1, ret;
154 unsigned long current, last;
155 struct timespec current_time;
156
157 LTTNG_ASSERT(state);
158
159 last = state->last;
160 current = uatomic_read(&state->current);
161
162 ret = lttng_clock_gettime(CLOCK_MONOTONIC, &current_time);
163 if (ret < 0) {
164 PERROR("Error reading time\n");
165 /* error */
166 retval = 0;
167 goto end;
168 }
169
170 /*
171 * Thread is in bad health if flag HEALTH_ERROR is set. It is also in bad
172 * health if, after the delta delay has passed, its the progress counter
173 * has not moved and it has NOT been waiting for a poll() call.
174 */
175 if (uatomic_read(&state->flags) & HEALTH_ERROR) {
176 retval = 0;
177 goto end;
178 }
179
180 /*
181 * Initial condition need to update the last counter and sample time, but
182 * should not check health in this initial case, because we don't know how
183 * much time has passed.
184 */
185 if (state->last_time.tv_sec == 0 && state->last_time.tv_nsec == 0) {
186 /* update last counter and last sample time */
187 state->last = current;
188 memcpy(&state->last_time, &current_time, sizeof(current_time));
189 } else {
190 if (time_diff_gt(&current_time, &state->last_time, &ha->time_delta)) {
191 if (current == last && !HEALTH_IS_IN_POLL(current)) {
192 /* error */
193 retval = 0;
194 }
195 /* update last counter and last sample time */
196 state->last = current;
197 memcpy(&state->last_time, &current_time, sizeof(current_time));
198
199 /* On error, stop right now and notify caller. */
200 if (retval == 0) {
201 goto end;
202 }
203 }
204 }
205
206end:
207 DBG("Health state current %lu, last %lu, ret %d", current, last, ret);
208 return retval;
209}
210
211/*
212 * Check health of a specific health type. Note that if a thread has not yet
213 * initialize its health subsystem or has quit, it's considered in a good
214 * state.
215 *
216 * Return 0 if health is bad or else 1.
217 */
218int health_check_state(struct health_app *ha, int type)
219{
220 int retval = 1;
221 struct health_state *state;
222
223 LTTNG_ASSERT(type < ha->nr_types);
224
225 state_lock(ha);
226
227 cds_list_for_each_entry (state, &ha->list, node) {
228 int ret;
229
230 if (state->type != type) {
231 continue;
232 }
233
234 ret = validate_state(ha, state);
235 if (!ret) {
236 retval = 0;
237 goto end;
238 }
239 }
240
241 /* Check the global state since some state might not be visible anymore. */
242 if (ha->flags[type] & HEALTH_ERROR) {
243 retval = 0;
244 }
245
246end:
247 state_unlock(ha);
248
249 DBG("Health check for type %d is %s", (int) type, (retval == 0) ? "BAD" : "GOOD");
250 return retval;
251}
252
253/*
254 * Init health state.
255 */
256void health_register(struct health_app *ha, int type)
257{
258 LTTNG_ASSERT(type < ha->nr_types);
259
260 /* Init TLS state. */
261 uatomic_set(&URCU_TLS(health_state).last, 0);
262 uatomic_set(&URCU_TLS(health_state).last_time.tv_sec, 0);
263 uatomic_set(&URCU_TLS(health_state).last_time.tv_nsec, 0);
264 uatomic_set(&URCU_TLS(health_state).current, 0);
265 uatomic_set(&URCU_TLS(health_state).flags, (health_flags) 0);
266 uatomic_set(&URCU_TLS(health_state).type, type);
267
268 /* Add it to the global TLS state list. */
269 state_lock(ha);
270 cds_list_add(&URCU_TLS(health_state).node, &ha->list);
271 state_unlock(ha);
272}
273
274/*
275 * Remove node from global list.
276 */
277void health_unregister(struct health_app *ha)
278{
279 state_lock(ha);
280 /*
281 * On error, set the global_error_state since we are about to remove
282 * the node from the global list.
283 */
284 if (uatomic_read(&URCU_TLS(health_state).flags) & HEALTH_ERROR) {
285 uatomic_set(&ha->flags[URCU_TLS(health_state).type], HEALTH_ERROR);
286 }
287 cds_list_del(&URCU_TLS(health_state).node);
288 state_unlock(ha);
289}
This page took 0.022862 seconds and 4 git commands to generate.