Fix: health subsystem issues with shared code
[lttng-tools.git] / src / bin / lttng-sessiond / health.c
CommitLineData
44a5e5eb
DG
1/*
2 * Copyright (C) 2012 - David Goulet <dgoulet@efficios.com>
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of the GNU General Public License, version 2 only, as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 51
15 * Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
16 */
17
18#define _GNU_SOURCE
19#include <assert.h>
20#include <inttypes.h>
21#include <stdio.h>
22#include <stdlib.h>
8809eec0 23#include <time.h>
44a5e5eb 24
8809eec0 25#include <common/defaults.h>
44a5e5eb
DG
26#include <common/error.h>
27
28#include "health.h"
29
8809eec0
MD
30static const struct timespec time_delta = {
31 .tv_sec = DEFAULT_HEALTH_CHECK_DELTA_S,
32 .tv_nsec = DEFAULT_HEALTH_CHECK_DELTA_NS,
33};
34
927ca06a
DG
35/* Define TLS health state. */
36DEFINE_URCU_TLS(struct health_state, health_state);
37
38/*
39 * It ensures that TLS memory used for the node and its container structure
40 * don't get reclaimed after the TLS owner thread exits until we have finished
41 * using it.
42 */
43static pthread_mutex_t health_mutex = PTHREAD_MUTEX_INITIALIZER;
44
45static struct health_tls_state_list health_state_list = {
46 .head = CDS_LIST_HEAD_INIT(health_state_list.head),
47};
48
49/*
50 * This keeps track of the error state for unregistered thread. A thread
51 * reporting a health error, normally unregisters and quits. This makes the TLS
52 * health state not available to the health_check_state() call so on unregister
53 * we update this global error array so we can keep track of which thread was
54 * on error if the TLS health state has been removed.
55 */
56static enum health_flags global_error_state[HEALTH_NUM_TYPE];
57
58/*
59 * Lock health state global list mutex.
60 */
61static void state_lock(void)
62{
63 pthread_mutex_lock(&health_mutex);
64}
65
66/*
67 * Unlock health state global list mutex.
68 */
69static void state_unlock(void)
70{
71 pthread_mutex_unlock(&health_mutex);
72}
73
8809eec0
MD
/*
 * Compute time_a - time_b and store the result in res.
 *
 * When the nanosecond subtraction goes negative, one second is borrowed so
 * that res->tv_nsec ends up in [0, 1000000000) for normalized inputs. The
 * result is a negative tv_sec when time_a is earlier than time_b.
 *
 * The difference is computed into locals before res is written, so res may
 * safely alias time_a or time_b (time_diff_gt() relies on this).
 */
static void time_diff(const struct timespec *time_a,
		const struct timespec *time_b, struct timespec *res)
{
	time_t sec = time_a->tv_sec - time_b->tv_sec;
	long nsec = time_a->tv_nsec - time_b->tv_nsec;

	if (nsec < 0) {
		/* Borrow one second from the seconds field. */
		sec -= 1;
		nsec += 1000000000L;
	}

	res->tv_sec = sec;
	res->tv_nsec = nsec;
}
88
/*
 * Tell whether the interval time_a - time_b is strictly greater than diff.
 *
 * Return 1 if so, 0 otherwise.
 */
static int time_diff_gt(const struct timespec *time_a,
		const struct timespec *time_b, const struct timespec *diff)
{
	struct timespec remainder;

	/* remainder = (time_a - time_b) - diff, computed in two steps. */
	time_diff(time_a, time_b, &remainder);
	time_diff(&remainder, diff, &remainder);

	/* Strictly positive remainder means the interval exceeds diff. */
	return remainder.tv_sec > 0 ||
		(remainder.tv_sec == 0 && remainder.tv_nsec > 0);
}
108
44a5e5eb 109/*
927ca06a
DG
110 * Health mutex MUST be held across use of the returned struct health_state to
111 * provide existence guarantee.
112 *
113 * Return the health_state object or NULL if not found.
114 */
115static struct health_state *find_health_state(enum health_type type)
116{
117 struct health_state *state;
118
119 /* Find the right health state in the global TLS list. */
120 cds_list_for_each_entry(state, &health_state_list.head, node) {
121 if (state->type == type) {
122 return state;
123 }
124 }
125
126 return NULL;
127}
128
129/*
130 * Check health of a specific health type. Note that if a thread has not yet
131 * initialize its health subsystem or has quit, it's considered in a good
132 * state.
44a5e5eb
DG
133 *
134 * Return 0 if health is bad or else 1.
135 */
927ca06a 136int health_check_state(enum health_type type)
44a5e5eb 137{
8809eec0 138 int retval = 1, ret;
139ac872 139 unsigned long current, last;
8809eec0 140 struct timespec current_time;
927ca06a
DG
141 struct health_state *state;
142
143 assert(type < HEALTH_NUM_TYPE);
44a5e5eb 144
927ca06a
DG
145 state_lock();
146
147 state = find_health_state(type);
148 if (!state) {
149 /* Check the global state since the state is not visiable anymore. */
150 if (global_error_state[type] & HEALTH_ERROR) {
151 retval = 0;
152 }
153 goto not_found;
154 }
44a5e5eb 155
139ac872 156 last = state->last;
44a5e5eb 157 current = uatomic_read(&state->current);
44a5e5eb 158
8809eec0 159 ret = clock_gettime(CLOCK_MONOTONIC, &current_time);
931a97e5 160 if (ret < 0) {
8809eec0 161 PERROR("Error reading time\n");
139ac872 162 /* error */
8809eec0
MD
163 retval = 0;
164 goto end;
44a5e5eb
DG
165 }
166
8809eec0
MD
167 /*
168 * Thread is in bad health if flag HEALTH_ERROR is set. It is also in bad
169 * health if, after the delta delay has passed, its the progress counter
170 * has not moved and it has NOT been waiting for a poll() call.
171 */
172 if (uatomic_read(&state->flags) & HEALTH_ERROR) {
173 retval = 0;
174 goto end;
175 }
44a5e5eb 176
139ac872 177 /*
8809eec0
MD
178 * Initial condition need to update the last counter and sample time, but
179 * should not check health in this initial case, because we don't know how
180 * much time has passed.
139ac872 181 */
8809eec0
MD
182 if (state->last_time.tv_sec == 0 && state->last_time.tv_nsec == 0) {
183 /* update last counter and last sample time */
184 state->last = current;
185 memcpy(&state->last_time, &current_time, sizeof(current_time));
186 } else {
187 if (time_diff_gt(&current_time, &state->last_time, &time_delta)) {
188 if (current == last && !HEALTH_IS_IN_POLL(current)) {
189 /* error */
190 retval = 0;
191 }
192 /* update last counter and last sample time */
193 state->last = current;
194 memcpy(&state->last_time, &current_time, sizeof(current_time));
195 }
196 }
197
198end:
77c7c900 199 DBG("Health state current %lu, last %lu, ret %d",
8809eec0 200 current, last, ret);
927ca06a
DG
201not_found:
202 state_unlock();
139ac872 203
8809eec0 204 return retval;
44a5e5eb 205}
927ca06a
DG
206
207/*
208 * Init health state.
209 */
210void health_register(enum health_type type)
211{
212 struct health_state *state;
213
214 assert(type < HEALTH_NUM_TYPE);
215
216 /* Init TLS state. */
217 uatomic_set(&URCU_TLS(health_state).last, 0);
218 uatomic_set(&URCU_TLS(health_state).last_time.tv_sec, 0);
219 uatomic_set(&URCU_TLS(health_state).last_time.tv_nsec, 0);
220 uatomic_set(&URCU_TLS(health_state).current, 0);
221 uatomic_set(&URCU_TLS(health_state).flags, 0);
222 uatomic_set(&URCU_TLS(health_state).type, type);
223
224 /* Add it to the global TLS state list. */
225 state_lock();
226 state = find_health_state(type);
227 /*
228 * Duplicates are not accepted, since lookups don't handle them at the
229 * moment.
230 */
231 assert(!state);
232
233 cds_list_add(&URCU_TLS(health_state).node, &health_state_list.head);
234 state_unlock();
235}
236
237/*
238 * Remove node from global list.
239 */
240void health_unregister(void)
241{
242 state_lock();
243 /*
244 * On error, set the global_error_state since we are about to remove
245 * the node from the global list.
246 */
247 if (uatomic_read(&URCU_TLS(health_state).flags) & HEALTH_ERROR) {
248 uatomic_set(&global_error_state[URCU_TLS(health_state).type],
249 HEALTH_ERROR);
250 }
251 cds_list_del(&URCU_TLS(health_state).node);
252 state_unlock();
253}
This page took 0.034012 seconds and 4 git commands to generate.