Fix: health subsystem issues with shared code
[lttng-tools.git] / src / bin / lttng-sessiond / health.c
1 /*
2 * Copyright (C) 2012 - David Goulet <dgoulet@efficios.com>
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of the GNU General Public License, version 2 only, as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 51
15 * Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
16 */
17
18 #define _GNU_SOURCE
19 #include <assert.h>
20 #include <inttypes.h>
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <time.h>
24
25 #include <common/defaults.h>
26 #include <common/error.h>
27
28 #include "health.h"
29
30 static const struct timespec time_delta = {
31 .tv_sec = DEFAULT_HEALTH_CHECK_DELTA_S,
32 .tv_nsec = DEFAULT_HEALTH_CHECK_DELTA_NS,
33 };
34
35 /* Define TLS health state. */
36 DEFINE_URCU_TLS(struct health_state, health_state);
37
38 /*
39 * It ensures that TLS memory used for the node and its container structure
40 * don't get reclaimed after the TLS owner thread exits until we have finished
41 * using it.
42 */
43 static pthread_mutex_t health_mutex = PTHREAD_MUTEX_INITIALIZER;
44
45 static struct health_tls_state_list health_state_list = {
46 .head = CDS_LIST_HEAD_INIT(health_state_list.head),
47 };
48
49 /*
50 * This keeps track of the error state for unregistered thread. A thread
51 * reporting a health error, normally unregisters and quits. This makes the TLS
52 * health state not available to the health_check_state() call so on unregister
53 * we update this global error array so we can keep track of which thread was
54 * on error if the TLS health state has been removed.
55 */
56 static enum health_flags global_error_state[HEALTH_NUM_TYPE];
57
58 /*
59 * Lock health state global list mutex.
60 */
61 static void state_lock(void)
62 {
63 pthread_mutex_lock(&health_mutex);
64 }
65
66 /*
67 * Unlock health state global list mutex.
68 */
69 static void state_unlock(void)
70 {
71 pthread_mutex_unlock(&health_mutex);
72 }
73
/*
 * Compute time_a - time_b and store the result in res.
 *
 * When the nanosecond field of time_a is smaller than the one of time_b, one
 * second is borrowed so that the nanosecond part of the result is expressed
 * as a non-negative value (for inputs whose tv_nsec is already normalized).
 *
 * res is allowed to alias time_a or time_b: both result fields are computed
 * into locals before anything is written back, so the inputs are never read
 * after being partially overwritten.
 */
static void time_diff(const struct timespec *time_a,
		const struct timespec *time_b, struct timespec *res)
{
	time_t sec = time_a->tv_sec - time_b->tv_sec;
	long nsec = time_a->tv_nsec - time_b->tv_nsec;

	if (nsec < 0) {
		/* Borrow one second to bring the nanoseconds back above zero. */
		sec -= 1;
		nsec += 1000000000L;
	}

	res->tv_sec = sec;
	res->tv_nsec = nsec;
}
88
/*
 * Tell whether the time elapsed between time_b and time_a exceeds diff.
 *
 * Return 1 if time_a - time_b > diff, 0 otherwise.
 */
static int time_diff_gt(const struct timespec *time_a,
		const struct timespec *time_b, const struct timespec *diff)
{
	struct timespec excess;

	/* excess = (time_a - time_b) - diff, the second step done in place. */
	time_diff(time_a, time_b, &excess);
	time_diff(&excess, diff, &excess);

	/* Strictly positive when either field is, seconds taking precedence. */
	return excess.tv_sec > 0
		|| (excess.tv_sec == 0 && excess.tv_nsec > 0);
}
108
109 /*
110 * Health mutex MUST be held across use of the returned struct health_state to
111 * provide existence guarantee.
112 *
113 * Return the health_state object or NULL if not found.
114 */
115 static struct health_state *find_health_state(enum health_type type)
116 {
117 struct health_state *state;
118
119 /* Find the right health state in the global TLS list. */
120 cds_list_for_each_entry(state, &health_state_list.head, node) {
121 if (state->type == type) {
122 return state;
123 }
124 }
125
126 return NULL;
127 }
128
129 /*
130 * Check health of a specific health type. Note that if a thread has not yet
131 * initialize its health subsystem or has quit, it's considered in a good
132 * state.
133 *
134 * Return 0 if health is bad or else 1.
135 */
136 int health_check_state(enum health_type type)
137 {
138 int retval = 1, ret;
139 unsigned long current, last;
140 struct timespec current_time;
141 struct health_state *state;
142
143 assert(type < HEALTH_NUM_TYPE);
144
145 state_lock();
146
147 state = find_health_state(type);
148 if (!state) {
149 /* Check the global state since the state is not visiable anymore. */
150 if (global_error_state[type] & HEALTH_ERROR) {
151 retval = 0;
152 }
153 goto not_found;
154 }
155
156 last = state->last;
157 current = uatomic_read(&state->current);
158
159 ret = clock_gettime(CLOCK_MONOTONIC, &current_time);
160 if (ret < 0) {
161 PERROR("Error reading time\n");
162 /* error */
163 retval = 0;
164 goto end;
165 }
166
167 /*
168 * Thread is in bad health if flag HEALTH_ERROR is set. It is also in bad
169 * health if, after the delta delay has passed, its the progress counter
170 * has not moved and it has NOT been waiting for a poll() call.
171 */
172 if (uatomic_read(&state->flags) & HEALTH_ERROR) {
173 retval = 0;
174 goto end;
175 }
176
177 /*
178 * Initial condition need to update the last counter and sample time, but
179 * should not check health in this initial case, because we don't know how
180 * much time has passed.
181 */
182 if (state->last_time.tv_sec == 0 && state->last_time.tv_nsec == 0) {
183 /* update last counter and last sample time */
184 state->last = current;
185 memcpy(&state->last_time, &current_time, sizeof(current_time));
186 } else {
187 if (time_diff_gt(&current_time, &state->last_time, &time_delta)) {
188 if (current == last && !HEALTH_IS_IN_POLL(current)) {
189 /* error */
190 retval = 0;
191 }
192 /* update last counter and last sample time */
193 state->last = current;
194 memcpy(&state->last_time, &current_time, sizeof(current_time));
195 }
196 }
197
198 end:
199 DBG("Health state current %lu, last %lu, ret %d",
200 current, last, ret);
201 not_found:
202 state_unlock();
203
204 return retval;
205 }
206
207 /*
208 * Init health state.
209 */
210 void health_register(enum health_type type)
211 {
212 struct health_state *state;
213
214 assert(type < HEALTH_NUM_TYPE);
215
216 /* Init TLS state. */
217 uatomic_set(&URCU_TLS(health_state).last, 0);
218 uatomic_set(&URCU_TLS(health_state).last_time.tv_sec, 0);
219 uatomic_set(&URCU_TLS(health_state).last_time.tv_nsec, 0);
220 uatomic_set(&URCU_TLS(health_state).current, 0);
221 uatomic_set(&URCU_TLS(health_state).flags, 0);
222 uatomic_set(&URCU_TLS(health_state).type, type);
223
224 /* Add it to the global TLS state list. */
225 state_lock();
226 state = find_health_state(type);
227 /*
228 * Duplicates are not accepted, since lookups don't handle them at the
229 * moment.
230 */
231 assert(!state);
232
233 cds_list_add(&URCU_TLS(health_state).node, &health_state_list.head);
234 state_unlock();
235 }
236
237 /*
238 * Remove node from global list.
239 */
240 void health_unregister(void)
241 {
242 state_lock();
243 /*
244 * On error, set the global_error_state since we are about to remove
245 * the node from the global list.
246 */
247 if (uatomic_read(&URCU_TLS(health_state).flags) & HEALTH_ERROR) {
248 uatomic_set(&global_error_state[URCU_TLS(health_state).type],
249 HEALTH_ERROR);
250 }
251 cds_list_del(&URCU_TLS(health_state).node);
252 state_unlock();
253 }
This page took 0.034875 seconds and 5 git commands to generate.