Commit | Line | Data |
---|---|---|
44a5e5eb DG |
1 | /* |
2 | * Copyright (C) 2012 - David Goulet <dgoulet@efficios.com> | |
3 | * | |
4 | * This program is free software; you can redistribute it and/or modify it | |
5 | * under the terms of the GNU General Public License, version 2 only, as | |
6 | * published by the Free Software Foundation. | |
7 | * | |
8 | * This program is distributed in the hope that it will be useful, but WITHOUT | |
9 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
10 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | |
11 | * more details. | |
12 | * | |
13 | * You should have received a copy of the GNU General Public License along with | |
14 | * this program; if not, write to the Free Software Foundation, Inc., 51 | |
15 | * Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | |
16 | */ | |
17 | ||
18 | #define _GNU_SOURCE | |
19 | #include <assert.h> | |
20 | #include <inttypes.h> | |
21 | #include <stdio.h> | |
22 | #include <stdlib.h> | |
8809eec0 | 23 | #include <time.h> |
44a5e5eb | 24 | |
8809eec0 | 25 | #include <common/defaults.h> |
44a5e5eb DG |
26 | #include <common/error.h> |
27 | ||
28 | #include "health.h" | |
29 | ||
8809eec0 MD |
30 | static const struct timespec time_delta = { |
31 | .tv_sec = DEFAULT_HEALTH_CHECK_DELTA_S, | |
32 | .tv_nsec = DEFAULT_HEALTH_CHECK_DELTA_NS, | |
33 | }; | |
34 | ||
927ca06a DG |
35 | /* Define TLS health state. */ |
36 | DEFINE_URCU_TLS(struct health_state, health_state); | |
37 | ||
/*
 * Mutex taken through state_lock()/state_unlock() around every access to the
 * global health state list.
 *
 * The list nodes live in the TLS of their owner thread; holding this mutex
 * ensures that the memory of a node and of its container structure is not
 * reclaimed after the owner thread exits until we have finished using it.
 */
static pthread_mutex_t health_mutex = PTHREAD_MUTEX_INITIALIZER;
44 | ||
45 | static struct health_tls_state_list health_state_list = { | |
46 | .head = CDS_LIST_HEAD_INIT(health_state_list.head), | |
47 | }; | |
48 | ||
49 | /* | |
50 | * This keeps track of the error state for unregistered thread. A thread | |
51 | * reporting a health error, normally unregisters and quits. This makes the TLS | |
52 | * health state not available to the health_check_state() call so on unregister | |
53 | * we update this global error array so we can keep track of which thread was | |
54 | * on error if the TLS health state has been removed. | |
55 | */ | |
56 | static enum health_flags global_error_state[HEALTH_NUM_TYPE]; | |
57 | ||
58 | /* | |
59 | * Lock health state global list mutex. | |
60 | */ | |
61 | static void state_lock(void) | |
62 | { | |
63 | pthread_mutex_lock(&health_mutex); | |
64 | } | |
65 | ||
66 | /* | |
67 | * Unlock health state global list mutex. | |
68 | */ | |
69 | static void state_unlock(void) | |
70 | { | |
71 | pthread_mutex_unlock(&health_mutex); | |
72 | } | |
73 | ||
8809eec0 MD |
/*
 * Compute time_a - time_b and store the result in res.
 *
 * When the nanosecond field of time_a is smaller than the one of time_b, one
 * second is borrowed from the seconds field so that, for normalized inputs,
 * res->tv_nsec is never negative.
 *
 * Both operands are fully read before res is written, so res may point to the
 * same object as time_a or time_b.
 */
static void time_diff(const struct timespec *time_a,
		const struct timespec *time_b, struct timespec *res)
{
	time_t sec = time_a->tv_sec - time_b->tv_sec;
	long nsec = time_a->tv_nsec - time_b->tv_nsec;

	if (nsec < 0) {
		/* Borrow one second to bring the nanoseconds back to >= 0. */
		sec -= 1;
		nsec += 1000000000L;
	}

	res->tv_sec = sec;
	res->tv_nsec = nsec;
}
88 | ||
/*
 * Tell whether the time elapsed from time_b to time_a is strictly greater
 * than diff.
 *
 * Return 1 if time_a - time_b > diff, else 0.
 */
static int time_diff_gt(const struct timespec *time_a,
		const struct timespec *time_b, const struct timespec *diff)
{
	struct timespec remainder;

	/* remainder = (time_a - time_b) - diff */
	time_diff(time_a, time_b, &remainder);
	time_diff(&remainder, diff, &remainder);

	/* A strictly positive remainder means diff has been exceeded. */
	return remainder.tv_sec > 0 ||
		(remainder.tv_sec == 0 && remainder.tv_nsec > 0);
}
108 | ||
44a5e5eb | 109 | /* |
c89add41 | 110 | * Validate health state. Checks for the error flag or health conditions. |
44a5e5eb DG |
111 | * |
112 | * Return 0 if health is bad or else 1. | |
113 | */ | |
c89add41 | 114 | static int validate_state(struct health_state *state) |
44a5e5eb | 115 | { |
8809eec0 | 116 | int retval = 1, ret; |
139ac872 | 117 | unsigned long current, last; |
8809eec0 | 118 | struct timespec current_time; |
927ca06a | 119 | |
c89add41 | 120 | assert(state); |
44a5e5eb | 121 | |
139ac872 | 122 | last = state->last; |
44a5e5eb | 123 | current = uatomic_read(&state->current); |
44a5e5eb | 124 | |
8809eec0 | 125 | ret = clock_gettime(CLOCK_MONOTONIC, ¤t_time); |
931a97e5 | 126 | if (ret < 0) { |
8809eec0 | 127 | PERROR("Error reading time\n"); |
139ac872 | 128 | /* error */ |
8809eec0 MD |
129 | retval = 0; |
130 | goto end; | |
44a5e5eb DG |
131 | } |
132 | ||
8809eec0 MD |
133 | /* |
134 | * Thread is in bad health if flag HEALTH_ERROR is set. It is also in bad | |
135 | * health if, after the delta delay has passed, its the progress counter | |
136 | * has not moved and it has NOT been waiting for a poll() call. | |
137 | */ | |
138 | if (uatomic_read(&state->flags) & HEALTH_ERROR) { | |
139 | retval = 0; | |
140 | goto end; | |
141 | } | |
44a5e5eb | 142 | |
139ac872 | 143 | /* |
8809eec0 MD |
144 | * Initial condition need to update the last counter and sample time, but |
145 | * should not check health in this initial case, because we don't know how | |
146 | * much time has passed. | |
139ac872 | 147 | */ |
8809eec0 MD |
148 | if (state->last_time.tv_sec == 0 && state->last_time.tv_nsec == 0) { |
149 | /* update last counter and last sample time */ | |
150 | state->last = current; | |
151 | memcpy(&state->last_time, ¤t_time, sizeof(current_time)); | |
152 | } else { | |
153 | if (time_diff_gt(¤t_time, &state->last_time, &time_delta)) { | |
154 | if (current == last && !HEALTH_IS_IN_POLL(current)) { | |
155 | /* error */ | |
156 | retval = 0; | |
157 | } | |
158 | /* update last counter and last sample time */ | |
159 | state->last = current; | |
160 | memcpy(&state->last_time, ¤t_time, sizeof(current_time)); | |
c89add41 DG |
161 | |
162 | /* On error, stop right now and notify caller. */ | |
163 | if (retval == 0) { | |
164 | goto end; | |
165 | } | |
8809eec0 MD |
166 | } |
167 | } | |
168 | ||
169 | end: | |
77c7c900 | 170 | DBG("Health state current %lu, last %lu, ret %d", |
8809eec0 | 171 | current, last, ret); |
c89add41 DG |
172 | return retval; |
173 | } | |
174 | ||
175 | /* | |
176 | * Check health of a specific health type. Note that if a thread has not yet | |
177 | * initialize its health subsystem or has quit, it's considered in a good | |
178 | * state. | |
179 | * | |
180 | * Return 0 if health is bad or else 1. | |
181 | */ | |
182 | int health_check_state(enum health_type type) | |
183 | { | |
184 | int retval = 1; | |
185 | struct health_state *state; | |
186 | ||
187 | assert(type < HEALTH_NUM_TYPE); | |
188 | ||
189 | state_lock(); | |
190 | ||
191 | cds_list_for_each_entry(state, &health_state_list.head, node) { | |
192 | int ret; | |
193 | ||
194 | if (state->type != type) { | |
195 | continue; | |
196 | } | |
197 | ||
198 | ret = validate_state(state); | |
199 | if (!ret) { | |
200 | retval = 0; | |
201 | goto end; | |
202 | } | |
203 | } | |
204 | ||
205 | /* Check the global state since some state might not be visible anymore. */ | |
206 | if (global_error_state[type] & HEALTH_ERROR) { | |
207 | retval = 0; | |
208 | } | |
209 | ||
210 | end: | |
927ca06a | 211 | state_unlock(); |
139ac872 | 212 | |
c89add41 DG |
213 | DBG("Health check for type %d is %s", (int) type, |
214 | (retval == 0) ? "BAD" : "GOOD"); | |
8809eec0 | 215 | return retval; |
44a5e5eb | 216 | } |
927ca06a DG |
217 | |
218 | /* | |
219 | * Init health state. | |
220 | */ | |
221 | void health_register(enum health_type type) | |
222 | { | |
927ca06a DG |
223 | assert(type < HEALTH_NUM_TYPE); |
224 | ||
225 | /* Init TLS state. */ | |
226 | uatomic_set(&URCU_TLS(health_state).last, 0); | |
227 | uatomic_set(&URCU_TLS(health_state).last_time.tv_sec, 0); | |
228 | uatomic_set(&URCU_TLS(health_state).last_time.tv_nsec, 0); | |
229 | uatomic_set(&URCU_TLS(health_state).current, 0); | |
230 | uatomic_set(&URCU_TLS(health_state).flags, 0); | |
231 | uatomic_set(&URCU_TLS(health_state).type, type); | |
232 | ||
233 | /* Add it to the global TLS state list. */ | |
234 | state_lock(); | |
927ca06a DG |
235 | cds_list_add(&URCU_TLS(health_state).node, &health_state_list.head); |
236 | state_unlock(); | |
237 | } | |
238 | ||
239 | /* | |
240 | * Remove node from global list. | |
241 | */ | |
242 | void health_unregister(void) | |
243 | { | |
244 | state_lock(); | |
245 | /* | |
246 | * On error, set the global_error_state since we are about to remove | |
247 | * the node from the global list. | |
248 | */ | |
249 | if (uatomic_read(&URCU_TLS(health_state).flags) & HEALTH_ERROR) { | |
250 | uatomic_set(&global_error_state[URCU_TLS(health_state).type], | |
251 | HEALTH_ERROR); | |
252 | } | |
253 | cds_list_del(&URCU_TLS(health_state).node); | |
254 | state_unlock(); | |
255 | } |