lttng-relayd: use TCP keep-alive mechanism to detect dead-peer
[lttng-tools.git] / src / bin / lttng-relayd / tcp_keep_alive.c
CommitLineData
f056029c
JR
1/*
2 * Copyright (C) 2017 - Jonathan Rajotte <jonathan.rajotte-julien@efficios.com>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License, version 2 only,
6 * as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along
14 * with this program; if not, write to the Free Software Foundation, Inc.,
15 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
16 */
17
18#include <sys/types.h>
19#include <netinet/tcp.h>
20#include <stdbool.h>
21#include <sys/socket.h>
22#include <limits.h>
23
24#include <common/compat/getenv.h>
25#include <common/time.h>
26#include <common/defaults.h>
27#include <common/config/session-config.h>
28
29#include "tcp_keep_alive.h"
30
/* Bounds, in seconds, enforced on Solaris for the values read from the environment. */
#define SOLARIS_IDLE_TIME_MIN_S 10
#define SOLARIS_IDLE_TIME_MAX_S 864000 /* 10 days */
#define SOLARIS_ABORT_THRESHOLD_MIN_S 1
#define SOLARIS_ABORT_THRESHOLD_MAX_S 480 /* 8 minutes */

/*
 * Per-platform definitions of TCP socket options.
 *
 * COMPAT_SOCKET_LEVEL is the level at which SO_KEEPALIVE is set and
 * COMPAT_TCP_LEVEL the level at which the tuning options are set. Options
 * that a platform does not provide are defined to 0.
 */
#if defined (__linux__)

/*
 * SO_KEEPALIVE is a socket-level option (see socket(7)): it must be set at
 * the SOL_SOCKET level. At the SOL_TCP level the same option number would be
 * interpreted as a TCP option and keep-alive would not be enabled.
 */
#define COMPAT_SOCKET_LEVEL SOL_SOCKET
#define COMPAT_TCP_LEVEL SOL_TCP
#define COMPAT_TCP_ABORT_THRESHOLD 0 /* Does not exist on linux. */
#define COMPAT_TCP_KEEPIDLE TCP_KEEPIDLE
#define COMPAT_TCP_KEEPINTVL TCP_KEEPINTVL
#define COMPAT_TCP_KEEPCNT TCP_KEEPCNT

#elif defined (__sun__) /* ! defined (__linux__) */

#define COMPAT_SOCKET_LEVEL SOL_SOCKET
#define COMPAT_TCP_LEVEL IPPROTO_TCP

#ifdef TCP_KEEPALIVE_THRESHOLD
#define COMPAT_TCP_KEEPIDLE TCP_KEEPALIVE_THRESHOLD
#else /* ! defined (TCP_KEEPALIVE_THRESHOLD) */
#define COMPAT_TCP_KEEPIDLE 0
#endif /* TCP_KEEPALIVE_THRESHOLD */

#ifdef TCP_KEEPALIVE_ABORT_THRESHOLD
#define COMPAT_TCP_ABORT_THRESHOLD TCP_KEEPALIVE_ABORT_THRESHOLD
#else /* ! defined (TCP_KEEPALIVE_ABORT_THRESHOLD) */
#define COMPAT_TCP_ABORT_THRESHOLD 0
#endif /* TCP_KEEPALIVE_ABORT_THRESHOLD */

#define COMPAT_TCP_KEEPINTVL 0 /* Does not exist on Solaris. */
#define COMPAT_TCP_KEEPCNT 0 /* Does not exist on Solaris. */

#else /* ! defined (__linux__) && ! defined (__sun__) */

#define COMPAT_SOCKET_LEVEL 0
#define COMPAT_TCP_LEVEL 0
#define COMPAT_TCP_ABORT_THRESHOLD 0
#define COMPAT_TCP_KEEPIDLE 0
#define COMPAT_TCP_KEEPINTVL 0
#define COMPAT_TCP_KEEPCNT 0

#endif /* ! defined (__linux__) && ! defined (__sun__) */
76
/*
 * Describes which parts of the TCP keep-alive mechanism can be used on the
 * current platform. One flag per tunable, plus a global one.
 */
struct tcp_keep_alive_support {
	/* The platform supports the TCP keep-alive mechanism. */
	bool supported;
	/* The idle time can be overridden for a given socket. */
	bool idle_time_supported;
	/* The interval between probes can be overridden for a given socket. */
	bool probe_interval_supported;
	/* The maximum number of probes can be configured for a given socket. */
	bool max_probe_count_supported;
	/* The abort threshold can be overridden for a given socket. */
	bool abort_threshold_supported;
};
95
/*
 * TCP keep-alive settings. Each field is filled from one environment
 * variable, named in the comment above it.
 */
struct tcp_keep_alive_config {
	/* From LTTNG_RELAYD_TCP_KEEP_ALIVE_ENV. */
	bool enabled;
	/* From LTTNG_RELAYD_TCP_KEEP_ALIVE_IDLE_TIME_ENV. */
	int idle_time;
	/* From LTTNG_RELAYD_TCP_KEEP_ALIVE_PROBE_INTERVAL_ENV. */
	int probe_interval;
	/* From LTTNG_RELAYD_TCP_KEEP_ALIVE_MAX_PROBE_COUNT_ENV. */
	int max_probe_count;
	/* From LTTNG_RELAYD_TCP_KEEP_ALIVE_ABORT_THRESHOLD_ENV. */
	int abort_threshold;
};
120
121static struct tcp_keep_alive_config config = {
122 .enabled = false,
123 .idle_time = -1,
124 .probe_interval = -1,
125 .max_probe_count = -1,
126 .abort_threshold = -1
127};
128
129static struct tcp_keep_alive_support support = {
130 .supported = false,
131 .idle_time_supported = false,
132 .probe_interval_supported = false,
133 .max_probe_count_supported = false,
134 .abort_threshold_supported = false
135};
136
/*
 * Common parser for string to int conversion where the value must be in the
 * range [-1, INT_MAX].
 *
 * env_var: name of the environment variable, only used in error messages.
 * value: string to convert. The base is detected by strtol() (base 0).
 *
 * Returns the converted value, or -2 when the string cannot be parsed, is
 * not entirely a number, or is out of range.
 */
static
int get_env_int(const char *env_var,
		const char *value)
{
	int ret;
	long tmp;
	char *endptr = NULL;

	/* strtol() only reports range errors through errno: reset it first. */
	errno = 0;
	tmp = strtol(value, &endptr, 0);
	if (errno != 0) {
		ERR("%s cannot be parsed.", env_var);
		PERROR("errno for previous parsing failure");
		ret = -2;
		goto end;
	}

	/* Reject empty strings and trailing garbage. */
	if (endptr == value || *endptr != '\0') {
		ERR("%s is not a valid number", env_var);
		/*
		 * -1 is a valid result of this function, so it cannot be
		 * used to report a failure: callers only treat values
		 * below -1 as errors.
		 */
		ret = -2;
		goto end;
	}

	if (tmp < -1) {
		ERR("%s must be greater or equal to -1", env_var);
		ret = -2;
		goto end;
	}
	if (tmp > INT_MAX){
		ERR("%s is too big. Maximum value is %d", env_var, INT_MAX);
		ret = -2;
		goto end;
	}

	ret = (int) tmp;
end:
	return ret;
}
181
/*
 * Per-platform conversion of the idle time, expressed in seconds, to the
 * value stored in the configuration.
 * Returns -2 on invalid value.
 */
#ifdef __sun__

static
int convert_idle_time(int value)
{
	unsigned int idle_ms;

	/* -1 and 0 both select the system defaults: hand them back as-is. */
	if (value == -1 || value == 0) {
		return value;
	}

	if (value < 0) {
		ERR("Invalid tcp keep-alive idle time (%i)", value);
		return -2;
	}

	/*
	 * Additional constraints for Solaris 11.
	 * Minimum 10s, maximum 10 days. Defined by
	 * https://docs.oracle.com/cd/E23824_01/html/821-1475/tcp-7p.html#REFMAN7tcp-7p
	 */
	if (!(value >= SOLARIS_IDLE_TIME_MIN_S &&
			value <= SOLARIS_IDLE_TIME_MAX_S)) {
		ERR("%s must be comprised between %d and %d inclusively on Solaris",
				DEFAULT_LTTNG_RELAYD_TCP_KEEP_ALIVE_IDLE_TIME_ENV,
				SOLARIS_IDLE_TIME_MIN_S,
				SOLARIS_IDLE_TIME_MAX_S);
		return -2;
	}

	/* Solaris expects the idle time in milliseconds. */
	idle_ms = ((unsigned int) value) * MSEC_PER_SEC;
	if ((value != 0 && (idle_ms / ((unsigned int) value)) != MSEC_PER_SEC)
			|| idle_ms > INT_MAX) {
		/* The multiplication overflowed. */
		const int max_value = INT_MAX / MSEC_PER_SEC;

		ERR("%s is too big: maximum supported value is %d",
				DEFAULT_LTTNG_RELAYD_TCP_KEEP_ALIVE_IDLE_TIME_ENV,
				max_value);
		return -2;
	}

	/* idle_ms is within [0, INT_MAX] at this point: the cast is safe. */
	return (int) idle_ms;
}

#else /* ! defined(__sun__) */

/* No conversion is performed on other platforms: the value is kept as-is. */
static
int convert_idle_time(int value)
{
	return value;
}

#endif /* ! defined(__sun__) */
250
251/* Per-platform support of tcp_keep_alive functionality. */
252#if defined (__linux__)
253
254static
255void tcp_keep_alive_init_support(struct tcp_keep_alive_support *support)
256{
257 support->supported = true;
258 support->idle_time_supported = true;
259 support->probe_interval_supported = true;
260 support->max_probe_count_supported = true;
261 /* Solaris specific */
262 support->abort_threshold_supported = false;
263}
264
265#elif defined(__sun__) /* ! defined (__linux__) */
266
267static
268void tcp_keep_alive_init_support(struct tcp_keep_alive_support *support)
269{
270 support->supported = true;
271#ifdef TCP_KEEPALIVE_THRESHOLD
272 support->idle_time_supported = true;
273#else
274 support->idle_time_supported = false;;
275#endif /* TCP_KEEPALIVE_THRESHOLD */
276
277 /*
278 * Solaris does not support either tcp_keepalive_probes or
279 * tcp_keepalive_intvl.
280 * Inferring a value for TCP_KEEP_ALIVE_ABORT_THRESHOLD using
281 * (tcp_keepalive_probes * tcp_keepalive_intvl) could yield a good
282 * alternative, but Solaris does not detail the algorithm used (such as
283 * constant time retry like Linux).
284 *
285 * Ignore those settings on Solaris 11. We prefer exposing an
286 * environment variable only used on Solaris for the abort threshold.
287 */
288 support->probe_interval_supported = false;
289 support->max_probe_count_supported = false;
290#ifdef TCP_KEEPALIVE_ABORT_THRESHOLD
291 support->abort_threshold_supported = true;
292#else
293 support->abort_threshold_supported = false;
294#endif /* TCP_KEEPALIVE_THRESHOLD */
295}
296
297#else /* ! defined(__sun__) && ! defined(__linux__) */
298
299/* Assume nothing is supported on other platforms. */
300static
301void tcp_keep_alive_init_support(struct tcp_keep_alive_support *support)
302{
303 support->supported = false;
304 support->idle_time_supported = false;
305 support->probe_interval_supported = false;
306 support->max_probe_count_supported = false;
307 support->abort_threshold_supported = false;
308}
309
310#endif /* ! defined(__sun__) && ! defined(__linux__) */
311
#ifdef __sun__

/*
 * Solaris specific conversion of the abort threshold, expressed in seconds,
 * to the value stored in the configuration.
 * Return -2 on error.
 */
static
int convert_abort_threshold(int value)
{
	unsigned int threshold_ms;

	/* -1 selects the system defaults: hand it back as-is. */
	if (value == -1) {
		return value;
	}

	if (value < 0) {
		ERR("Invalid tcp keep-alive abort threshold (%i)", value);
		return -2;
	}

	/*
	 * Additional constraints for Solaris 11.
	 *
	 * Between 0 and 8 minutes.
	 * https://docs.oracle.com/cd/E19120-01/open.solaris/819-2724/fsvdh/index.html
	 *
	 * Restrict from 1 second to 8 minutes since the 0 value goes against
	 * the purpose of dead peers detection by never timing out when probing.
	 * It does NOT mean that the connection times out immediately.
	 */
	if (!(value >= SOLARIS_ABORT_THRESHOLD_MIN_S &&
			value <= SOLARIS_ABORT_THRESHOLD_MAX_S)) {
		ERR("%s must be comprised between %d and %d inclusively on Solaris",
				DEFAULT_LTTNG_RELAYD_TCP_KEEP_ALIVE_ABORT_THRESHOLD_ENV,
				SOLARIS_ABORT_THRESHOLD_MIN_S,
				SOLARIS_ABORT_THRESHOLD_MAX_S);
		return -2;
	}

	/* Solaris expects the abort threshold in milliseconds. */
	threshold_ms = ((unsigned int) value) * MSEC_PER_SEC;
	if ((value != 0 && (threshold_ms / ((unsigned int) value)) != MSEC_PER_SEC)
			|| threshold_ms > INT_MAX) {
		/* The multiplication overflowed. */
		const int max_value = INT_MAX / MSEC_PER_SEC;

		ERR("%s is too big: maximum supported value is %d",
				DEFAULT_LTTNG_RELAYD_TCP_KEEP_ALIVE_ABORT_THRESHOLD_ENV,
				max_value);
		return -2;
	}

	/* threshold_ms is within [0, INT_MAX] at this point: the cast is safe. */
	return (int) threshold_ms;
}

#else

/* No conversion is performed on other platforms: the value is kept as-is. */
static
int convert_abort_threshold(int value)
{
	return value;
}

#endif /* defined (__sun__) */
384
385/*
386 * Retrieve settings from environment variables and warn for settings not
387 * supported by the platform.
388 */
389static
390int tcp_keep_alive_init_config(struct tcp_keep_alive_support *support,
391 struct tcp_keep_alive_config *config)
392{
393 int ret;
394 const char *value;
395
396 value = lttng_secure_getenv(DEFAULT_LTTNG_RELAYD_TCP_KEEP_ALIVE_ENV);
397 if (!support->supported) {
398 if (value) {
399 WARN("Using per-socket TCP keep-alive mechanism is not supported by this platform. Ignoring the %s environment variable.",
400 DEFAULT_LTTNG_RELAYD_TCP_KEEP_ALIVE_ENV);
401 }
402 config->enabled = false;
403 } else if (value) {
404 ret = config_parse_value(value);
405 if (ret < 0 || ret > 1) {
406 ERR("Invalid value for %s", DEFAULT_LTTNG_RELAYD_TCP_KEEP_ALIVE_ENV);
407 ret = 1;
408 goto error;
409 }
410 config->enabled = ret;
411 }
412 DBG("TCP keep-alive mechanism %s", config->enabled ? "enabled": "disabled");
413
414 /* Get value for tcp_keepalive_time in seconds. */
415 value = lttng_secure_getenv(DEFAULT_LTTNG_RELAYD_TCP_KEEP_ALIVE_IDLE_TIME_ENV);
416 if (!support->idle_time_supported && value) {
417 WARN("Overriding the TCP keep-alive idle time threshold per-socket is not supported by this platform. Ignoring the %s environment variable.",
418 DEFAULT_LTTNG_RELAYD_TCP_KEEP_ALIVE_IDLE_TIME_ENV);
419 config->idle_time = -1;
420 } else if (value) {
421 int idle_time_platform;
422 int idle_time_seconds;
423
424 idle_time_seconds = get_env_int(
425 DEFAULT_LTTNG_RELAYD_TCP_KEEP_ALIVE_IDLE_TIME_ENV,
426 value);
427 if (idle_time_seconds < -1) {
428 ret = 1;
429 goto error;
430 }
431
432 idle_time_platform = convert_idle_time(idle_time_seconds);
433 if (idle_time_platform < -1) {
434 ret = 1;
435 goto error;
436 }
437
438 config->idle_time = idle_time_platform;
439 DBG("Overriding %s to %d",
440 DEFAULT_LTTNG_RELAYD_TCP_KEEP_ALIVE_IDLE_TIME_ENV,
441 idle_time_seconds);
442 }
443
444 /* Get value for tcp_keepalive_intvl in seconds. */
445 value = lttng_secure_getenv(
446 DEFAULT_LTTNG_RELAYD_TCP_KEEP_ALIVE_PROBE_INTERVAL_ENV);
447 if (!support->probe_interval_supported && value) {
448 WARN("Overriding the TCP keep-alive probe interval time per-socket is not supported by this platform. Ignoring the %s environment variable.",
449 DEFAULT_LTTNG_RELAYD_TCP_KEEP_ALIVE_PROBE_INTERVAL_ENV);
450 config->probe_interval = -1;
451 } else if (value) {
452 int probe_interval;
453
454 probe_interval = get_env_int(DEFAULT_LTTNG_RELAYD_TCP_KEEP_ALIVE_PROBE_INTERVAL_ENV,
455 value);
456 if (probe_interval < -1) {
457 ret = 1;
458 goto error;
459 }
460
461 config->probe_interval = probe_interval;
462 DBG("Overriding %s to %d",
463 DEFAULT_LTTNG_RELAYD_TCP_KEEP_ALIVE_PROBE_INTERVAL_ENV,
464 config->probe_interval);
465 }
466
467 /* Get value for tcp_keepalive_probes. */
468 value = lttng_secure_getenv(DEFAULT_LTTNG_RELAYD_TCP_KEEP_ALIVE_MAX_PROBE_COUNT_ENV);
469 if (!support->max_probe_count_supported && value) {
470 WARN("Overriding the TCP keep-alive maximum probe count per-socket is not supported by this platform. Ignoring the %s environment variable.",
471 DEFAULT_LTTNG_RELAYD_TCP_KEEP_ALIVE_MAX_PROBE_COUNT_ENV);
472 config->max_probe_count = -1;
473 } else if (value) {
474 int max_probe_count;
475
476 max_probe_count = get_env_int(DEFAULT_LTTNG_RELAYD_TCP_KEEP_ALIVE_MAX_PROBE_COUNT_ENV,
477 value);
478 if (max_probe_count < -1) {
479 ret = 1;
480 goto error;
481 }
482
483 config->max_probe_count = max_probe_count;
484 DBG("Overriding %s to %d",
485 DEFAULT_LTTNG_RELAYD_TCP_KEEP_ALIVE_MAX_PROBE_COUNT_ENV,
486 config->max_probe_count);
487 }
488
489 /* Get value for tcp_keepalive_abort_interval. */
490 value = lttng_secure_getenv(
491 DEFAULT_LTTNG_RELAYD_TCP_KEEP_ALIVE_ABORT_THRESHOLD_ENV);
492 if (!support->abort_threshold_supported && value) {
493 WARN("Overriding the TCP keep-alive abort threshold per-socket is not supported by this platform. Ignoring the %s environment variable.",
494 DEFAULT_LTTNG_RELAYD_TCP_KEEP_ALIVE_ABORT_THRESHOLD_ENV);
495 config->abort_threshold = -1;
496 } else if (value) {
497 int abort_threshold_platform;
498 int abort_threshold_seconds;
499
500 abort_threshold_seconds = get_env_int(
501 DEFAULT_LTTNG_RELAYD_TCP_KEEP_ALIVE_MAX_PROBE_COUNT_ENV,
502 value);
503 if (abort_threshold_seconds < -1) {
504 ret = 1;
505 goto error;
506 }
507
508 abort_threshold_platform = convert_abort_threshold(
509 abort_threshold_seconds);
510 if (abort_threshold_platform < -1) {
511 ret = 1;
512 goto error;
513 }
514
515 config->abort_threshold = abort_threshold_platform;
516 DBG("Overriding %s to %d",
517 DEFAULT_LTTNG_RELAYD_TCP_KEEP_ALIVE_ABORT_THRESHOLD_ENV,
518 config->abort_threshold);
519 }
520
521 ret = 0;
522
523error:
524 return ret;
525}
526
527/* Initialize the TCP keep-alive configuration. */
528__attribute__((constructor)) static
529int tcp_keep_alive_init(void)
530{
531 tcp_keep_alive_init_support(&support);
532 return tcp_keep_alive_init_config(&support, &config);
533}
534
535/*
536 * Set the socket options regarding TCP keep-alive.
537 */
538LTTNG_HIDDEN
539int socket_apply_keep_alive_config(int socket_fd)
540{
541 int ret;
542 int val = 1;
543
544 /* TCP keep-alive */
545 if (!support.supported || !config.enabled ) {
546 ret = 0;
547 goto end;
548 }
549
550 ret = setsockopt(socket_fd, COMPAT_SOCKET_LEVEL, SO_KEEPALIVE, &val,
551 sizeof(val));
552 if (ret < 0) {
553 PERROR("setsockopt so_keepalive");
554 goto end;
555 }
556
557 /* TCP keep-alive idle time */
558 if (support.idle_time_supported && config.idle_time > 0) {
559 ret = setsockopt(socket_fd, COMPAT_TCP_LEVEL, COMPAT_TCP_KEEPIDLE, &config.idle_time,
560 sizeof(config.idle_time));
561 if (ret < 0) {
562 PERROR("setsockopt TCP_KEEPIDLE");
563 goto end;
564 }
565 }
566 /* TCP keep-alive probe interval */
567 if (support.probe_interval_supported && config.probe_interval > 0) {
568 ret = setsockopt(socket_fd, COMPAT_TCP_LEVEL, COMPAT_TCP_KEEPINTVL, &config.probe_interval,
569 sizeof(config.probe_interval));
570 if (ret < 0) {
571 PERROR("setsockopt TCP_KEEPINTVL");
572 goto end;
573 }
574 }
575
576 /* TCP keep-alive max probe count */
577 if (support.max_probe_count_supported && config.max_probe_count > 0) {
578 ret = setsockopt(socket_fd, COMPAT_TCP_LEVEL, COMPAT_TCP_KEEPCNT, &config.max_probe_count,
579 sizeof(config.max_probe_count));
580 if (ret < 0) {
581 PERROR("setsockopt TCP_KEEPCNT");
582 goto end;
583 }
584 }
585
586 /* TCP keep-alive abort threshold */
587 if (support.abort_threshold_supported && config.abort_threshold > 0) {
588 ret = setsockopt(socket_fd, COMPAT_TCP_LEVEL, COMPAT_TCP_ABORT_THRESHOLD, &config.abort_threshold,
589 sizeof(config.max_probe_count));
590 if (ret < 0) {
591 PERROR("setsockopt TCP_KEEPALIVE_ABORT_THRESHOLD");
592 goto end;
593 }
594 }
595end:
596 return ret;
597}
This page took 0.043364 seconds and 4 git commands to generate.