Fix: hash table cleanup call_rcu deadlock
authorMathieu Desnoyers <mathieu.desnoyers@efficios.com>
Fri, 14 Jun 2013 11:44:52 +0000 (07:44 -0400)
committerDavid Goulet <dgoulet@efficios.com>
Fri, 14 Jun 2013 13:09:38 +0000 (09:09 -0400)
Implement hash table cleanup thread so HT cleanup never deadlocks on
resize in progress.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: David Goulet <dgoulet@efficios.com>
14 files changed:
include/lttng/lttng.h
src/bin/lttng-sessiond/Makefile.am
src/bin/lttng-sessiond/buffer-registry.c
src/bin/lttng-sessiond/consumer.c
src/bin/lttng-sessiond/health.h
src/bin/lttng-sessiond/ht-cleanup.c [new file with mode: 0644]
src/bin/lttng-sessiond/lttng-sessiond.h
src/bin/lttng-sessiond/main.c
src/bin/lttng-sessiond/trace-ust.c
src/bin/lttng-sessiond/ust-app.c
src/bin/lttng-sessiond/ust-registry.c
src/bin/lttng-sessiond/utils.c
src/bin/lttng-sessiond/utils.h
tests/unit/Makefile.am

index 08a3be467958d2c9cb9a9d7c917dc4f2862fd376..bcccdb47733722420a6997c506df00d06913c60e 100644 (file)
@@ -139,6 +139,7 @@ enum lttng_health_component {
        LTTNG_HEALTH_APP_REG,
        LTTNG_HEALTH_KERNEL,
        LTTNG_HEALTH_CONSUMER,
+       LTTNG_HEALTH_HT_CLEANUP,
        LTTNG_HEALTH_ALL,
 };
 
index 244bc70ebfd0a2a5d61a3777fe75afed2f968fcc..ae8b6eb1e476ef8310a75ee473fa7cbba3afd502 100644 (file)
@@ -25,7 +25,7 @@ lttng_sessiond_SOURCES = utils.c utils.h \
                        health.c health.h \
                        cmd.c cmd.h \
                        buffer-registry.c buffer-registry.h \
-                       testpoint.h
+                       testpoint.h ht-cleanup.c
 
 if HAVE_LIBLTTNG_UST_CTL
 lttng_sessiond_SOURCES += trace-ust.c ust-registry.c ust-app.c \
index 6a35284fe70eab9435052a86874537d2936b40f7..ad103d1dec36e2afcf7cfd1f9861feda044aa198 100644 (file)
@@ -25,6 +25,7 @@
 #include "fd-limit.h"
 #include "ust-consumer.h"
 #include "ust-ctl.h"
+#include "utils.h"
 
 /*
  * Set in main.c during initialization process of the daemon. This contains
@@ -548,7 +549,7 @@ static void buffer_reg_session_destroy(struct buffer_reg_session *regp,
        }
        rcu_read_unlock();
 
-       lttng_ht_destroy(regp->channels);
+       ht_cleanup_push(regp->channels);
 
        switch (domain) {
        case LTTNG_DOMAIN_UST:
@@ -692,6 +693,6 @@ void buffer_reg_pid_destroy(struct buffer_reg_pid *regp)
 void buffer_reg_destroy_registries(void)
 {
        DBG3("Buffer registry destroy all registry");
-       lttng_ht_destroy(buffer_registry_uid);
-       lttng_ht_destroy(buffer_registry_pid);
+       ht_cleanup_push(buffer_registry_uid);
+       ht_cleanup_push(buffer_registry_pid);
 }
index 245fda5628d42c707ee4097e5757e143ffbfb32a..2e20878856fd1ee1ab1637b6ff297044bda58d77 100644 (file)
@@ -32,6 +32,7 @@
 #include "consumer.h"
 #include "health.h"
 #include "ust-app.h"
+#include "utils.h"
 
 /*
  * Receive a reply command status message from the consumer. Consumer socket
@@ -428,7 +429,7 @@ void consumer_destroy_output(struct consumer_output *obj)
                rcu_read_unlock();
 
                /* Finally destroy HT */
-               lttng_ht_destroy(obj->socks);
+               ht_cleanup_push(obj->socks);
        }
 
        free(obj);
index 28d7dc64959387703dcb10e001b945cb1f6e5995..0b5fb4644daf19c0c7dbc31c9c99122d77ea1c32 100644 (file)
@@ -44,6 +44,7 @@ enum health_type {
        HEALTH_TYPE_APP_REG             = 2,
        HEALTH_TYPE_KERNEL              = 3,
        HEALTH_TYPE_CONSUMER    = 4,
+       HEALTH_TYPE_HT_CLEANUP          = 5,
 
        HEALTH_NUM_TYPE,
 };
diff --git a/src/bin/lttng-sessiond/ht-cleanup.c b/src/bin/lttng-sessiond/ht-cleanup.c
new file mode 100644 (file)
index 0000000..7a33840
--- /dev/null
@@ -0,0 +1,139 @@
+/*
+ * Copyright (C) 2013 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#define _GNU_SOURCE
+#include <assert.h>
+
+#include <common/hashtable/hashtable.h>
+#include <common/common.h>
+#include <common/utils.h>
+
+#include "lttng-sessiond.h"
+#include "health.h"
+
+void *thread_ht_cleanup(void *data)
+{
+       int ret, i, pollfd, err = -1;
+       uint32_t revents, nb_fd;
+       struct lttng_poll_event events;
+
+       DBG("[ht-thread] startup.");
+
+       rcu_register_thread();
+       rcu_thread_online();
+
+       health_register(HEALTH_TYPE_HT_CLEANUP);
+
+       health_code_update();
+
+       ret = sessiond_set_thread_pollset(&events, 2);
+       if (ret < 0) {
+               goto error_poll_create;
+       }
+
+       /* Add pipe to the pollset. */
+       ret = lttng_poll_add(&events, ht_cleanup_pipe[0], LPOLLIN | LPOLLERR);
+       if (ret < 0) {
+               goto error;
+       }
+
+       health_code_update();
+
+       while (1) {
+               DBG3("[ht-thread] Polling on %d fds.",
+                       LTTNG_POLL_GETNB(&events));
+
+               /* Inifinite blocking call, waiting for transmission */
+restart:
+               health_poll_entry();
+               ret = lttng_poll_wait(&events, -1);
+               health_poll_exit();
+               if (ret < 0) {
+                       /*
+                        * Restart interrupted system call.
+                        */
+                       if (errno == EINTR) {
+                               goto restart;
+                       }
+                       goto error;
+               }
+
+               nb_fd = ret;
+
+               for (i = 0; i < nb_fd; i++) {
+                       struct lttng_ht *ht;
+
+                       health_code_update();
+
+                       /* Fetch once the poll data */
+                       revents = LTTNG_POLL_GETEV(&events, i);
+                       pollfd = LTTNG_POLL_GETFD(&events, i);
+
+                       /* Thread quit pipe has been closed. Killing thread. */
+                       ret = sessiond_check_thread_quit_pipe(pollfd, revents);
+                       if (ret) {
+                               err = 0;
+                               goto exit;
+                       }
+                       assert(pollfd == ht_cleanup_pipe[0]);
+
+                       if (revents & (LPOLLERR | LPOLLHUP | LPOLLRDHUP)) {
+                               ERR("ht cleanup pipe error");
+                               goto error;
+                       } else if (!(revents & LPOLLIN)) {
+                               /* No POLLIN and not a catched error, stop the thread. */
+                               ERR("ht cleanup failed. revent: %u", revents);
+                               goto error;
+                       }
+
+                       do {
+                               /* Get socket from dispatch thread. */
+                               ret = read(ht_cleanup_pipe[0], &ht, sizeof(ht));
+                       } while (ret < 0 && errno == EINTR);
+                       if (ret < 0 || ret < sizeof(ht)) {
+                               PERROR("ht cleanup notify pipe");
+                               goto error;
+                       }
+                       health_code_update();
+                       /*
+                        * The whole point of this thread is to call
+                        * lttng_ht_destroy from a context that is NOT:
+                        * 1) a read-side RCU lock,
+                        * 2) a call_rcu thread.
+                        */
+                       lttng_ht_destroy(ht);
+
+                       health_code_update();
+               }
+       }
+
+exit:
+error:
+       lttng_poll_clean(&events);
+error_poll_create:
+       utils_close_pipe(ht_cleanup_pipe);
+       ht_cleanup_pipe[0] = ht_cleanup_pipe[1] = -1;
+       DBG("[ust-thread] cleanup complete.");
+       if (err) {
+               health_error();
+               ERR("Health error occurred in %s", __func__);
+       }
+       health_unregister();
+       rcu_thread_offline();
+       rcu_unregister_thread();
+       return NULL;
+}
index a0916c5663cdb4394f581f8c2abcaaec54fa8c45..6090e086c4f257311c57f068aa75ef71c585c393 100644 (file)
@@ -71,6 +71,15 @@ struct ust_cmd_queue {
  */
 extern int apps_cmd_notify_pipe[2];
 
+/*
+ * Used to notify that a hash table needs to be destroyed by dedicated
+ * thread. Required by design because we don't want to move destroy
+ * paths outside of large RCU read-side lock paths, and destroy cannot
+ * be called by call_rcu thread, because it may hang (waiting for
+ * call_rcu completion).
+ */
+extern int ht_cleanup_pipe[2];
+
 /*
  * Populated when the daemon starts with the current page size of the system.
  */
@@ -79,4 +88,6 @@ extern long page_size;
 int sessiond_set_thread_pollset(struct lttng_poll_event *events, size_t size);
 int sessiond_check_thread_quit_pipe(int fd, uint32_t events);
 
+void *thread_ht_cleanup(void *data);
+
 #endif /* _LTT_SESSIOND_H */
index 4470afc5e8dd1cf51428f9b5ce22d5ce2e5c217f..4a2bc4fdd3ebd363c6a4fb35bd4a2de3039e8cb8 100644 (file)
@@ -161,6 +161,7 @@ static pthread_t client_thread;
 static pthread_t kernel_thread;
 static pthread_t dispatch_thread;
 static pthread_t health_thread;
+static pthread_t ht_cleanup_thread;
 
 /*
  * UST registration command queue. This queue is tied with a futex and uses a N
@@ -3225,13 +3226,17 @@ restart:
                case LTTNG_HEALTH_CONSUMER:
                        reply.ret_code = check_consumer_health();
                        break;
+               case LTTNG_HEALTH_HT_CLEANUP:
+                       reply.ret_code = health_check_state(HEALTH_TYPE_HT_CLEANUP);
+                       break;
                case LTTNG_HEALTH_ALL:
                        reply.ret_code =
                                health_check_state(HEALTH_TYPE_APP_MANAGE) &&
                                health_check_state(HEALTH_TYPE_APP_REG) &&
                                health_check_state(HEALTH_TYPE_CMD) &&
                                health_check_state(HEALTH_TYPE_KERNEL) &&
-                               check_consumer_health();
+                               check_consumer_health() &&
+                               health_check_state(HEALTH_TYPE_HT_CLEANUP);
                        break;
                default:
                        reply.ret_code = LTTNG_ERR_UND;
@@ -4293,6 +4298,11 @@ int main(int argc, char **argv)
                }
        }
 
+       /* Setup the thread ht_cleanup communication pipe. */
+       if (utils_create_pipe_cloexec(ht_cleanup_pipe) < 0) {
+               goto exit;
+       }
+
        /* Setup the thread apps communication pipe. */
        if ((ret = utils_create_pipe_cloexec(apps_cmd_pipe)) < 0) {
                goto exit;
@@ -4331,6 +4341,14 @@ int main(int argc, char **argv)
 
        write_pidfile();
 
+       /* Create thread to manage the client socket */
+       ret = pthread_create(&ht_cleanup_thread, NULL,
+                       thread_ht_cleanup, (void *) NULL);
+       if (ret != 0) {
+               PERROR("pthread_create ht_cleanup");
+               goto exit_ht_cleanup;
+       }
+
        /* Create thread to manage the client socket */
        ret = pthread_create(&health_thread, NULL,
                        thread_manage_health, (void *) NULL);
@@ -4450,6 +4468,12 @@ exit_client:
        }
 
 exit_health:
+       ret = pthread_join(ht_cleanup_thread, &status);
+       if (ret != 0) {
+               PERROR("pthread_join ht cleanup thread");
+               goto error;     /* join error, exit without cleanup */
+       }
+exit_ht_cleanup:
 exit:
        /*
         * cleanup() is called when no other thread is running.
index 8fb685c8d2ab62d4123997fef2be46510e448ebf..51632bad4ed602bcbc52fc769cc346bb709c74b3 100644 (file)
@@ -26,6 +26,7 @@
 
 #include "buffer-registry.h"
 #include "trace-ust.h"
+#include "utils.h"
 
 /*
  * Match function for the events hash table lookup.
@@ -228,7 +229,7 @@ struct ltt_ust_session *trace_ust_create_session(unsigned int session_id)
        return lus;
 
 error_consumer:
-       lttng_ht_destroy(lus->domain_global.channels);
+       ht_cleanup_push(lus->domain_global.channels);
        free(lus);
 error:
        return NULL;
@@ -483,7 +484,7 @@ static void destroy_contexts(struct lttng_ht *ht)
        }
        rcu_read_unlock();
 
-       lttng_ht_destroy(ht);
+       ht_cleanup_push(ht);
 }
 
 /*
@@ -530,7 +531,7 @@ static void destroy_events(struct lttng_ht *events)
        }
        rcu_read_unlock();
 
-       lttng_ht_destroy(events);
+       ht_cleanup_push(events);
 }
 
 /*
@@ -604,7 +605,7 @@ static void destroy_channels(struct lttng_ht *channels)
        }
        rcu_read_unlock();
 
-       lttng_ht_destroy(channels);
+       ht_cleanup_push(channels);
 }
 
 /*
index ee3b3406da781b1d488d17af5218933d2eefd265..37f6442156ff03a8724ef62137e66b1548544f30 100644 (file)
@@ -38,6 +38,7 @@
 #include "ust-app.h"
 #include "ust-consumer.h"
 #include "ust-ctl.h"
+#include "utils.h"
 
 /* Next available channel key. */
 static unsigned long next_channel_key;
@@ -307,9 +308,9 @@ void delete_ust_app_stream(int sock, struct ust_app_stream *stream)
 
 /*
  * We need to execute ht_destroy outside of RCU read-side critical
- * section, so we postpone its execution using call_rcu. It is simpler
- * than to change the semantic of the many callers of
- * delete_ust_app_channel().
+ * section and outside of call_rcu thread, so we postpone its execution
+ * using ht_cleanup_push. It is simpler than to change the semantic of
+ * the many callers of delete_ust_app_session().
  */
 static
 void delete_ust_app_channel_rcu(struct rcu_head *head)
@@ -317,8 +318,8 @@ void delete_ust_app_channel_rcu(struct rcu_head *head)
        struct ust_app_channel *ua_chan =
                caa_container_of(head, struct ust_app_channel, rcu_head);
 
-       lttng_ht_destroy(ua_chan->ctx);
-       lttng_ht_destroy(ua_chan->events);
+       ht_cleanup_push(ua_chan->ctx);
+       ht_cleanup_push(ua_chan->events);
        free(ua_chan);
 }
 
@@ -583,9 +584,9 @@ end:
 
 /*
  * We need to execute ht_destroy outside of RCU read-side critical
- * section, so we postpone its execution using call_rcu. It is simpler
- * than to change the semantic of the many callers of
- * delete_ust_app_session().
+ * section and outside of call_rcu thread, so we postpone its execution
+ * using ht_cleanup_push. It is simpler than to change the semantic of
+ * the many callers of delete_ust_app_session().
  */
 static
 void delete_ust_app_session_rcu(struct rcu_head *head)
@@ -593,7 +594,7 @@ void delete_ust_app_session_rcu(struct rcu_head *head)
        struct ust_app_session *ua_sess =
                caa_container_of(head, struct ust_app_session, rcu_head);
 
-       lttng_ht_destroy(ua_sess->channels);
+       ht_cleanup_push(ua_sess->channels);
        free(ua_sess);
 }
 
@@ -685,8 +686,8 @@ void delete_ust_app(struct ust_app *app)
                rcu_read_unlock();
        }
 
-       lttng_ht_destroy(app->sessions);
-       lttng_ht_destroy(app->ust_objd);
+       ht_cleanup_push(app->sessions);
+       ht_cleanup_push(app->ust_objd);
 
        /*
         * Wait until we have deleted the application from the sock hash table
@@ -3155,9 +3156,9 @@ void ust_app_clean_list(void)
        rcu_read_unlock();
 
        /* Destroy is done only when the ht is empty */
-       lttng_ht_destroy(ust_app_ht);
-       lttng_ht_destroy(ust_app_ht_by_sock);
-       lttng_ht_destroy(ust_app_ht_by_notify_sock);
+       ht_cleanup_push(ust_app_ht);
+       ht_cleanup_push(ust_app_ht_by_sock);
+       ht_cleanup_push(ust_app_ht_by_notify_sock);
 }
 
 /*
index 399d48f681d8e0aa22bd269109e580f0e2206d77..6e9f0699b334bbeebb00957b0e9a9b0bd486ea1b 100644 (file)
@@ -23,6 +23,7 @@
 #include <lttng/lttng.h>
 
 #include "ust-registry.h"
+#include "utils.h"
 
 /*
  * Hash table match function for event in the registry.
@@ -310,9 +311,9 @@ void ust_registry_destroy_event(struct ust_registry_channel *chan,
 
 /*
  * We need to execute ht_destroy outside of RCU read-side critical
- * section, so we postpone its execution using call_rcu. It is simpler
- * than to change the semantic of the many callers of
- * destroy_channel().
+ * section and outside of call_rcu thread, so we postpone its execution
+ * using ht_cleanup_push. It is simpler than to change the semantic of
+ * the many callers of delete_ust_app_session().
  */
 static
 void destroy_channel_rcu(struct rcu_head *head)
@@ -321,7 +322,7 @@ void destroy_channel_rcu(struct rcu_head *head)
                caa_container_of(head, struct ust_registry_channel, rcu_head);
 
        if (chan->ht) {
-               lttng_ht_destroy(chan->ht);
+               ht_cleanup_push(chan->ht);
        }
        free(chan);
 }
@@ -552,7 +553,7 @@ void ust_registry_session_destroy(struct ust_registry_session *reg)
        rcu_read_unlock();
 
        if (reg->channels) {
-               lttng_ht_destroy(reg->channels);
+               ht_cleanup_push(reg->channels);
        }
        free(reg->metadata);
 }
index 5ea337450ecf63e8fc1f19b6e7e336804e869f68..358e407e9c326cf038024b570f4c9209a724e2ca 100644 (file)
@@ -23,6 +23,9 @@
 #include <common/error.h>
 
 #include "utils.h"
+#include "lttng-sessiond.h"
+
+int ht_cleanup_pipe[2] = { -1, -1 };
 
 /*
  * Write to writable pipe used to notify a thread.
@@ -55,3 +58,27 @@ const char *get_home_dir(void)
 {
        return ((const char *) getenv("HOME"));
 }
+
+void ht_cleanup_push(struct lttng_ht *ht)
+{
+       int ret;
+       int fd = ht_cleanup_pipe[1];
+
+       if (fd < 0)
+               return;
+       do {
+               ret = write(fd, &ht, sizeof(ht));
+       } while (ret < 0 && errno == EINTR);
+       if (ret < 0 || ret != sizeof(ht)) {
+               PERROR("write ht cleanup pipe %d", fd);
+               if (ret < 0) {
+                       ret = -errno;
+               }
+               goto error;
+       }
+
+       /* All good. Don't send back the write positive ret value. */
+       ret = 0;
+error:
+       assert(!ret);
+}
index d4bd8c286f9df4b0f577029677d2a92d150a5f13..7c06d5b03233e03320c6ab185fac0b495c44e579 100644 (file)
 #ifndef _LTT_UTILS_H
 #define _LTT_UTILS_H
 
+struct lttng_ht;
+
 const char *get_home_dir(void);
 int notify_thread_pipe(int wpipe);
+void ht_cleanup_push(struct lttng_ht *ht);
 
 #endif /* _LTT_UTILS_H */
index e8a6429d8f8446887df9ca62642e600a6593b396..247178177c180a0c8dec7dff20356f0ce5006e99 100644 (file)
@@ -28,6 +28,7 @@ test_uri_LDADD = $(LIBTAP) $(LIBCOMMON) $(LIBHASHTABLE)
 # Session unit test
 SESSIONS=$(top_srcdir)/src/bin/lttng-sessiond/session.o        \
         $(top_srcdir)/src/bin/lttng-sessiond/consumer.o \
+        $(top_srcdir)/src/bin/lttng-sessiond/utils.o \
         $(top_srcdir)/src/bin/lttng-sessiond/health.o \
         $(top_srcdir)/src/common/uri.o \
         $(top_srcdir)/src/common/utils.o \
@@ -42,6 +43,7 @@ test_session_LDADD += $(SESSIONS)
 if HAVE_LIBLTTNG_UST_CTL
 UST_DATA_TRACE=$(top_srcdir)/src/bin/lttng-sessiond/trace-ust.o \
               $(top_srcdir)/src/bin/lttng-sessiond/consumer.o \
+              $(top_srcdir)/src/bin/lttng-sessiond/utils.o \
                   $(top_srcdir)/src/bin/lttng-sessiond/buffer-registry.o \
                   $(top_srcdir)/src/bin/lttng-sessiond/ust-registry.o \
                   $(top_srcdir)/src/bin/lttng-sessiond/ust-metadata.o \
@@ -63,6 +65,7 @@ endif
 KERN_DATA_TRACE=$(top_srcdir)/src/bin/lttng-sessiond/trace-kernel.o    \
                $(top_srcdir)/src/bin/lttng-sessiond/consumer.o \
                $(top_srcdir)/src/bin/lttng-sessiond/health.o \
+               $(top_srcdir)/src/bin/lttng-sessiond/utils.o \
                $(top_srcdir)/src/common/uri.o \
                $(top_srcdir)/src/common/utils.o
 
This page took 0.035928 seconds and 4 git commands to generate.