Fix deadlock: don't take channel lock in timer
[lttng-tools.git] / src / common / consumer-timer.c
index ef056d1188d69571f6131c89c34cf3d8f622d2a9..37c9861b906a0a81c6b08a302c73e486e6c6d3dd 100644 (file)
 #include "consumer-timer.h"
 #include "ust-consumer/ust-consumer.h"
 
-static struct timer_signal_data timer_signal;
+static struct timer_signal_data timer_signal = {
+       .tid = 0,
+       .setup_done = 0,
+       .qs_done = 0,
+       .lock = PTHREAD_MUTEX_INITIALIZER,
+};
 
 /*
  * Set custom signal mask to current thread.
@@ -51,6 +56,10 @@ static void setmask(sigset_t *mask)
 
 /*
  * Execute action on a timer switch.
+ *
+ * Beware: metadata_switch_timer() should *never* take a mutex also held
+ * while consumer_timer_switch_stop() is called. It would result in
+ * deadlocks.
  */
 static void metadata_switch_timer(struct lttng_consumer_local_data *ctx,
                int sig, siginfo_t *si, void *uc)
@@ -61,17 +70,31 @@ static void metadata_switch_timer(struct lttng_consumer_local_data *ctx,
        channel = si->si_value.sival_ptr;
        assert(channel);
 
+       if (channel->switch_timer_error) {
+               return;
+       }
+
        DBG("Switch timer for channel %" PRIu64, channel->key);
        switch (ctx->type) {
        case LTTNG_CONSUMER32_UST:
        case LTTNG_CONSUMER64_UST:
-               ret = lttng_ustconsumer_request_metadata(ctx, channel);
+               /*
+                * Locks taken by lttng_ustconsumer_request_metadata():
+                * - metadata_socket_lock
+                *   - Calling lttng_ustconsumer_recv_metadata():
+                *     - channel->metadata_cache->lock
+                *     - Calling consumer_metadata_cache_flushed():
+                *       - channel->timer_lock
+                *         - channel->metadata_cache->lock
+                *
+                * Ensure that neither consumer_data.lock nor
+                * channel->lock are taken within this function, since
+                * they are held while consumer_timer_switch_stop() is
+                * called.
+                */
+               ret = lttng_ustconsumer_request_metadata(ctx, channel, 1);
                if (ret < 0) {
-                       /*
-                        * An error means that we were unable to request the metadata to
-                        * the session daemon so stop the timer for that channel.
-                        */
-                       consumer_timer_switch_stop(channel);
+                       channel->switch_timer_error = 1;
                }
                break;
        case LTTNG_CONSUMER_KERNEL:
@@ -81,6 +104,57 @@ static void metadata_switch_timer(struct lttng_consumer_local_data *ctx,
        }
 }
 
+static
+void consumer_timer_signal_thread_qs(unsigned int signr)
+{
+       sigset_t pending_set;
+       int ret;
+
+       /*
+        * We need to be the only thread interacting with the thread
+        * that manages signals for teardown synchronization.
+        */
+       pthread_mutex_lock(&timer_signal.lock);
+
+       /* Ensure we don't have any signal queued for this channel. */
+       for (;;) {
+               ret = sigemptyset(&pending_set);
+               if (ret == -1) {
+                       PERROR("sigemptyset");
+               }
+               ret = sigpending(&pending_set);
+               if (ret == -1) {
+                       PERROR("sigpending");
+               }
+               if (!sigismember(&pending_set, LTTNG_CONSUMER_SIG_SWITCH)) {
+                       break;
+               }
+               caa_cpu_relax();
+       }
+
+       /*
+        * From this point, no new signal handler will be fired that would try to
+        * access "chan". However, we still need to wait for any currently
+        * executing handler to complete.
+        */
+       cmm_smp_mb();
+       CMM_STORE_SHARED(timer_signal.qs_done, 0);
+       cmm_smp_mb();
+
+       /*
+        * Kill with LTTNG_CONSUMER_SIG_TEARDOWN, so signal management thread wakes
+        * up.
+        */
+       kill(getpid(), LTTNG_CONSUMER_SIG_TEARDOWN);
+
+       while (!CMM_LOAD_SHARED(timer_signal.qs_done)) {
+               caa_cpu_relax();
+       }
+       cmm_smp_mb();
+
+       pthread_mutex_unlock(&timer_signal.lock);
+}
+
 /*
  * Set the timer for periodical metadata flush.
  */
@@ -124,7 +198,6 @@ void consumer_timer_switch_start(struct lttng_consumer_channel *channel,
 void consumer_timer_switch_stop(struct lttng_consumer_channel *channel)
 {
        int ret;
-       sigset_t pending_set;
 
        assert(channel);
 
@@ -133,41 +206,10 @@ void consumer_timer_switch_stop(struct lttng_consumer_channel *channel)
                PERROR("timer_delete");
        }
 
-       /* Ensure we don't have any signal queued for this channel. */
-       for (;;) {
-               ret = sigemptyset(&pending_set);
-               if (ret == -1) {
-                       PERROR("sigemptyset");
-               }
-               ret = sigpending(&pending_set);
-               if (ret == -1) {
-                       PERROR("sigpending");
-               }
-               if (!sigismember(&pending_set, LTTNG_CONSUMER_SIG_SWITCH)) {
-                       break;
-               }
-               caa_cpu_relax();
-       }
-
-       /*
-        * From this point, no new signal handler will be fired that would try to
-        * access "chan". However, we still need to wait for any currently
-        * executing handler to complete.
-        */
-       cmm_smp_mb();
-       CMM_STORE_SHARED(timer_signal.qs_done, 0);
-       cmm_smp_mb();
+       consumer_timer_signal_thread_qs(LTTNG_CONSUMER_SIG_SWITCH);
 
-       /*
-        * Kill with LTTNG_CONSUMER_SIG_TEARDOWN, so signal management thread wakes
-        * up.
-        */
-       kill(getpid(), LTTNG_CONSUMER_SIG_TEARDOWN);
-
-       while (!CMM_LOAD_SHARED(timer_signal.qs_done)) {
-               caa_cpu_relax();
-       }
-       cmm_smp_mb();
+       channel->switch_timer = 0;
+       channel->switch_timer_enabled = 0;
 }
 
 /*
This page took 0.025528 seconds and 4 git commands to generate.