Fix: race with the viewer and readiness of streams
[lttng-tools.git] / src / common / ust-consumer / ust-consumer.c
index 6a508143e0d7ab22796e3e9ad6fcf233258ae2bc..af3aca0a5dfad074ba163a875770244af13e475b 100644 (file)
@@ -32,6 +32,7 @@
 #include <urcu/list.h>
 #include <signal.h>
 
+#include <bin/lttng-consumerd/health-consumerd.h>
 #include <common/common.h>
 #include <common/sessiond-comm/sessiond-comm.h>
 #include <common/relayd/relayd.h>
@@ -63,6 +64,9 @@ static void destroy_channel(struct lttng_consumer_channel *channel)
 
        cds_list_for_each_entry_safe(stream, stmp, &channel->streams.head,
                        send_node) {
+
+               health_code_update();
+
                cds_list_del(&stream->send_node);
                ustctl_destroy_stream(stream->ustream);
                free(stream);
@@ -257,6 +261,8 @@ static int create_ust_streams(struct lttng_consumer_channel *channel,
                int wait_fd;
                int ust_metadata_pipe[2];
 
+               health_code_update();
+
                if (channel->type == CONSUMER_CHANNEL_TYPE_METADATA && channel->monitor) {
                        ret = utils_create_pipe_cloexec_nonblock(ust_metadata_pipe);
                        if (ret < 0) {
@@ -401,8 +407,9 @@ static int send_sessiond_channel(int sock,
                struct lttng_consumer_channel *channel,
                struct lttng_consumer_local_data *ctx, int *relayd_error)
 {
-       int ret, ret_code = LTTNG_OK;
+       int ret, ret_code = LTTCOMM_CONSUMERD_SUCCESS;
        struct lttng_consumer_stream *stream;
+       uint64_t net_seq_idx = -1ULL;
 
        assert(channel);
        assert(ctx);
@@ -412,6 +419,9 @@ static int send_sessiond_channel(int sock,
 
        if (channel->relayd_id != (uint64_t) -1ULL) {
                cds_list_for_each_entry(stream, &channel->streams.head, send_node) {
+
+                       health_code_update();
+
                        /* Try to send the stream to the relayd if one is available. */
                        ret = consumer_send_relayd_stream(stream, stream->chan->pathname);
                        if (ret < 0) {
@@ -424,12 +434,26 @@ static int send_sessiond_channel(int sock,
                                }
                                ret_code = LTTNG_ERR_RELAYD_CONNECT_FAIL;
                        }
+                       if (net_seq_idx == -1ULL) {
+                               net_seq_idx = stream->net_seq_idx;
+                       }
+               }
+               ret = consumer_send_relayd_streams_sent(net_seq_idx);
+               if (ret < 0) {
+                       /*
+                        * Flag that the relayd was the problem here probably due to a
+                        * communication error on the socket.
+                        */
+                       if (relayd_error) {
+                               *relayd_error = 1;
+                       }
+                       ret_code = LTTNG_ERR_RELAYD_CONNECT_FAIL;
                }
        }
 
        /* Inform sessiond that we are about to send channel and streams. */
        ret = consumer_send_status_msg(sock, ret_code);
-       if (ret < 0 || ret_code != LTTNG_OK) {
+       if (ret < 0 || ret_code != LTTCOMM_CONSUMERD_SUCCESS) {
                /*
                 * Either the session daemon is not responding or the relayd died so we
                 * stop now.
@@ -450,6 +474,9 @@ static int send_sessiond_channel(int sock,
 
        /* The channel was sent successfully to the sessiond at this point. */
        cds_list_for_each_entry(stream, &channel->streams.head, send_node) {
+
+               health_code_update();
+
                /* Send stream to session daemon. */
                ret = send_sessiond_stream(sock, stream);
                if (ret < 0) {
@@ -468,7 +495,7 @@ static int send_sessiond_channel(int sock,
        return 0;
 
 error:
-       if (ret_code != LTTNG_OK) {
+       if (ret_code != LTTCOMM_CONSUMERD_SUCCESS) {
                ret = -1;
        }
        return ret;
@@ -551,6 +578,9 @@ static int send_streams_to_thread(struct lttng_consumer_channel *channel,
        /* Send streams to the corresponding thread. */
        cds_list_for_each_entry_safe(stream, stmp, &channel->streams.head,
                        send_node) {
+
+               health_code_update();
+
                /* Sending the stream to the thread. */
                ret = send_stream_to_thread(stream, ctx);
                if (ret < 0) {
@@ -601,6 +631,9 @@ static int flush_channel(uint64_t chan_key)
        cds_lfht_for_each_entry_duplicate(ht->ht,
                        ht->hash_fct(&channel->key, lttng_ht_seed), ht->match_fct,
                        &channel->key, &iter.iter, stream, node_channel_id.node) {
+
+               health_code_update();
+
                ustctl_flush_buffer(stream->ustream, 1);
        }
 error:
@@ -617,7 +650,7 @@ error:
  */
 static int _close_metadata(struct lttng_consumer_channel *channel)
 {
-       int ret = LTTNG_OK;
+       int ret = LTTCOMM_CONSUMERD_SUCCESS;
 
        assert(channel);
        assert(channel->type == CONSUMER_CHANNEL_TYPE_METADATA);
@@ -796,6 +829,8 @@ static int snapshot_metadata(uint64_t key, char *path, uint64_t relayd_id,
        }
        assert(!metadata_channel->monitor);
 
+       health_code_update();
+
        /*
         * Ask the sessiond if we have new metadata waiting and update the
         * consumer metadata cache.
@@ -805,6 +840,8 @@ static int snapshot_metadata(uint64_t key, char *path, uint64_t relayd_id,
                goto error;
        }
 
+       health_code_update();
+
        /*
         * The metadata stream is NOT created in no monitor mode when the channel
         * is created on a sessiond ask channel command.
@@ -836,6 +873,8 @@ static int snapshot_metadata(uint64_t key, char *path, uint64_t relayd_id,
        }
 
        do {
+               health_code_update();
+
                ret = lttng_consumer_read_subbuffer(metadata_stream, ctx);
                if (ret < 0) {
                        goto error_stream;
@@ -889,6 +928,9 @@ static int snapshot_channel(uint64_t key, char *path, uint64_t relayd_id,
        DBG("UST consumer snapshot channel %" PRIu64, key);
 
        cds_list_for_each_entry(stream, &channel->streams.head, send_node) {
+
+               health_code_update();
+
                /* Lock stream because we are about to change its state. */
                pthread_mutex_lock(&stream->lock);
                stream->net_seq_idx = relayd_id;
@@ -912,6 +954,12 @@ static int snapshot_channel(uint64_t key, char *path, uint64_t relayd_id,
                        DBG("UST consumer snapshot stream %s/%s (%" PRIu64 ")", path,
                                        stream->name, stream->key);
                }
+               if (relayd_id != -1ULL) {
+                       ret = consumer_send_relayd_streams_sent(relayd_id);
+                       if (ret < 0) {
+                               goto error_unlock;
+                       }
+               }
 
                ustctl_flush_buffer(stream->ustream, 1);
 
@@ -946,6 +994,8 @@ static int snapshot_channel(uint64_t key, char *path, uint64_t relayd_id,
                        ssize_t read_len;
                        unsigned long len, padded_len;
 
+                       health_code_update();
+
                        DBG("UST consumer taking snapshot at pos %lu", consumed_pos);
 
                        ret = ustctl_get_subbuf(stream->ustream, &consumed_pos);
@@ -1021,7 +1071,7 @@ int lttng_ustconsumer_recv_metadata(int sock, uint64_t key, uint64_t offset,
                uint64_t len, struct lttng_consumer_channel *channel,
                int timer, int wait)
 {
-       int ret, ret_code = LTTNG_OK;
+       int ret, ret_code = LTTCOMM_CONSUMERD_SUCCESS;
        char *metadata_str;
 
        DBG("UST consumer push metadata key %" PRIu64 " of len %" PRIu64, key, len);
@@ -1033,6 +1083,8 @@ int lttng_ustconsumer_recv_metadata(int sock, uint64_t key, uint64_t offset,
                goto end;
        }
 
+       health_code_update();
+
        /* Receive metadata string. */
        ret = lttcomm_recv_unix_sock(sock, metadata_str, len);
        if (ret < 0) {
@@ -1041,6 +1093,8 @@ int lttng_ustconsumer_recv_metadata(int sock, uint64_t key, uint64_t offset,
                goto end_free;
        }
 
+       health_code_update();
+
        pthread_mutex_lock(&channel->metadata_cache->lock);
        ret = consumer_metadata_cache_write(channel, offset, len, metadata_str);
        if (ret < 0) {
@@ -1061,6 +1115,9 @@ int lttng_ustconsumer_recv_metadata(int sock, uint64_t key, uint64_t offset,
        }
        while (consumer_metadata_cache_flushed(channel, offset + len, timer)) {
                DBG("Waiting for metadata to be flushed");
+
+               health_code_update();
+
                usleep(DEFAULT_METADATA_AVAILABILITY_WAIT_TIME);
        }
 
@@ -1079,10 +1136,12 @@ int lttng_ustconsumer_recv_cmd(struct lttng_consumer_local_data *ctx,
                int sock, struct pollfd *consumer_sockpoll)
 {
        ssize_t ret;
-       enum lttng_error_code ret_code = LTTNG_OK;
+       enum lttcomm_return_code ret_code = LTTCOMM_CONSUMERD_SUCCESS;
        struct lttcomm_consumer_msg msg;
        struct lttng_consumer_channel *channel = NULL;
 
+       health_code_update();
+
        ret = lttcomm_recv_unix_sock(sock, &msg, sizeof(msg));
        if (ret != sizeof(msg)) {
                DBG("Consumer received unexpected message size %zd (expects %zu)",
@@ -1097,6 +1156,9 @@ int lttng_ustconsumer_recv_cmd(struct lttng_consumer_local_data *ctx,
                }
                return ret;
        }
+
+       health_code_update();
+
        if (msg.cmd_type == LTTNG_CONSUMER_STOP) {
                /*
                 * Notify the session daemon that the command is completed.
@@ -1109,6 +1171,8 @@ int lttng_ustconsumer_recv_cmd(struct lttng_consumer_local_data *ctx,
                return -ENOENT;
        }
 
+       health_code_update();
+
        /* relayd needs RCU read-side lock */
        rcu_read_lock();
 
@@ -1214,9 +1278,16 @@ int lttng_ustconsumer_recv_cmd(struct lttng_consumer_local_data *ctx,
                attr.switch_timer_interval = msg.u.ask_channel.switch_timer_interval;
                attr.read_timer_interval = msg.u.ask_channel.read_timer_interval;
                attr.chan_id = msg.u.ask_channel.chan_id;
-               attr.output = msg.u.ask_channel.output;
                memcpy(attr.uuid, msg.u.ask_channel.uuid, sizeof(attr.uuid));
 
+               /* Match channel buffer type to the UST abi. */
+               switch (msg.u.ask_channel.output) {
+               case LTTNG_EVENT_MMAP:
+               default:
+                       attr.output = LTTNG_UST_MMAP;
+                       break;
+               }
+
                /* Translate and save channel type. */
                switch (msg.u.ask_channel.type) {
                case LTTNG_UST_CHAN_PER_CPU:
@@ -1238,6 +1309,8 @@ int lttng_ustconsumer_recv_cmd(struct lttng_consumer_local_data *ctx,
                        goto error_fatal;
                };
 
+               health_code_update();
+
                ret = ask_channel(ctx, sock, channel, &attr);
                if (ret < 0) {
                        goto end_channel_error;
@@ -1256,6 +1329,8 @@ int lttng_ustconsumer_recv_cmd(struct lttng_consumer_local_data *ctx,
                                        msg.u.ask_channel.live_timer_interval);
                }
 
+               health_code_update();
+
                /*
                 * Add the channel to the internal state AFTER all streams were created
                 * and successfully sent to session daemon. This way, all streams must
@@ -1277,6 +1352,8 @@ int lttng_ustconsumer_recv_cmd(struct lttng_consumer_local_data *ctx,
                        goto end_channel_error;
                }
 
+               health_code_update();
+
                /*
                 * Channel and streams are now created. Inform the session daemon that
                 * everything went well and should wait to receive the channel and
@@ -1305,6 +1382,8 @@ int lttng_ustconsumer_recv_cmd(struct lttng_consumer_local_data *ctx,
                        goto end_msg_sessiond;
                }
 
+               health_code_update();
+
                /* Send everything to sessiond. */
                ret = send_sessiond_channel(sock, channel, ctx, &relayd_err);
                if (ret < 0) {
@@ -1324,6 +1403,8 @@ int lttng_ustconsumer_recv_cmd(struct lttng_consumer_local_data *ctx,
                        goto error_fatal;
                }
 
+               health_code_update();
+
                /*
                 * In no monitor mode, the streams ownership is kept inside the channel
                 * so don't send them to the data thread.
@@ -1391,23 +1472,39 @@ int lttng_ustconsumer_recv_cmd(struct lttng_consumer_local_data *ctx,
 
                channel = consumer_find_channel(key);
                if (!channel) {
-                       ERR("UST consumer push metadata %" PRIu64 " not found", key);
-                       ret_code = LTTNG_ERR_UST_CHAN_NOT_FOUND;
+                       /*
+                        * This is possible if the metadata creation on the consumer side
+                        * is in flight vis-a-vis a concurrent push metadata from the
+                        * session daemon.  Simply return that the channel failed and the
+                        * session daemon will handle that message correctly considering
+                        * that this race is acceptable thus the DBG() statement here.
+                        */
+                       DBG("UST consumer push metadata %" PRIu64 " not found", key);
+                       ret_code = LTTCOMM_CONSUMERD_CHANNEL_FAIL;
                        goto end_msg_sessiond;
                }
 
+               health_code_update();
+
                /* Tell session daemon we are ready to receive the metadata. */
-               ret = consumer_send_status_msg(sock, LTTNG_OK);
+               ret = consumer_send_status_msg(sock, LTTCOMM_CONSUMERD_SUCCESS);
                if (ret < 0) {
                        /* Somehow, the session daemon is not responding anymore. */
                        goto error_fatal;
                }
 
+               health_code_update();
+
                /* Wait for more data. */
-               if (lttng_consumer_poll_socket(consumer_sockpoll) < 0) {
+               health_poll_entry();
+               ret = lttng_consumer_poll_socket(consumer_sockpoll);
+               health_poll_exit();
+               if (ret < 0) {
                        goto error_fatal;
                }
 
+               health_code_update();
+
                ret = lttng_ustconsumer_recv_metadata(sock, key, offset,
                                len, channel, 0, 1);
                if (ret < 0) {
@@ -1451,11 +1548,13 @@ int lttng_ustconsumer_recv_cmd(struct lttng_consumer_local_data *ctx,
                        }
                }
 
+               health_code_update();
                ret = consumer_send_status_msg(sock, ret_code);
                if (ret < 0) {
                        /* Somehow, the session daemon is not responding anymore. */
                        goto end_nosignal;
                }
+               health_code_update();
                break;
        }
        default:
@@ -1465,6 +1564,8 @@ int lttng_ustconsumer_recv_cmd(struct lttng_consumer_local_data *ctx,
 end_nosignal:
        rcu_read_unlock();
 
+       health_code_update();
+
        /*
         * Return 1 to indicate success since the 0 value can be a socket
         * shutdown during the recv() or send() call.
@@ -1482,6 +1583,9 @@ end_msg_sessiond:
                goto error_fatal;
        }
        rcu_read_unlock();
+
+       health_code_update();
+
        return 1;
 end_channel_error:
        if (channel) {
@@ -1498,6 +1602,9 @@ end_channel_error:
                goto error_fatal;
        }
        rcu_read_unlock();
+
+       health_code_update();
+
        return 1;
 error_fatal:
        rcu_read_unlock();
@@ -1632,7 +1739,7 @@ void lttng_ustconsumer_del_stream(struct lttng_consumer_stream *stream)
  *
  * Return 0 on success or else a negative value.
  */
-static int get_index_values(struct lttng_packet_index *index,
+static int get_index_values(struct ctf_packet_index *index,
                struct ustctl_consumer_stream *ustream)
 {
        int ret;
@@ -1797,7 +1904,7 @@ int lttng_ustconsumer_read_subbuffer(struct lttng_consumer_stream *stream,
        long ret = 0;
        char dummy;
        struct ustctl_consumer_stream *ustream;
-       struct lttng_packet_index index;
+       struct ctf_packet_index index;
 
        assert(stream);
        assert(stream->ustream);
@@ -1809,14 +1916,16 @@ int lttng_ustconsumer_read_subbuffer(struct lttng_consumer_stream *stream,
        /* Ease our life for what's next. */
        ustream = stream->ustream;
 
-       /* We can consume the 1 byte written into the wait_fd by UST */
+       /*
+        * We can consume the 1 byte written into the wait_fd by UST.
+        * Don't trigger error if we cannot read this one byte (read
+        * returns 0), or if the error is EAGAIN or EWOULDBLOCK.
+        */
        if (stream->monitor && !stream->hangup_flush_done) {
                ssize_t readlen;
 
-               do {
-                       readlen = read(stream->wait_fd, &dummy, 1);
-               } while (readlen == -1 && errno == EINTR);
-               if (readlen == -1 && errno != EAGAIN && errno != EWOULDBLOCK) {
+               readlen = lttng_read(stream->wait_fd, &dummy, 1);
+               if (readlen < 0 && errno != EAGAIN && errno != EWOULDBLOCK) {
                        ret = readlen;
                        goto end;
                }
@@ -2052,6 +2161,9 @@ void lttng_ustconsumer_close_metadata(struct lttng_ht *metadata_ht)
        rcu_read_lock();
        cds_lfht_for_each_entry(metadata_ht->ht, &iter.iter, stream,
                        node.node) {
+
+               health_code_update();
+
                pthread_mutex_lock(&stream->chan->lock);
                /*
                 * Whatever returned value, we must continue to try to close everything
@@ -2087,7 +2199,7 @@ int lttng_ustconsumer_request_metadata(struct lttng_consumer_local_data *ctx,
 {
        struct lttcomm_metadata_request_msg request;
        struct lttcomm_consumer_msg msg;
-       enum lttng_error_code ret_code = LTTNG_OK;
+       enum lttcomm_return_code ret_code = LTTCOMM_CONSUMERD_SUCCESS;
        uint64_t len, key, offset;
        int ret;
 
@@ -2123,6 +2235,9 @@ int lttng_ustconsumer_request_metadata(struct lttng_consumer_local_data *ctx,
                        request.key);
 
        pthread_mutex_lock(&ctx->metadata_socket_lock);
+
+       health_code_update();
+
        ret = lttcomm_send_unix_sock(ctx->consumer_metadata_socket, &request,
                        sizeof(request));
        if (ret < 0) {
@@ -2130,6 +2245,8 @@ int lttng_ustconsumer_request_metadata(struct lttng_consumer_local_data *ctx,
                goto end;
        }
 
+       health_code_update();
+
        /* Receive the metadata from sessiond */
        ret = lttcomm_recv_unix_sock(ctx->consumer_metadata_socket, &msg,
                        sizeof(msg));
@@ -2144,6 +2261,8 @@ int lttng_ustconsumer_request_metadata(struct lttng_consumer_local_data *ctx,
                goto end;
        }
 
+       health_code_update();
+
        if (msg.cmd_type == LTTNG_ERR_UND) {
                /* No registry found */
                (void) consumer_send_status_msg(ctx->consumer_metadata_socket,
@@ -2165,9 +2284,11 @@ int lttng_ustconsumer_request_metadata(struct lttng_consumer_local_data *ctx,
                DBG("No new metadata to receive for key %" PRIu64, key);
        }
 
+       health_code_update();
+
        /* Tell session daemon we are ready to receive the metadata. */
        ret = consumer_send_status_msg(ctx->consumer_metadata_socket,
-                       LTTNG_OK);
+                       LTTCOMM_CONSUMERD_SUCCESS);
        if (ret < 0 || len == 0) {
                /*
                 * Somehow, the session daemon is not responding anymore or there is
@@ -2176,18 +2297,22 @@ int lttng_ustconsumer_request_metadata(struct lttng_consumer_local_data *ctx,
                goto end;
        }
 
-       ret_code = lttng_ustconsumer_recv_metadata(ctx->consumer_metadata_socket,
+       health_code_update();
+
+       ret = lttng_ustconsumer_recv_metadata(ctx->consumer_metadata_socket,
                        key, offset, len, channel, timer, wait);
-       if (ret_code >= 0) {
+       if (ret >= 0) {
                /*
                 * Only send the status msg if the sessiond is alive meaning a positive
                 * ret code.
                 */
-               (void) consumer_send_status_msg(ctx->consumer_metadata_socket, ret_code);
+               (void) consumer_send_status_msg(ctx->consumer_metadata_socket, ret);
        }
        ret = 0;
 
 end:
+       health_code_update();
+
        pthread_mutex_unlock(&ctx->metadata_socket_lock);
        return ret;
 }
This page took 0.030247 seconds and 4 git commands to generate.