relayd: add health check support for live threads
[lttng-tools.git] / src / bin / lttng-relayd / main.c
index ca37b8bc41ed842111ee603ff2860e90b288b758..6727a547de65b2a94706a76b5f2089b5b33956a5 100644 (file)
 #include <common/utils.h>
 
 #include "cmd.h"
+#include "ctf-trace.h"
 #include "index.h"
 #include "utils.h"
 #include "lttng-relayd.h"
+#include "live.h"
+#include "health-relayd.h"
 
 /* command line options */
 char *opt_output_path;
 static int opt_daemon;
 static struct lttng_uri *control_uri;
 static struct lttng_uri *data_uri;
+static struct lttng_uri *live_uri;
 
 const char *progname;
 
@@ -98,13 +102,22 @@ static struct relay_cmd_queue relay_cmd_queue;
 static char *data_buffer;
 static unsigned int data_buffer_size;
 
-/* Global hash table that stores relay index object. */
-static struct lttng_ht *indexes_ht;
-
 /* We need those values for the file/dir creation. */
 static uid_t relayd_uid;
 static gid_t relayd_gid;
 
+/* Global relay stream hash table. */
+struct lttng_ht *relay_streams_ht;
+
+/* Global relay viewer stream hash table. */
+struct lttng_ht *viewer_streams_ht;
+
+/* Global hash table that stores relay index object. */
+struct lttng_ht *indexes_ht;
+
+/* Relayd health monitoring */
+struct health_app *health_relayd;
+
 /*
  * usage function on stderr
  */
@@ -228,6 +241,21 @@ int parse_args(int argc, char **argv)
                        goto exit;
                }
        }
+       if (live_uri == NULL) {
+               ret = asprintf(&default_address, "tcp://0.0.0.0:%d",
+                               DEFAULT_NETWORK_VIEWER_PORT);
+               if (ret < 0) {
+                       PERROR("asprintf default viewer control address");
+                       goto exit;
+               }
+
+               ret = uri_parse(default_address, &live_uri);
+               free(default_address);
+               if (ret < 0) {
+                       ERR("Invalid viewer control URI specified");
+                       goto exit;
+               }
+       }
 
 exit:
        return ret;
@@ -489,6 +517,10 @@ void *relay_thread_listener(void *data)
 
        DBG("[thread] Relay listener started");
 
+       health_register(health_relayd, HEALTH_RELAYD_TYPE_LISTENER);
+
+       health_code_update();
+
        control_sock = relay_init_sock(control_uri);
        if (!control_sock) {
                goto error_sock_control;
@@ -520,10 +552,14 @@ void *relay_thread_listener(void *data)
        }
 
        while (1) {
+               health_code_update();
+
                DBG("Listener accepting connections");
 
 restart:
+               health_poll_entry();
                ret = lttng_poll_wait(&events, -1);
+               health_poll_exit();
                if (ret < 0) {
                        /*
                         * Restart interrupted system call.
@@ -538,6 +574,8 @@ restart:
 
                DBG("Relay new connection received");
                for (i = 0; i < nb_fd; i++) {
+                       health_code_update();
+
                        /* Fetch once the poll data */
                        revents = LTTNG_POLL_GETEV(&events, i);
                        pollfd = LTTNG_POLL_GETFD(&events, i);
@@ -632,8 +670,10 @@ error_sock_relay:
        lttcomm_destroy_sock(control_sock);
 error_sock_control:
        if (err) {
-               DBG("Thread exited with error");
+               health_error();
+               ERR("Health error occurred in %s", __func__);
        }
+       health_unregister(health_relayd);
        DBG("Relay listener thread cleanup complete");
        stop_threads();
        return NULL;
@@ -645,17 +685,25 @@ error_sock_control:
 static
 void *relay_thread_dispatcher(void *data)
 {
-       int ret;
+       int ret, err = -1;
        struct cds_wfq_node *node;
        struct relay_command *relay_cmd = NULL;
 
        DBG("[thread] Relay dispatcher started");
 
+       health_register(health_relayd, HEALTH_RELAYD_TYPE_DISPATCHER);
+
+       health_code_update();
+
        while (!CMM_LOAD_SHARED(dispatch_thread_exit)) {
+               health_code_update();
+
                /* Atomically prepare the queue futex */
                futex_nto1_prepare(&relay_cmd_queue.futex);
 
                do {
+                       health_code_update();
+
                        /* Dequeue commands */
                        node = cds_wfq_dequeue_blocking(&relay_cmd_queue.queue);
                        if (node == NULL) {
@@ -684,10 +732,20 @@ void *relay_thread_dispatcher(void *data)
                } while (node != NULL);
 
                /* Futex wait on queue. Blocking call on futex() */
+               health_poll_entry();
                futex_nto1_wait(&relay_cmd_queue.futex);
+               health_poll_exit();
        }
 
+       /* Normal exit, no error */
+       err = 0;
+
 error:
+       if (err) {
+               health_error();
+               ERR("Health error occurred in %s", __func__);
+       }
+       health_unregister(health_relayd);
        DBG("Dispatch thread dying");
        stop_threads();
        return NULL;
@@ -697,15 +755,13 @@ error:
  * Get stream from stream id.
  * Need to be called with RCU read-side lock held.
  */
-static
-struct relay_stream *relay_stream_from_stream_id(uint64_t stream_id,
-               struct lttng_ht *streams_ht)
+struct relay_stream *relay_stream_find_by_id(uint64_t stream_id)
 {
        struct lttng_ht_node_ulong *node;
        struct lttng_ht_iter iter;
        struct relay_stream *ret;
 
-       lttng_ht_lookup(streams_ht,
+       lttng_ht_lookup(relay_streams_ht,
                        (void *)((unsigned long) stream_id),
                        &iter);
        node = lttng_ht_iter_get_node_ulong(&iter);
@@ -726,17 +782,80 @@ void deferred_free_stream(struct rcu_head *head)
 {
        struct relay_stream *stream =
                caa_container_of(head, struct relay_stream, rcu_node);
+
+       ctf_trace_try_destroy(stream->ctf_trace);
+
        free(stream->path_name);
        free(stream->channel_name);
        free(stream);
 }
 
+static
+void deferred_free_session(struct rcu_head *head) /* handed to call_rcu() to release a relay_session */
+{
+       struct relay_session *session =
+               caa_container_of(head, struct relay_session, rcu_node); /* head is the session's rcu_node member */
+       free(session);
+}
+
+/*
+ * Close a given stream and unlink it from the lookup hash tables. The stream
+ * object itself is freed through call_rcu() with deferred_free_stream().
+ *
+ * Steps, in order: close the stream fd and, if it is open (>= 0), the index
+ * fd; copy the stream's total_index_received into the matching viewer stream
+ * when one is found; destroy the indexes of that stream; remove the stream
+ * from relay_streams_ht and from ctf_traces_ht (both removals are asserted to
+ * succeed); queue the deferred free.
+ *
+ * stream: the stream to tear down, must not be NULL (asserted).
+ * ctf_traces_ht: hash table from which the stream's ctf_trace_node is removed.
+ *
+ * RCU read side lock MUST be acquired. If close_stream_check() was NOT called
+ * beforehand, the stream lock MUST be acquired.
+ */
+static void destroy_stream(struct relay_stream *stream,
+               struct lttng_ht *ctf_traces_ht)
+{
+       int delret;
+       struct relay_viewer_stream *vstream;
+       struct lttng_ht_iter iter;
+
+       assert(stream);
+
+       delret = close(stream->fd);
+       if (delret < 0) {
+               PERROR("close stream"); /* reported only; teardown continues */
+       }
+
+       if (stream->index_fd >= 0) {
+               delret = close(stream->index_fd);
+               if (delret < 0) {
+                       PERROR("close stream index_fd"); /* reported only; teardown continues */
+               }
+       }
+
+       vstream = live_find_viewer_stream_by_id(stream->stream_handle);
+       if (vstream) {
+               /*
+                * Set the last good value into the viewer stream. This is done
+                * right before the stream gets deleted from the hash table. The
+                * lookup failure on the live thread side of a stream indicates
+                * that the viewer stream index received value should be used.
+                */
+               vstream->total_index_received = stream->total_index_received;
+       }
+
+       /* Clean up the indexes of that stream. */
+       relay_index_destroy_by_stream_id(stream->stream_handle);
+
+       iter.iter.node = &stream->stream_n.node; /* aim the iterator at this stream's node */
+       delret = lttng_ht_del(relay_streams_ht, &iter);
+       assert(!delret);
+       iter.iter.node = &stream->ctf_trace_node.node; /* same, for the ctf trace table */
+       delret = lttng_ht_del(ctf_traces_ht, &iter);
+       assert(!delret);
+       call_rcu(&stream->rcu_node, deferred_free_stream); /* deferred free: stream is still dereferenced just below */
+       DBG("Closed tracefile %d from close stream", stream->fd);
+}
+
 /*
  * relay_delete_session: Free all memory associated with a session and
  * close all the FDs
  */
 static
-void relay_delete_session(struct relay_command *cmd, struct lttng_ht *streams_ht)
+void relay_delete_session(struct relay_command *cmd,
+               struct lttng_ht *sessions_ht)
 {
        struct lttng_ht_iter iter;
        struct lttng_ht_node_ulong *node;
@@ -750,29 +869,23 @@ void relay_delete_session(struct relay_command *cmd, struct lttng_ht *streams_ht
        DBG("Relay deleting session %" PRIu64, cmd->session->id);
 
        rcu_read_lock();
-       cds_lfht_for_each_entry(streams_ht->ht, &iter.iter, node, node) {
+       cds_lfht_for_each_entry(relay_streams_ht->ht, &iter.iter, node, node) {
                node = lttng_ht_iter_get_node_ulong(&iter);
-               if (node) {
-                       stream = caa_container_of(node,
-                                       struct relay_stream, stream_n);
-                       if (stream->session == cmd->session) {
-                               ret = close(stream->fd);
-                               if (ret < 0) {
-                                       PERROR("close stream fd on delete session");
-                               }
-                               ret = lttng_ht_del(streams_ht, &iter);
-                               assert(!ret);
-                               call_rcu(&stream->rcu_node,
-                                       deferred_free_stream);
-                       }
-                       /* Cleanup index of that stream. */
-                       relay_index_destroy_by_stream_id(stream->stream_handle,
-                                       indexes_ht);
+               if (!node) {
+                       continue;
+               }
+               stream = caa_container_of(node, struct relay_stream, stream_n);
+               if (stream->session == cmd->session) {
+                       destroy_stream(stream, cmd->ctf_traces_ht);
                }
        }
-       rcu_read_unlock();
 
-       free(cmd->session);
+       /* Make this session not visible anymore. */
+       iter.iter.node = &cmd->session->session_n.node;
+       ret = lttng_ht_del(sessions_ht, &iter);
+       assert(!ret);
+       call_rcu(&cmd->session->rcu_node, deferred_free_session);
+       rcu_read_unlock();
 }
 
 /*
@@ -804,7 +917,8 @@ static void copy_index_control_data(struct relay_index *index,
  */
 static
 int relay_create_session(struct lttcomm_relayd_hdr *recv_hdr,
-               struct relay_command *cmd)
+               struct relay_command *cmd,
+               struct lttng_ht *sessions_ht)
 {
        int ret = 0, send_ret;
        struct relay_session *session;
@@ -824,10 +938,24 @@ int relay_create_session(struct lttcomm_relayd_hdr *recv_hdr,
 
        session->id = ++last_relay_session_id;
        session->sock = cmd->sock;
+       session->minor = cmd->minor;
+       session->major = cmd->major;
        cmd->session = session;
 
        reply.session_id = htobe64(session->id);
 
+       switch (cmd->minor) {
+               case 4: /* LTTng sessiond 2.4 */
+               default:
+                       ret = cmd_create_session_2_4(cmd, session);
+                       break;
+       }
+
+       lttng_ht_node_init_ulong(&session->session_n,
+                       (unsigned long) session->id);
+       lttng_ht_add_unique_ulong(sessions_ht,
+                       &session->session_n);
+
        DBG("Created session %" PRIu64, session->id);
 
 error:
@@ -851,7 +979,7 @@ error:
  */
 static
 int relay_add_stream(struct lttcomm_relayd_hdr *recv_hdr,
-               struct relay_command *cmd, struct lttng_ht *streams_ht)
+               struct relay_command *cmd, struct lttng_ht *sessions_ht)
 {
        struct relay_session *session = cmd->session;
        struct relay_stream *stream = NULL;
@@ -889,6 +1017,9 @@ int relay_add_stream(struct lttcomm_relayd_hdr *recv_hdr,
        stream->prev_seq = -1ULL;
        stream->session = session;
        stream->index_fd = -1;
+       stream->read_index_fd = -1;
+       stream->ctf_trace = NULL;
+       pthread_mutex_init(&stream->lock, NULL);
 
        ret = utils_mkdir_recursive(stream->path_name, S_IRWXU | S_IRWXG);
        if (ret < 0) {
@@ -913,11 +1044,35 @@ int relay_add_stream(struct lttcomm_relayd_hdr *recv_hdr,
                DBG("Tracefile %s/%s created", stream->path_name, stream->channel_name);
        }
 
+       if (!strncmp(stream->channel_name, DEFAULT_METADATA_NAME, NAME_MAX)) {
+               stream->metadata_flag = 1;
+               /*
+                * When we receive a new metadata stream, we create a new
+                * ctf_trace and we assign this ctf_trace to all streams with
+                * the same path.
+                *
+                * If later on we receive a new stream for the same ctf_trace,
+                * we copy the information from the first hit in the HT to the
+                * new stream.
+                */
+               stream->ctf_trace = ctf_trace_create();
+               if (!stream->ctf_trace) {
+                       ret = -1;
+                       goto end;
+               }
+               stream->ctf_trace->refcount++;
+               stream->ctf_trace->metadata_stream = stream;
+       }
+       ctf_trace_assign(cmd->ctf_traces_ht, stream);
+
        lttng_ht_node_init_ulong(&stream->stream_n,
                        (unsigned long) stream->stream_handle);
-       lttng_ht_add_unique_ulong(streams_ht,
+       lttng_ht_add_unique_ulong(relay_streams_ht,
                        &stream->stream_n);
 
+       lttng_ht_node_init_str(&stream->ctf_trace_node, stream->path_name);
+       lttng_ht_add_str(cmd->ctf_traces_ht, &stream->ctf_trace_node);
+
        DBG("Relay new stream added %s with ID %" PRIu64, stream->channel_name,
                        stream->stream_handle);
 
@@ -955,14 +1110,13 @@ err_free_stream:
  */
 static
 int relay_close_stream(struct lttcomm_relayd_hdr *recv_hdr,
-               struct relay_command *cmd, struct lttng_ht *streams_ht)
+               struct relay_command *cmd)
 {
+       int ret, send_ret;
        struct relay_session *session = cmd->session;
        struct lttcomm_relayd_close_stream stream_info;
        struct lttcomm_relayd_generic_reply reply;
        struct relay_stream *stream;
-       int ret, send_ret;
-       struct lttng_ht_iter iter;
 
        DBG("Close stream received");
 
@@ -986,8 +1140,7 @@ int relay_close_stream(struct lttcomm_relayd_hdr *recv_hdr,
        }
 
        rcu_read_lock();
-       stream = relay_stream_from_stream_id(be64toh(stream_info.stream_id),
-                       streams_ht);
+       stream = relay_stream_find_by_id(be64toh(stream_info.stream_id));
        if (!stream) {
                ret = -1;
                goto end_unlock;
@@ -997,25 +1150,7 @@ int relay_close_stream(struct lttcomm_relayd_hdr *recv_hdr,
        stream->close_flag = 1;
 
        if (close_stream_check(stream)) {
-               int delret;
-
-               delret = close(stream->fd);
-               if (delret < 0) {
-                       PERROR("close stream");
-               }
-
-               if (stream->index_fd >= 0) {
-                       delret = close(stream->index_fd);
-                       if (delret < 0) {
-                               PERROR("close stream index_fd");
-                       }
-               }
-               iter.iter.node = &stream->stream_n.node;
-               delret = lttng_ht_del(streams_ht, &iter);
-               assert(!delret);
-               call_rcu(&stream->rcu_node,
-                               deferred_free_stream);
-               DBG("Closed tracefile %d from close stream", stream->fd);
+               destroy_stream(stream, cmd->ctf_traces_ht);
        }
 
 end_unlock:
@@ -1118,7 +1253,7 @@ end:
  */
 static
 int relay_recv_metadata(struct lttcomm_relayd_hdr *recv_hdr,
-               struct relay_command *cmd, struct lttng_ht *streams_ht)
+               struct relay_command *cmd)
 {
        int ret = htobe32(LTTNG_OK);
        struct relay_session *session = cmd->session;
@@ -1170,8 +1305,8 @@ int relay_recv_metadata(struct lttcomm_relayd_hdr *recv_hdr,
        metadata_struct = (struct lttcomm_relayd_metadata_payload *) data_buffer;
 
        rcu_read_lock();
-       metadata_stream = relay_stream_from_stream_id(
-                       be64toh(metadata_struct->stream_id), streams_ht);
+       metadata_stream = relay_stream_find_by_id(
+                       be64toh(metadata_struct->stream_id));
        if (!metadata_stream) {
                ret = -1;
                goto end_unlock;
@@ -1192,6 +1327,8 @@ int relay_recv_metadata(struct lttcomm_relayd_hdr *recv_hdr,
        if (ret < 0) {
                goto end_unlock;
        }
+       metadata_stream->ctf_trace->metadata_received +=
+               payload_size + be32toh(metadata_struct->padding_size);
 
        DBG2("Relay metadata written");
 
@@ -1206,7 +1343,7 @@ end:
  */
 static
 int relay_send_version(struct lttcomm_relayd_hdr *recv_hdr,
-               struct relay_command *cmd, struct lttng_ht *streams_ht)
+               struct relay_command *cmd, struct lttng_ht *sessions_ht)
 {
        int ret;
        struct lttcomm_relayd_version reply, msg;
@@ -1235,7 +1372,7 @@ int relay_send_version(struct lttcomm_relayd_hdr *recv_hdr,
        if (reply.major != be32toh(msg.major)) {
                DBG("Incompatible major versions (%u vs %u), deleting session",
                                reply.major, be32toh(msg.major));
-               relay_delete_session(cmd, streams_ht);
+               relay_delete_session(cmd, sessions_ht);
                ret = 0;
                goto end;
        }
@@ -1268,7 +1405,7 @@ end:
  */
 static
 int relay_data_pending(struct lttcomm_relayd_hdr *recv_hdr,
-               struct relay_command *cmd, struct lttng_ht *streams_ht)
+               struct relay_command *cmd)
 {
        struct relay_session *session = cmd->session;
        struct lttcomm_relayd_data_pending msg;
@@ -1302,7 +1439,7 @@ int relay_data_pending(struct lttcomm_relayd_hdr *recv_hdr,
        last_net_seq_num = be64toh(msg.last_net_seq_num);
 
        rcu_read_lock();
-       stream = relay_stream_from_stream_id(stream_id, streams_ht);
+       stream = relay_stream_find_by_id(stream_id);
        if (stream == NULL) {
                ret = -1;
                goto end_unlock;
@@ -1346,7 +1483,7 @@ end_no_session:
  */
 static
 int relay_quiescent_control(struct lttcomm_relayd_hdr *recv_hdr,
-               struct relay_command *cmd, struct lttng_ht *streams_ht)
+               struct relay_command *cmd)
 {
        int ret;
        uint64_t stream_id;
@@ -1379,7 +1516,8 @@ int relay_quiescent_control(struct lttcomm_relayd_hdr *recv_hdr,
        stream_id = be64toh(msg.stream_id);
 
        rcu_read_lock();
-       cds_lfht_for_each_entry(streams_ht->ht, &iter.iter, stream, stream_n.node) {
+       cds_lfht_for_each_entry(relay_streams_ht->ht, &iter.iter, stream,
+                       stream_n.node) {
                if (stream->stream_handle == stream_id) {
                        stream->data_pending_check_done = 1;
                        DBG("Relay quiescent control pending flag set to %" PRIu64,
@@ -1408,7 +1546,7 @@ end_no_session:
  */
 static
 int relay_begin_data_pending(struct lttcomm_relayd_hdr *recv_hdr,
-               struct relay_command *cmd, struct lttng_ht *streams_ht)
+               struct relay_command *cmd)
 {
        int ret;
        struct lttng_ht_iter iter;
@@ -1419,7 +1557,6 @@ int relay_begin_data_pending(struct lttcomm_relayd_hdr *recv_hdr,
 
        assert(recv_hdr);
        assert(cmd);
-       assert(streams_ht);
 
        DBG("Init streams for data pending");
 
@@ -1450,7 +1587,8 @@ int relay_begin_data_pending(struct lttcomm_relayd_hdr *recv_hdr,
         * streams to find the one associated with the right session_id.
         */
        rcu_read_lock();
-       cds_lfht_for_each_entry(streams_ht->ht, &iter.iter, stream, stream_n.node) {
+       cds_lfht_for_each_entry(relay_streams_ht->ht, &iter.iter, stream,
+                       stream_n.node) {
                if (stream->session->id == session_id) {
                        stream->data_pending_check_done = 0;
                        DBG("Set begin data pending flag to stream %" PRIu64,
@@ -1482,7 +1620,7 @@ end_no_session:
  */
 static
 int relay_end_data_pending(struct lttcomm_relayd_hdr *recv_hdr,
-               struct relay_command *cmd, struct lttng_ht *streams_ht)
+               struct relay_command *cmd)
 {
        int ret;
        struct lttng_ht_iter iter;
@@ -1494,7 +1632,6 @@ int relay_end_data_pending(struct lttcomm_relayd_hdr *recv_hdr,
 
        assert(recv_hdr);
        assert(cmd);
-       assert(streams_ht);
 
        DBG("End data pending command");
 
@@ -1521,7 +1658,8 @@ int relay_end_data_pending(struct lttcomm_relayd_hdr *recv_hdr,
 
        /* Iterate over all streams to see if the begin data pending flag is set. */
        rcu_read_lock();
-       cds_lfht_for_each_entry(streams_ht->ht, &iter.iter, stream, stream_n.node) {
+       cds_lfht_for_each_entry(relay_streams_ht->ht, &iter.iter, stream,
+                       stream_n.node) {
                if (stream->session->id == session_id &&
                                !stream->data_pending_check_done) {
                        is_data_inflight = 1;
@@ -1551,8 +1689,7 @@ end_no_session:
  */
 static
 int relay_recv_index(struct lttcomm_relayd_hdr *recv_hdr,
-               struct relay_command *cmd, struct lttng_ht *streams_ht,
-               struct lttng_ht *indexes_ht)
+               struct relay_command *cmd)
 {
        int ret, send_ret, index_created = 0;
        struct relay_session *session = cmd->session;
@@ -1563,8 +1700,6 @@ int relay_recv_index(struct lttcomm_relayd_hdr *recv_hdr,
        uint64_t net_seq_num;
 
        assert(cmd);
-       assert(streams_ht);
-       assert(indexes_ht);
 
        DBG("Relay receiving index");
 
@@ -1590,14 +1725,29 @@ int relay_recv_index(struct lttcomm_relayd_hdr *recv_hdr,
        net_seq_num = be64toh(index_info.net_seq_num);
 
        rcu_read_lock();
-       stream = relay_stream_from_stream_id(be64toh(index_info.relay_stream_id),
-                       streams_ht);
+       stream = relay_stream_find_by_id(be64toh(index_info.relay_stream_id));
        if (!stream) {
                ret = -1;
                goto end_rcu_unlock;
        }
 
-       index = relay_index_find(stream->stream_handle, net_seq_num, indexes_ht);
+       /* Live beacon handling */
+       if (index_info.packet_size == 0) {
+               DBG("Received live beacon for stream %" PRIu64, stream->stream_handle);
+
+               /*
+                * Only flag a stream inactive when it has already received data.
+                */
+               if (stream->total_index_received > 0) {
+                       stream->beacon_ts_end = be64toh(index_info.timestamp_end);
+               }
+               ret = 0;
+               goto end_rcu_unlock;
+       } else {
+               stream->beacon_ts_end = -1ULL;
+       }
+
+       index = relay_index_find(stream->stream_handle, net_seq_num);
        if (!index) {
                /* A successful creation will add the object to the HT. */
                index = relay_index_create(stream->stream_handle, net_seq_num);
@@ -1615,7 +1765,7 @@ int relay_recv_index(struct lttcomm_relayd_hdr *recv_hdr,
                 * already exist, destroy back the index created, set the data in this
                 * object and write it on disk.
                 */
-               relay_index_add(index, indexes_ht, &wr_index);
+               relay_index_add(index, &wr_index);
                if (wr_index) {
                        copy_index_control_data(wr_index, &index_info);
                        free(index);
@@ -1638,10 +1788,11 @@ int relay_recv_index(struct lttcomm_relayd_hdr *recv_hdr,
                        stream->index_fd = ret;
                }
 
-               ret = relay_index_write(wr_index->fd, wr_index, indexes_ht);
+               ret = relay_index_write(wr_index->fd, wr_index);
                if (ret < 0) {
                        goto end_rcu_unlock;
                }
+               stream->total_index_received++;
        }
 
 end_rcu_unlock:
@@ -1663,49 +1814,47 @@ end_no_session:
 }
 
 /*
- * relay_process_control: Process the commands received on the control socket
+ * Process the commands received on the control socket
  */
 static
 int relay_process_control(struct lttcomm_relayd_hdr *recv_hdr,
-               struct relay_command *cmd, struct lttng_ht *streams_ht,
-               struct lttng_ht *index_streams_ht,
-               struct lttng_ht *indexes_ht)
+               struct relay_command *cmd, struct relay_local_data *ctx)
 {
        int ret = 0;
 
        switch (be32toh(recv_hdr->cmd)) {
        case RELAYD_CREATE_SESSION:
-               ret = relay_create_session(recv_hdr, cmd);
+               ret = relay_create_session(recv_hdr, cmd, ctx->sessions_ht);
                break;
        case RELAYD_ADD_STREAM:
-               ret = relay_add_stream(recv_hdr, cmd, streams_ht);
+               ret = relay_add_stream(recv_hdr, cmd, ctx->sessions_ht);
                break;
        case RELAYD_START_DATA:
                ret = relay_start(recv_hdr, cmd);
                break;
        case RELAYD_SEND_METADATA:
-               ret = relay_recv_metadata(recv_hdr, cmd, streams_ht);
+               ret = relay_recv_metadata(recv_hdr, cmd);
                break;
        case RELAYD_VERSION:
-               ret = relay_send_version(recv_hdr, cmd, streams_ht);
+               ret = relay_send_version(recv_hdr, cmd, ctx->sessions_ht);
                break;
        case RELAYD_CLOSE_STREAM:
-               ret = relay_close_stream(recv_hdr, cmd, streams_ht);
+               ret = relay_close_stream(recv_hdr, cmd);
                break;
        case RELAYD_DATA_PENDING:
-               ret = relay_data_pending(recv_hdr, cmd, streams_ht);
+               ret = relay_data_pending(recv_hdr, cmd);
                break;
        case RELAYD_QUIESCENT_CONTROL:
-               ret = relay_quiescent_control(recv_hdr, cmd, streams_ht);
+               ret = relay_quiescent_control(recv_hdr, cmd);
                break;
        case RELAYD_BEGIN_DATA_PENDING:
-               ret = relay_begin_data_pending(recv_hdr, cmd, streams_ht);
+               ret = relay_begin_data_pending(recv_hdr, cmd);
                break;
        case RELAYD_END_DATA_PENDING:
-               ret = relay_end_data_pending(recv_hdr, cmd, streams_ht);
+               ret = relay_end_data_pending(recv_hdr, cmd);
                break;
        case RELAYD_SEND_INDEX:
-               ret = relay_recv_index(recv_hdr, cmd, streams_ht, indexes_ht);
+               ret = relay_recv_index(recv_hdr, cmd);
                break;
        case RELAYD_UPDATE_SYNC_INFO:
        default:
@@ -1719,18 +1868,98 @@ end:
        return ret;
 }
 
+/*
+ * Handle index for a data stream.
+ *
+ * Looks up the index object for (stream id, net_seq_num) and creates one when
+ * none is found. The stream's index fd and the current tracefile size (as the
+ * data offset, converted with htobe64()) are recorded in it, and the index is
+ * written on disk when a writable index object is available.
+ *
+ * A new index file is created when rotate_index is non-zero or when the
+ * stream has no index fd yet (index_fd < 0); the previous fd is stored in the
+ * index's to_close_fd field.
+ *
+ * stream->total_index_received is incremented after each successful write.
+ *
+ * RCU read side lock MUST be acquired.
+ *
+ * Return 0 on success else a negative value.
+ *
+ * NOTE(review): when an index file is created here and no index is written
+ * afterwards, ret still holds the value returned by index_create_file(), so
+ * the returned value can be positive. Confirm callers only test for < 0.
+ */
+static int handle_index_data(struct relay_stream *stream, uint64_t net_seq_num,
+               int rotate_index)
+{
+       int ret = 0, index_created = 0;
+       uint64_t stream_id, data_offset;
+       struct relay_index *index, *wr_index = NULL;
+
+       assert(stream);
+
+       stream_id = stream->stream_handle;
+       /* Get data offset because we are about to update the index. */
+       data_offset = htobe64(stream->tracefile_size_current);
+
+       /*
+        * Look up an existing index for that stream id/sequence number. If one
+        * exists, the control thread already received the data for it thus we need
+        * to write it on disk.
+        */
+       index = relay_index_find(stream_id, net_seq_num);
+       if (!index) {
+               /* A successful creation will add the object to the HT. */
+               index = relay_index_create(stream_id, net_seq_num);
+               if (!index) {
+                       ret = -1;
+                       goto error;
+               }
+               index_created = 1;
+       }
+
+       if (rotate_index || stream->index_fd < 0) {
+               index->to_close_fd = stream->index_fd; /* remember the previous fd before replacing it */
+               ret = index_create_file(stream->path_name, stream->channel_name,
+                               relayd_uid, relayd_gid, stream->tracefile_size,
+                               stream->tracefile_count_current);
+               if (ret < 0) {
+                       /* This will close the stream's index fd, if any. */
+                       relay_index_free_safe(index);
+                       goto error;
+               }
+               stream->index_fd = ret;
+       }
+       index->fd = stream->index_fd;
+       index->index_data.offset = data_offset;
+
+       if (index_created) {
+               /*
+                * Try to add the relay index object to the hash table. If an object
+                * already exists, destroy the index just created and set the data.
+                */
+               relay_index_add(index, &wr_index);
+               if (wr_index) {
+                       /* Copy back data from the created index. */
+                       wr_index->fd = index->fd;
+                       wr_index->to_close_fd = index->to_close_fd;
+                       wr_index->index_data.offset = data_offset;
+                       free(index);
+               }
+       } else {
+               /* The index already exists so write it on disk. */
+               wr_index = index;
+       }
+
+       /* Write the index on disk if we have one that is ready to be written. */
+       if (wr_index) {
+               ret = relay_index_write(wr_index->fd, wr_index);
+               if (ret < 0) {
+                       goto error;
+               }
+               stream->total_index_received++;
+       }
+
+error:
+       return ret;
+}
+
 /*
  * relay_process_data: Process the data received on the data socket
  */
 static
-int relay_process_data(struct relay_command *cmd, struct lttng_ht *streams_ht,
-               struct lttng_ht *indexes_ht)
+int relay_process_data(struct relay_command *cmd)
 {
-       int ret = 0, rotate_index = 0, index_created = 0;
+       int ret = 0, rotate_index = 0;
        struct relay_stream *stream;
-       struct relay_index *index, *wr_index = NULL;
        struct lttcomm_relayd_data_hdr data_hdr;
-       uint64_t stream_id, data_offset;
+       uint64_t stream_id;
        uint64_t net_seq_num;
        uint32_t data_size;
 
@@ -1750,7 +1979,7 @@ int relay_process_data(struct relay_command *cmd, struct lttng_ht *streams_ht,
        stream_id = be64toh(data_hdr.stream_id);
 
        rcu_read_lock();
-       stream = relay_stream_from_stream_id(stream_id, streams_ht);
+       stream = relay_stream_find_by_id(stream_id);
        if (!stream) {
                ret = -1;
                goto end_rcu_unlock;
@@ -1803,76 +2032,18 @@ int relay_process_data(struct relay_command *cmd, struct lttng_ht *streams_ht,
                rotate_index = 1;
        }
 
-       /* Get data offset because we are about to update the index. */
-       data_offset = htobe64(stream->tracefile_size_current);
-
        /*
-        * Lookup for an existing index for that stream id/sequence number. If on
-        * exists, the control thread already received the data for it thus we need
-        * to write it on disk.
+        * Indexes are handled in protocol version 2.4 and above. Also, indexes
+        * are NOT supported for snapshot sessions.
         */
-       index = relay_index_find(stream_id, net_seq_num, indexes_ht);
-       if (!index) {
-               /* A successful creation will add the object to the HT. */
-               index = relay_index_create(stream->stream_handle, net_seq_num);
-               if (!index) {
-                       goto end_rcu_unlock;
-               }
-               index_created = 1;
-       }
-
-       if (rotate_index || stream->index_fd < 0) {
-               index->to_close_fd = stream->index_fd;
-               ret = index_create_file(stream->path_name, stream->channel_name,
-                               relayd_uid, relayd_gid, stream->tracefile_size,
-                               stream->tracefile_count_current);
-               if (ret < 0) {
-                       /* This will close the stream's index fd if one. */
-                       relay_index_free_safe(index);
-                       goto end_rcu_unlock;
-               }
-               stream->index_fd = ret;
-       }
-       index->fd = stream->index_fd;
-       index->index_data.offset = data_offset;
-
-       if (index_created) {
-               /*
-                * Try to add the relay index object to the hash table. If an object
-                * already exist, destroy back the index created and set the data.
-                */
-               relay_index_add(index, indexes_ht, &wr_index);
-               if (wr_index) {
-                       /* Copy back data from the created index. */
-                       wr_index->fd = index->fd;
-                       wr_index->to_close_fd = index->to_close_fd;
-                       wr_index->index_data.offset = data_offset;
-                       free(index);
-               }
-       } else {
-               /* The index already exists so write it on disk. */
-               wr_index = index;
-       }
-
-       /* Do we have a writable ready index to write on disk. */
-       if (wr_index) {
-               /* Starting at 2.4, create the index file if none available. */
-               if (cmd->minor >= 4 && stream->index_fd < 0) {
-                       ret = index_create_file(stream->path_name, stream->channel_name,
-                                       relayd_uid, relayd_gid, stream->tracefile_size,
-                                       stream->tracefile_count_current);
-                       if (ret < 0) {
-                               goto end_rcu_unlock;
-                       }
-                       stream->index_fd = ret;
-               }
-
-               ret = relay_index_write(wr_index->fd, wr_index, indexes_ht);
+       if (stream->session->minor >= 4 && !stream->session->snapshot) {
+               ret = handle_index_data(stream, net_seq_num, rotate_index);
                if (ret < 0) {
                        goto end_rcu_unlock;
                }
        }
 
+       /* Write data to stream output fd. */
        do {
                ret = write(stream->fd, data_buffer, data_size);
        } while (ret < 0 && errno == EINTR);
@@ -1895,24 +2066,7 @@ int relay_process_data(struct relay_command *cmd, struct lttng_ht *streams_ht,
 
        /* Check if we need to close the FD */
        if (close_stream_check(stream)) {
-               int cret;
-               struct lttng_ht_iter iter;
-
-               cret = close(stream->fd);
-               if (cret < 0) {
-                       PERROR("close stream process data");
-               }
-
-               cret = close(stream->index_fd);
-               if (cret < 0) {
-                       PERROR("close stream index_fd");
-               }
-               iter.iter.node = &stream->stream_n.node;
-               ret = lttng_ht_del(streams_ht, &iter);
-               assert(!ret);
-               call_rcu(&stream->rcu_node,
-                       deferred_free_stream);
-               DBG("Closed tracefile %d after recv data", stream->fd);
+               destroy_stream(stream, cmd->ctf_traces_ht);
        }
 
 end_rcu_unlock:
@@ -1954,6 +2108,11 @@ int relay_add_connection(int fd, struct lttng_poll_event *events,
                goto error_read;
        }
 
+       relay_connection->ctf_traces_ht = lttng_ht_new(0, LTTNG_HT_TYPE_STRING);
+       if (!relay_connection->ctf_traces_ht) {
+               goto error_read;
+       }
+
        lttng_ht_node_init_ulong(&relay_connection->sock_n,
                        (unsigned long) relay_connection->sock->fd);
        rcu_read_lock();
@@ -1976,21 +2135,22 @@ void deferred_free_connection(struct rcu_head *head)
        struct relay_command *relay_connection =
                caa_container_of(head, struct relay_command, rcu_node);
 
+       lttng_ht_destroy(relay_connection->ctf_traces_ht);
        lttcomm_destroy_sock(relay_connection->sock);
        free(relay_connection);
 }
 
 static
 void relay_del_connection(struct lttng_ht *relay_connections_ht,
-               struct lttng_ht *streams_ht, struct lttng_ht_iter *iter,
-               struct relay_command *relay_connection)
+               struct lttng_ht_iter *iter, struct relay_command *relay_connection,
+               struct lttng_ht *sessions_ht)
 {
        int ret;
 
        ret = lttng_ht_del(relay_connections_ht, iter);
        assert(!ret);
        if (relay_connection->type == RELAY_CONTROL) {
-               relay_delete_session(relay_connection, streams_ht);
+               relay_delete_session(relay_connection, sessions_ht);
        }
 
        call_rcu(&relay_connection->rcu_node,
@@ -2010,26 +2170,24 @@ void *relay_thread_worker(void *data)
        struct lttng_ht *relay_connections_ht;
        struct lttng_ht_node_ulong *node;
        struct lttng_ht_iter iter;
-       struct lttng_ht *streams_ht;
-       struct lttng_ht *index_streams_ht;
        struct lttcomm_relayd_hdr recv_hdr;
+       struct relay_local_data *relay_ctx = (struct relay_local_data *) data;
+       struct lttng_ht *sessions_ht = relay_ctx->sessions_ht;
 
        DBG("[thread] Relay worker started");
 
        rcu_register_thread();
 
+       health_register(health_relayd, HEALTH_RELAYD_TYPE_WORKER);
+
+       health_code_update();
+
        /* table of connections indexed on socket */
        relay_connections_ht = lttng_ht_new(0, LTTNG_HT_TYPE_ULONG);
        if (!relay_connections_ht) {
                goto relay_connections_ht_error;
        }
 
-       /* tables of streams indexed by stream ID */
-       streams_ht = lttng_ht_new(0, LTTNG_HT_TYPE_ULONG);
-       if (!streams_ht) {
-               goto streams_ht_error;
-       }
-
        /* Tables of received indexes indexed by index handle and net_seq_num. */
        indexes_ht = lttng_ht_new(0, LTTNG_HT_TYPE_TWO_U64);
        if (!indexes_ht) {
@@ -2050,9 +2208,13 @@ restart:
        while (1) {
                int idx = -1, i, seen_control = 0, last_notdel_data_fd = -1;
 
+               health_code_update();
+
                /* Infinite blocking call, waiting for transmission */
                DBG3("Relayd worker thread polling...");
+               health_poll_entry();
                ret = lttng_poll_wait(&events, -1);
+               health_poll_exit();
                if (ret < 0) {
                        /*
                         * Restart interrupted system call.
@@ -2075,6 +2237,8 @@ restart:
                        uint32_t revents = LTTNG_POLL_GETEV(&events, i);
                        int pollfd = LTTNG_POLL_GETFD(&events, i);
 
+                       health_code_update();
+
                        /* Thread quit pipe has been closed. Killing thread. */
                        ret = check_thread_quit_pipe(pollfd, revents);
                        if (ret) {
@@ -2113,8 +2277,7 @@ restart:
                                        ERR("POLL ERROR");
                                        relay_cleanup_poll_connection(&events, pollfd);
                                        relay_del_connection(relay_connections_ht,
-                                                       streams_ht, &iter,
-                                                       relay_connection);
+                                                       &iter, relay_connection, sessions_ht);
                                        if (last_seen_data_fd == pollfd) {
                                                last_seen_data_fd = last_notdel_data_fd;
                                        }
@@ -2122,8 +2285,7 @@ restart:
                                        DBG("Socket %d hung up", pollfd);
                                        relay_cleanup_poll_connection(&events, pollfd);
                                        relay_del_connection(relay_connections_ht,
-                                                       streams_ht, &iter,
-                                                       relay_connection);
+                                                       &iter, relay_connection, sessions_ht);
                                        if (last_seen_data_fd == pollfd) {
                                                last_seen_data_fd = last_notdel_data_fd;
                                        }
@@ -2137,8 +2299,7 @@ restart:
                                                if (ret <= 0) {
                                                        relay_cleanup_poll_connection(&events, pollfd);
                                                        relay_del_connection(relay_connections_ht,
-                                                                       streams_ht, &iter,
-                                                                       relay_connection);
+                                                                       &iter, relay_connection, sessions_ht);
                                                        DBG("Control connection closed with %d", pollfd);
                                                } else {
                                                        if (relay_connection->session) {
@@ -2146,16 +2307,12 @@ restart:
                                                                                relay_connection->session->id);
                                                        }
                                                        ret = relay_process_control(&recv_hdr,
-                                                                       relay_connection,
-                                                                       streams_ht,
-                                                                       index_streams_ht,
-                                                                       indexes_ht);
+                                                                       relay_connection, relay_ctx);
                                                        if (ret < 0) {
                                                                /* Clear the session on error. */
                                                                relay_cleanup_poll_connection(&events, pollfd);
                                                                relay_del_connection(relay_connections_ht,
-                                                                               streams_ht, &iter,
-                                                                               relay_connection);
+                                                                               &iter, relay_connection, sessions_ht);
                                                                DBG("Connection closed with %d", pollfd);
                                                        }
                                                        seen_control = 1;
@@ -2184,6 +2341,9 @@ restart:
                if (last_seen_data_fd >= 0) {
                        for (i = 0; i < nb_fd; i++) {
                                int pollfd = LTTNG_POLL_GETFD(&events, i);
+
+                               health_code_update();
+
                                if (last_seen_data_fd == pollfd) {
                                        idx = i;
                                        break;
@@ -2197,6 +2357,8 @@ restart:
                        uint32_t revents = LTTNG_POLL_GETEV(&events, i);
                        int pollfd = LTTNG_POLL_GETFD(&events, i);
 
+                       health_code_update();
+
                        /* Skip the command pipe. It's handled in the first loop. */
                        if (pollfd == relay_cmd_pipe[0]) {
                                continue;
@@ -2221,14 +2383,12 @@ restart:
                                                continue;
                                        }
 
-                                       ret = relay_process_data(relay_connection, streams_ht,
-                                                       indexes_ht);
+                                       ret = relay_process_data(relay_connection);
                                        /* connection closed */
                                        if (ret < 0) {
                                                relay_cleanup_poll_connection(&events, pollfd);
                                                relay_del_connection(relay_connections_ht,
-                                                               streams_ht, &iter,
-                                                               relay_connection);
+                                                               &iter, relay_connection, sessions_ht);
                                                DBG("Data connection closed with %d", pollfd);
                                                /*
                                                 * Every goto restart call sets the last seen fd where
@@ -2248,6 +2408,9 @@ restart:
                last_seen_data_fd = -1;
        }
 
+       /* Normal exit, no error */
+       ret = 0;
+
 exit:
 error:
        lttng_poll_clean(&events);
@@ -2255,21 +2418,20 @@ error:
        /* empty the hash table and free the memory */
        rcu_read_lock();
        cds_lfht_for_each_entry(relay_connections_ht->ht, &iter.iter, node, node) {
+               health_code_update();
+
                node = lttng_ht_iter_get_node_ulong(&iter);
                if (node) {
                        relay_connection = caa_container_of(node,
                                        struct relay_command, sock_n);
                        relay_del_connection(relay_connections_ht,
-                                       streams_ht, &iter,
-                                       relay_connection);
+                                       &iter, relay_connection, sessions_ht);
                }
        }
        rcu_read_unlock();
 error_poll_create:
        lttng_ht_destroy(indexes_ht);
 indexes_ht_error:
-       lttng_ht_destroy(streams_ht);
-streams_ht_error:
        lttng_ht_destroy(relay_connections_ht);
 relay_connections_ht_error:
        /* Close relay cmd pipes */
@@ -2279,8 +2441,13 @@ relay_connections_ht_error:
        }
        DBG("Worker thread cleanup complete");
        free(data_buffer);
-       stop_threads();
+       if (err) {
+               health_error();
+               ERR("Health error occurred in %s", __func__);
+       }
+       health_unregister(health_relayd);
        rcu_unregister_thread();
+       stop_threads();
        return NULL;
 }
 
@@ -2304,6 +2471,7 @@ int main(int argc, char **argv)
 {
        int ret = 0;
        void *status;
+       struct relay_local_data *relay_ctx;
 
        /* Create thread quit pipe */
        if ((ret = init_thread_quit_pipe()) < 0) {
@@ -2370,6 +2538,37 @@ int main(int argc, char **argv)
        /* Initialize communication library */
        lttcomm_init();
 
+       relay_ctx = zmalloc(sizeof(struct relay_local_data));
+       if (!relay_ctx) {
+               PERROR("relay_ctx");
+               goto exit;
+       }
+
+       /* tables of sessions indexed by session ID */
+       relay_ctx->sessions_ht = lttng_ht_new(0, LTTNG_HT_TYPE_ULONG);
+       if (!relay_ctx->sessions_ht) {
+               goto exit_relay_ctx_sessions;
+       }
+
+       /* tables of streams indexed by stream ID */
+       relay_streams_ht = lttng_ht_new(0, LTTNG_HT_TYPE_ULONG);
+       if (!relay_streams_ht) {
+               goto exit_relay_ctx_streams;
+       }
+
+       /* tables of viewer streams indexed by stream ID */
+       viewer_streams_ht = lttng_ht_new(0, LTTNG_HT_TYPE_U64);
+       if (!viewer_streams_ht) {
+               goto exit_relay_ctx_viewer_streams;
+       }
+
+       /* Initialize thread health monitoring */
+       health_relayd = health_app_create(NR_HEALTH_RELAYD_TYPES);
+       if (!health_relayd) {
+               PERROR("health_app_create error");
+               goto exit_health_app_create;
+       }
+
        /* Setup the dispatcher thread */
        ret = pthread_create(&dispatcher_thread, NULL,
                        relay_thread_dispatcher, (void *) NULL);
@@ -2380,7 +2579,7 @@ int main(int argc, char **argv)
 
        /* Setup the worker thread */
        ret = pthread_create(&worker_thread, NULL,
-                       relay_thread_worker, (void *) NULL);
+                       relay_thread_worker, (void *) relay_ctx);
        if (ret != 0) {
                PERROR("pthread_create worker");
                goto exit_worker;
@@ -2394,27 +2593,50 @@ int main(int argc, char **argv)
                goto exit_listener;
        }
 
-exit_listener:
+       ret = live_start_threads(live_uri, relay_ctx, thread_quit_pipe);
+       if (ret != 0) {
+               ERR("Starting live viewer threads");
+               goto exit_live;
+       }
+
+       live_stop_threads();
+
+exit_live:
        ret = pthread_join(listener_thread, &status);
        if (ret != 0) {
                PERROR("pthread_join");
                goto error;     /* join error, exit without cleanup */
        }
 
-exit_worker:
+exit_listener:
        ret = pthread_join(worker_thread, &status);
        if (ret != 0) {
                PERROR("pthread_join");
                goto error;     /* join error, exit without cleanup */
        }
 
-exit_dispatcher:
+exit_worker:
        ret = pthread_join(dispatcher_thread, &status);
        if (ret != 0) {
                PERROR("pthread_join");
                goto error;     /* join error, exit without cleanup */
        }
 
+exit_dispatcher:
+       health_app_destroy(health_relayd);
+
+exit_health_app_create:
+       lttng_ht_destroy(viewer_streams_ht);
+
+exit_relay_ctx_viewer_streams:
+       lttng_ht_destroy(relay_streams_ht);
+
+exit_relay_ctx_streams:
+       lttng_ht_destroy(relay_ctx->sessions_ht);
+
+exit_relay_ctx_sessions:
+       free(relay_ctx);
+
 exit:
        cleanup();
        if (!ret) {
This page took 0.03831 seconds and 4 git commands to generate.