From 6a4e403927ffef4cae8726064dcf53c463eb128c Mon Sep 17 00:00:00 2001 From: =?utf8?q?J=C3=A9r=C3=A9mie=20Galarneau?= Date: Sat, 25 Jul 2015 17:48:12 -0400 Subject: [PATCH] Fix: clean-up agent app hash table from the main sessiond thread MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The agent application hash table, which is allocated by the session daemon's main thread, is free'd from the agent application registration thread. This leads to a number of interesting scenarios under which the agent app registration thread may encounter an error, thus tearing itself down and freeing the agent_apps_ht_by_sock hash table. Of course, nothing then prevents the client processing thread from accessing this invalidated hash table to list, enable or disable agent events which leads to crashes or assertions hitting in ht_match_reg_uid(). However, it is not necessary for the agent app registration thread to encounter an error for this to prove problematic. As shown in bug #893, the session daemon's teardown will assert on a NULL key in ht_match_reg_uid() whenever it is performed while a JUL, Log4J or Python event is still enabled in a session. This happens because the session daemon's clean-up triggers the destruction of all sessions. The destruction of those sessions would access the free'd agent_apps_ht_by_sock to disable the registered agent events. Fixes #893 Signed-off-by: Jérémie Galarneau --- src/bin/lttng-sessiond/agent-thread.c | 57 ++------------------------- src/bin/lttng-sessiond/agent.c | 56 ++++++++++++++++++++++++-- src/bin/lttng-sessiond/agent.h | 7 +++- src/bin/lttng-sessiond/main.c | 12 ++++-- 4 files changed, 69 insertions(+), 63 deletions(-) diff --git a/src/bin/lttng-sessiond/agent-thread.c b/src/bin/lttng-sessiond/agent-thread.c index 9ac975cbf..d1bb122c1 100644 --- a/src/bin/lttng-sessiond/agent-thread.c +++ b/src/bin/lttng-sessiond/agent-thread.c @@ -28,6 +28,7 @@ #include "fd-limit.h" #include "agent-thread.h" +#include "agent.h" #include "lttng-sessiond.h" #include "session.h" #include "utils.h" @@ -73,53 +74,6 @@ static void update_agent_app(struct agent_app *app) session_unlock_list(); } -/* - * Destroy a agent application by socket. - */ -static void destroy_agent_app(int sock) -{ - struct agent_app *app; - - assert(sock >= 0); - - /* - * Not finding an application is a very important error that should NEVER - * happen. The hash table deletion is ONLY done through this call even on - * thread cleanup. - */ - rcu_read_lock(); - app = agent_find_app_by_sock(sock); - assert(app); - - /* RCU read side lock is assumed to be held by this function. */ - agent_delete_app(app); - - /* The application is freed in a RCU call but the socket is closed here. */ - agent_destroy_app(app); - rcu_read_unlock(); -} - -/* - * Cleanup remaining agent apps in the hash table. This should only be called in - * the exit path of the thread. - */ -static void clean_agent_apps_ht(void) -{ - struct lttng_ht_node_ulong *node; - struct lttng_ht_iter iter; - - DBG3("[agent-thread] Cleaning agent apps ht"); - - rcu_read_lock(); - cds_lfht_for_each_entry(agent_apps_ht_by_sock->ht, &iter.iter, node, node) { - struct agent_app *app; - - app = caa_container_of(node, struct agent_app, node); - destroy_agent_app(app->sock->fd); - } - rcu_read_unlock(); -} - /* * Create and init socket from uri. */ @@ -353,7 +307,7 @@ restart: goto error; } - destroy_agent_app(pollfd); + agent_destroy_app_by_sock(pollfd); } else if (revents & (LPOLLIN)) { int new_fd; struct agent_app *app = NULL; @@ -374,7 +328,7 @@ restart: ret = lttng_poll_add(&events, new_fd, LPOLLERR | LPOLLHUP | LPOLLRDHUP); if (ret < 0) { - destroy_agent_app(new_fd); + agent_destroy_app_by_sock(new_fd); continue; } @@ -400,11 +354,6 @@ error_tcp_socket: error_poll_create: DBG("[agent-thread] is cleaning up and stopping."); - if (agent_apps_ht_by_sock) { - clean_agent_apps_ht(); - lttng_ht_destroy(agent_apps_ht_by_sock); - } - rcu_thread_offline(); rcu_unregister_thread(); return NULL; diff --git a/src/bin/lttng-sessiond/agent.c b/src/bin/lttng-sessiond/agent.c index 5d1bbccae..45a3045dd 100644 --- a/src/bin/lttng-sessiond/agent.c +++ b/src/bin/lttng-sessiond/agent.c @@ -961,16 +961,64 @@ void agent_destroy(struct agent *agt) } /* - * Initialize agent subsystem. + * Allocate agent_apps_ht_by_sock. */ -int agent_setup(void) +int agent_app_ht_alloc(void) { + int ret = 0; + agent_apps_ht_by_sock = lttng_ht_new(0, LTTNG_HT_TYPE_ULONG); if (!agent_apps_ht_by_sock) { - return -1; + ret = -1; } - return 0; + return ret; +} + +/* + * Destroy a agent application by socket. + */ +void agent_destroy_app_by_sock(int sock) +{ + struct agent_app *app; + + assert(sock >= 0); + + /* + * Not finding an application is a very important error that should NEVER + * happen. The hash table deletion is ONLY done through this call when the + * main sessiond thread is torn down. + */ + rcu_read_lock(); + app = agent_find_app_by_sock(sock); + assert(app); + + /* RCU read side lock is assumed to be held by this function. */ + agent_delete_app(app); + + /* The application is freed in a RCU call but the socket is closed here. */ + agent_destroy_app(app); + rcu_read_unlock(); +} + +/* + * Clean-up the agent app hash table and destroy it. + */ +void agent_app_ht_clean(void) +{ + struct lttng_ht_node_ulong *node; + struct lttng_ht_iter iter; + + rcu_read_lock(); + cds_lfht_for_each_entry(agent_apps_ht_by_sock->ht, &iter.iter, node, node) { + struct agent_app *app; + + app = caa_container_of(node, struct agent_app, node); + agent_destroy_app_by_sock(app->sock->fd); + } + rcu_read_unlock(); + + lttng_ht_destroy(agent_apps_ht_by_sock); } /* diff --git a/src/bin/lttng-sessiond/agent.h b/src/bin/lttng-sessiond/agent.h index 6564b88c8..d08ad6e22 100644 --- a/src/bin/lttng-sessiond/agent.h +++ b/src/bin/lttng-sessiond/agent.h @@ -117,8 +117,10 @@ struct agent { struct lttng_ht_node_u64 node; }; -/* Setup agent subsystem. */ -int agent_setup(void); +/* Allocate agent apps hash table */ +int agent_app_ht_alloc(void); +/* Clean-up agent apps hash table */ +void agent_app_ht_clean(void); /* Initialize an already allocated agent domain. */ int agent_init(struct agent *agt); @@ -145,6 +147,7 @@ void agent_add_app(struct agent_app *app); void agent_delete_app(struct agent_app *app); struct agent_app *agent_find_app_by_sock(int sock); void agent_destroy_app(struct agent_app *app); +void agent_destroy_app_by_sock(int sock); int agent_send_registration_done(struct agent_app *app); /* Agent action API */ diff --git a/src/bin/lttng-sessiond/main.c b/src/bin/lttng-sessiond/main.c index aff833243..2b99bbc99 100644 --- a/src/bin/lttng-sessiond/main.c +++ b/src/bin/lttng-sessiond/main.c @@ -676,6 +676,9 @@ static void sessiond_cleanup(void) } } + DBG("Cleaning up all agent apps"); + agent_app_ht_clean(); + DBG("Closing all UST sockets"); ust_app_clean_list(); buffer_reg_destroy_registries(); @@ -5635,9 +5638,12 @@ int main(int argc, char **argv) goto exit_init_data; } - /* Initialize agent domain subsystem. */ - if (agent_setup()) { - /* ENOMEM at this point. */ + /* + * Initialize agent app hash table. We allocate the hash table here + * since cleanup() can get called after this point. + */ + if (agent_app_ht_alloc()) { + ERR("Failed to allocate Agent app hash table"); retval = -1; goto exit_init_data; } -- 2.34.1