Tests: Add test to check shared-memory FD leaks after relayd dies [master]
author Kienan Stewart <kstewart@efficios.com>
Wed, 20 Mar 2024 20:02:53 +0000 (16:02 -0400)
committer Jérémie Galarneau <jeremie.galarneau@efficios.com>
Wed, 24 Apr 2024 20:36:36 +0000 (16:36 -0400)
Refs: https://bugs.lttng.org/issues/1411

Change-Id: I9804011320c28a9867af1fdc6a8d82ad0671fe3d
Signed-off-by: Kienan Stewart <kstewart@efficios.com>
Signed-off-by: Jérémie Galarneau <jeremie.galarneau@efficios.com>
tests/regression/Makefile.am
tests/regression/tools/live/Makefile.am
tests/regression/tools/live/test_per_application_leaks.py [new file with mode: 0755]
tests/utils/lttngtest/environment.py

index ca3fb2d52d281968f9ffb9daeca2e47035c4bb53..da5cb1b57c1d88b96cac9f178599bceb50ee7b7d 100644 (file)
@@ -16,6 +16,7 @@ TESTS = tools/base-path/test_ust \
        tools/health/test_thread_ok \
        tools/live/test_kernel \
        tools/live/test_lttng_kernel \
        tools/health/test_thread_ok \
        tools/live/test_kernel \
        tools/live/test_lttng_kernel \
+       tools/live/test_per_application_leaks.py \
        tools/live/test_ust \
        tools/live/test_ust_tracefile_count \
        tools/live/test_lttng_ust \
        tools/live/test_ust \
        tools/live/test_ust_tracefile_count \
        tools/live/test_lttng_ust \
index 494b982df1989ec0d4698b39f443f7b41c8176cd..3ad9b7ba35f23e4b5a296c3dc7391acbc6d7e430 100644 (file)
@@ -6,7 +6,7 @@ LIBTAP=$(top_builddir)/tests/utils/tap/libtap.la
 LIBLTTNG_SESSIOND_COMMON=$(top_builddir)/src/bin/lttng-sessiond/liblttng-sessiond-common.la
 
 noinst_PROGRAMS = live_test
 LIBLTTNG_SESSIOND_COMMON=$(top_builddir)/src/bin/lttng-sessiond/liblttng-sessiond-common.la
 
 noinst_PROGRAMS = live_test
-EXTRA_DIST = test_kernel test_lttng_kernel
+EXTRA_DIST = test_kernel test_lttng_kernel test_per_application_leaks.py
 
 if HAVE_LIBLTTNG_UST_CTL
 EXTRA_DIST += test_ust test_ust_tracefile_count test_lttng_ust
 
 if HAVE_LIBLTTNG_UST_CTL
 EXTRA_DIST += test_ust test_ust_tracefile_count test_lttng_ust
diff --git a/tests/regression/tools/live/test_per_application_leaks.py b/tests/regression/tools/live/test_per_application_leaks.py
new file mode 100755 (executable)
index 0000000..1d12049
--- /dev/null
@@ -0,0 +1,155 @@
+#!/usr/bin/env python3
+#
+# SPDX-FileCopyrightText: Kienan Stewart <kstewart@efficios.com>
+# SPDX-License-Identifier: GPL-2.0-only
+
+"""
+Test that the consumerd doesn't leak file descriptor allocations in /dev/shm
+when the relayd exits before instrumented applications start.
+
+@see https://bugs.lttng.org/issues/1411
+"""
+
+import os
+import pathlib
+import subprocess
+import sys
+
+test_utils_import_path = pathlib.Path(__file__).absolute().parents[3] / "utils"
+sys.path.append(str(test_utils_import_path))
+
+import lttngtest
+
+
+def get_consumerd_pid(tap, parent, match_string):
+    pid = 0
+    try:
+        process = subprocess.Popen(
+            ["pgrep", "-P", str(parent), "-f", match_string],
+            stdout=subprocess.PIPE,
+        )
+        process.wait()
+        output = str(process.stdout.read(), encoding="UTF-8").splitlines()
+        if len(output) != 1:
+            raise Exception(
+                "Unexpected number of output lines (got {}): {}".format(
+                    len(output), output
+                )
+            )
+        pid = int(output[0])
+    except Exception as e:
+        tap.diagnostic(
+            "Failed to find child process of '{}' matching '{}': '{}'".format(
+                parent, match_string, str(e)
+            )
+        )
+    return pid
+
+
+def count_process_dev_shm_fds(pid):
+    count = 0
+    if pid == 0:
+        return count
+    dir = os.path.join("/proc", str(pid), "fd")
+    for root, dirs, files in os.walk(dir):
+        for f in files:
+            filename = pathlib.Path(os.path.join(root, f))
+            try:
+                if filename.is_symlink() and str(filename.resolve()).startswith(
+                    "/dev/shm/shm-ust-consumer"
+                ):
+                    count += 1
+            except FileNotFoundError:
+                # As we're walking /proc/XX/fd/, fds may be added or removed
+                continue
+    return count
+
+
+def count_dev_shm_fds(tap, test_env):
+    consumer32_pid = get_consumerd_pid(tap, test_env._sessiond.pid, "ustconsumerd32")
+    fds_consumerd32 = count_process_dev_shm_fds(consumer32_pid)
+    consumer64_pid = get_consumerd_pid(tap, test_env._sessiond.pid, "ustconsumerd64")
+    fds_consumerd64 = count_process_dev_shm_fds(consumer64_pid)
+    return (fds_consumerd32, fds_consumerd64)
+
+
+def test_fd_leak(tap, test_env, buffer_sharing_policy, kill_relayd=True):
+    tap.diagnostic(
+        "test_fd_leak with buffer sharing policy {}, kill relayd: {}".format(
+            buffer_sharing_policy, kill_relayd
+        )
+    )
+    client = lttngtest.LTTngClient(test_env, log=tap.diagnostic)
+    output = lttngtest.NetworkSessionOutputLocation(
+        "net://localhost:{}:{}/".format(
+            test_env.lttng_relayd_control_port, test_env.lttng_relayd_data_port
+        )
+    )
+
+    session = client.create_session(output=output, live=True)
+    channel = session.add_channel(
+        lttngtest.lttngctl.TracingDomain.User,
+        buffer_sharing_policy=buffer_sharing_policy,
+    )
+    channel.add_recording_rule(lttngtest.lttngctl.UserTracepointEventRule())
+    session.start()
+
+    count_post_start = count_dev_shm_fds(tap, test_env)
+
+    # Kill the relayd
+    if kill_relayd:
+        test_env._terminate_relayd()
+
+    test_env.launch_wait_trace_test_application(10)
+    count_post_app1 = count_dev_shm_fds(tap, test_env)
+
+    test_env.launch_wait_trace_test_application(10)
+    count_post_app2 = count_dev_shm_fds(tap, test_env)
+
+    test_env.launch_wait_trace_test_application(10)
+    count_post_app3 = count_dev_shm_fds(tap, test_env)
+
+    session.stop()
+    session.destroy()
+
+    count_post_destroy = count_dev_shm_fds(tap, test_env)
+
+    tap.diagnostic(
+        "FD counts post-start: {}, post-destroy: {}".format(
+            count_post_start, count_post_destroy
+        )
+    )
+    tap.test(
+        count_post_start == count_post_destroy,
+        "Count of consumerd FDs in /dev/shm are equal after session start then after destroy",
+    )
+
+    tap.diagnostic(
+        "FD counts post-app-1: {}, post-app-2: {}, post-app-3: {}".format(
+            count_post_app1, count_post_app2, count_post_app3
+        )
+    )
+    if buffer_sharing_policy == lttngtest.lttngctl.BufferSharingPolicy.PerUID:
+        tap.test(
+            (count_post_app1 == count_post_app2)
+            and (count_post_app2 == count_post_app3),
+            "Count of consumerd FDs in /dev/shm doesn't leak over several application invocations",
+        )
+    else:
+        tap.skip(
+            "Count of consumerds FDs in /dev/shm doesn't leak over several application invocations - no mechanism is available to guarantee buffer reclamation within a given time frame"
+        )
+
+
+tap = lttngtest.TapGenerator(8)
+for kill_relayd in [True, False]:
+    for buffer_sharing_policy in [
+        lttngtest.lttngctl.BufferSharingPolicy.PerUID,
+        lttngtest.lttngctl.BufferSharingPolicy.PerPID,
+    ]:
+        with lttngtest.test_environment(
+            log=tap.diagnostic, with_relayd=True, with_sessiond=True
+        ) as test_env:
+            test_fd_leak(tap, test_env, buffer_sharing_policy, kill_relayd)
+
+sys.exit(0 if tap.is_successful else 1)
index 74dcfb8d26cebf89c66fbd187d77e4eef1a0dcc6..73d18874b293f1fa23e7c7e2dd267b33e81dd149 100644 (file)
@@ -618,14 +618,20 @@ class _Environment(logger._Logger):
         Launch an application that will trace from within constructors.
         """
         return _TraceTestApplication(
         Launch an application that will trace from within constructors.
         """
         return _TraceTestApplication(
-            self._project_root
-            / "tests"
-            / "utils"
-            / "testapp"
-            / subpath,
+            self._project_root / "tests" / "utils" / "testapp" / subpath,
             self,
         )
 
             self,
         )
 
+    def _terminate_relayd(self):
+        if self._relayd and self._relayd.poll() is None:
+            self._relayd.terminate()
+            self._relayd.wait()
+            if self._relayd_output_consumer:
+                self._relayd_output_consumer.join()
+                self._relayd_output_consumer = None
+            self._log("Relayd killed")
+            self._relayd = None
+
     # Clean-up managed processes
     def _cleanup(self):
         # type: () -> None
     # Clean-up managed processes
     def _cleanup(self):
         # type: () -> None
@@ -646,14 +652,7 @@ class _Environment(logger._Logger):
             self._log("Session daemon killed")
             self._sessiond = None
 
             self._log("Session daemon killed")
             self._sessiond = None
 
-        if self._relayd and self._relayd.poll() is None:
-            self._relayd.terminate()
-            self._relayd.wait()
-            if self._relayd_output_consumer:
-                self._relayd_output_consumer.join()
-                self._relayd_output_consumer = None
-            self._log("Relayd killed")
-            self._relayd = None
+        self._terminate_relayd()
 
         self._lttng_home = None
 
 
         self._lttng_home = None
 
This page took 0.033843 seconds and 4 git commands to generate.