tests: Make test_per_application_leaks more robust
[lttng-tools.git] / tests / regression / tools / live / test_per_application_leaks.py
1 #!/usr/bin/env python3
2 #
3 # SPDX-FileCopyrightText: Kienan Stewart <kstewart@efficios.com>
4 # SPDX-License-Identifier: GPL-2.0-only
5
6 """
7 Test that the consumerd doesn't leak file descriptor allocations in /dev/shm
8 when the relayd exits before instrumented applications start.
9
10 @see https://bugs.lttng.org/issues/1411
11 """
12
13 import os
14 import pathlib
15 import subprocess
16 import sys
17 import time
18
19 test_utils_import_path = pathlib.Path(__file__).absolute().parents[3] / "utils"
20 sys.path.append(str(test_utils_import_path))
21
22 import lttngtest
23
24
def get_consumerd_pid(tap, parent, match_string):
    """
    Return the PID of the child of `parent` whose command line matches
    `match_string`, or None when no such child is found.

    The lookup is delegated to `pgrep -P <parent> -f <match_string>`.

    This is a best-effort helper: any failure (pgrep cannot be executed, its
    output cannot be parsed as an integer, more than one process matches) is
    reported through `tap.diagnostic()` and None is returned instead of
    propagating the exception.
    """
    pid = None
    try:
        # subprocess.run() drains stdout while waiting for pgrep to exit and
        # closes the pipe afterwards. Calling wait() before reading a PIPE
        # can dead-lock once the pipe buffer fills up, and leaves the read
        # end of the pipe open in this process.
        result = subprocess.run(
            ["pgrep", "-P", str(parent), "-f", match_string],
            stdout=subprocess.PIPE,
        )
        output = str(result.stdout, encoding="UTF-8").splitlines()
        if len(output) > 1:
            raise Exception(
                "Unexpected number of output lines (got {}): {}".format(
                    len(output), output
                )
            )
        elif len(output) == 1:
            pid = int(output[0])
    except Exception as e:
        # Deliberately broad: the caller treats "no PID" as "nothing to
        # count", so every failure is logged rather than raised.
        tap.diagnostic(
            "Failed to find child process of '{}' matching '{}': '{}'".format(
                parent, match_string, str(e)
            )
        )
    return pid
49
50
def count_process_dev_shm_fds(pid):
    """
    Count the open file descriptors of process `pid` that resolve to a path
    starting with "/dev/shm/shm-ust-consumer".

    The descriptors are discovered by walking /proc/<pid>/fd. A `pid` of
    None yields a count of 0.
    """
    if pid is None:
        return 0

    shm_prefix = "/dev/shm/shm-ust-consumer"
    fd_directory = os.path.join("/proc", str(pid), "fd")

    matching_fds = 0
    for current_dir, _subdirs, entries in os.walk(fd_directory):
        for entry in entries:
            fd_path = pathlib.Path(os.path.join(current_dir, entry))
            try:
                # The symlink in /proc/PID may exist, but point to an
                # unlinked file - shm_unlink is called but either the kernel
                # hasn't yet finished the clean-up or the consumer hasn't
                # called close() on the FD yet.
                if not fd_path.is_symlink():
                    continue
                target = str(fd_path.resolve())
            except FileNotFoundError:
                # As /proc/XX/fd/ is being walked, fds may be added or
                # removed
                continue
            if target.startswith(shm_prefix):
                matching_fds += 1
    return matching_fds
72
73
def count_dev_shm_fds(tap, test_env):
    """
    Return a `(count_32, count_64)` tuple holding the number of
    /dev/shm/shm-ust-consumer file descriptors held by the children of the
    session daemon (`test_env._sessiond.pid`) matching "ustconsumerd32" and
    "ustconsumerd64" respectively.

    A consumer daemon that cannot be found contributes a count of 0.
    """
    per_consumerd_counts = []
    for consumerd_name in ("ustconsumerd32", "ustconsumerd64"):
        consumerd_pid = get_consumerd_pid(
            tap, test_env._sessiond.pid, consumerd_name
        )
        per_consumerd_counts.append(count_process_dev_shm_fds(consumerd_pid))
    return tuple(per_consumerd_counts)
80
81
def test_fd_leak(
    tap,
    test_env,
    buffer_sharing_policy,
    kill_relayd=True,
    destroy_timeout=60.0,
):
    """
    Check that the consumer daemons do not accumulate /dev/shm file
    descriptors over the lifetime of a live session.

    A live session streaming to the test environment's relay daemon is
    created with a user space channel using `buffer_sharing_policy`, and
    started. The relay daemon is then optionally terminated (`kill_relayd`),
    three test applications are run one after the other, and the session is
    stopped and destroyed. The consumer daemons' shm FD counts are sampled
    after the session start, after each application, and after the destroy.

    Two TAP results are emitted:
      1. the post-destroy count is back to the post-start count;
      2. the count is stable across the application runs (per-UID buffers
         only, skipped otherwise).

    `destroy_timeout` is the maximum time, in seconds, to wait for the
    post-destroy count to drop back to the post-start level before the first
    result is reported as a failure. Pass None to wait indefinitely.
    """
    tap.diagnostic(
        "test_fd_leak with buffer sharing policy {}, kill relayd: {}".format(
            buffer_sharing_policy, kill_relayd
        )
    )
    client = lttngtest.LTTngClient(test_env, log=tap.diagnostic)
    output = lttngtest.NetworkSessionOutputLocation(
        "net://localhost:{}:{}/".format(
            test_env.lttng_relayd_control_port, test_env.lttng_relayd_data_port
        )
    )

    session = client.create_session(output=output, live=True)
    channel = session.add_channel(
        lttngtest.lttngctl.TracingDomain.User,
        buffer_sharing_policy=buffer_sharing_policy,
    )
    channel.add_recording_rule(lttngtest.lttngctl.UserTracepointEventRule())
    session.start()

    count_post_start = count_dev_shm_fds(tap, test_env)

    # Kill the relayd
    if kill_relayd:
        test_env._terminate_relayd()

    # Run three applications back to back, sampling the FD counts after each
    # one so that growth between runs can be detected.
    counts_post_app = []
    for _ in range(3):
        test_env.launch_wait_trace_test_application(10)
        counts_post_app.append(count_dev_shm_fds(tap, test_env))

    session.stop()
    session.destroy()

    # As there is no method to know exactly when the final close of the
    # shm happens (it is timing dependent from an external point of view),
    # this test polls until the post-destroy count reaches the post-start
    # count. The wait is bounded by `destroy_timeout` so that a genuine leak
    # is reported as a test failure instead of hanging the test suite.
    tap.diagnostic(
        "Waiting for post-destroy shm count to drop back to post-start level"
    )
    deadline = None
    if destroy_timeout is not None:
        deadline = time.monotonic() + destroy_timeout
    while True:
        count_post_destroy = count_dev_shm_fds(tap, test_env)
        if count_post_destroy == count_post_start:
            break
        if deadline is not None and time.monotonic() >= deadline:
            tap.diagnostic(
                "Timed out after {} seconds waiting for the post-destroy shm count".format(
                    destroy_timeout
                )
            )
            break
        time.sleep(0.1)

    tap.diagnostic(
        "FD counts post-start: {}, post-destroy: {}".format(
            count_post_start, count_post_destroy
        )
    )
    tap.test(
        count_post_start == count_post_destroy,
        "Count of consumerd FDs in /dev/shm are equal after session start then after destroy",
    )

    tap.diagnostic(
        "FD counts post-app-1: {}, post-app-2: {}, post-app-3: {}".format(
            *counts_post_app
        )
    )
    if buffer_sharing_policy == lttngtest.lttngctl.BufferSharingPolicy.PerUID:
        tap.test(
            all(count == counts_post_app[0] for count in counts_post_app),
            "Count of consumerd FDs in /dev/shm doesn't leak over several application invocations",
        )
    else:
        tap.skip(
            "Count of consumerds FDs in /dev/shm doesn't leak over several application invocations - no mechanism is available to guarantee buffer reclamation within a given time frame"
        )
159
160
# test_fd_leak() reports two TAP results per run (one tap.test(), then one
# tap.test() or tap.skip()), and it is run once for each of the four
# (kill_relayd, buffer_sharing_policy) combinations below: 8 results.
tap = lttngtest.TapGenerator(8)

scenarios = [
    (kill_relayd, buffer_sharing_policy)
    for kill_relayd in [True, False]
    for buffer_sharing_policy in [
        lttngtest.lttngctl.BufferSharingPolicy.PerUID,
        lttngtest.lttngctl.BufferSharingPolicy.PerPID,
    ]
]

# Each scenario gets its own test environment with a session daemon and a
# relay daemon.
for kill_relayd, buffer_sharing_policy in scenarios:
    with lttngtest.test_environment(
        log=tap.diagnostic, with_relayd=True, with_sessiond=True
    ) as test_env:
        test_fd_leak(tap, test_env, buffer_sharing_policy, kill_relayd)

exit_status = 0 if tap.is_successful else 1
sys.exit(exit_status)
This page took 0.035773 seconds and 5 git commands to generate.