Fix: condition: buffer-usage: use double instead of fixed point
authorFrancis Deslauriers <francis.deslauriers@efficios.com>
Fri, 23 Apr 2021 18:45:31 +0000 (14:45 -0400)
committerJérémie Galarneau <jeremie.galarneau@efficios.com>
Wed, 28 Apr 2021 21:02:57 +0000 (17:02 -0400)
Issue observed
==============

When running the test_notification_ust_buffer_usage test on x86
(32 bit), the session daemon and test client both crash. The session
daemon dies while attempting to lock a NULL client list during the
execution of an enqueued action in the action executor.

See the following backtrace:

 #0  0xf7c6c756 in __GI___pthread_mutex_lock (mutex=0x0) at ../nptl/pthread_mutex_lock.c:67
 #1  0x565afe96 in notification_client_list_send_evaluation (client_list=0x0, trigger=0xf0f225e0, evaluation=0xf330c830, source_object_creds=0xf330e5cc, client_report=0x565cf81b <client_handle_transmission_status>, user_data=0xf330c320) at notification-thread-events.c:4372
 #2  0x565cfb41 in action_executor_notify_handler (executor=0xf330c320, work_item=0xf330e5b0, item=0xf330c7b0) at action-executor.c:269
 #3  0x565d1a58 in action_executor_generic_handler (executor=0xf330c320, work_item=0xf330e5b0, item=0xf330c7b0) at action-executor.c:696
 #4  0x565d1b7f in action_work_item_execute (executor=0xf330c320, work_item=0xf330e5b0) at action-executor.c:715
 #5  0x565d212f in action_executor_thread (_data=0xf330c320) at action-executor.c:797
 #6  0x565b9d0e in launch_thread (data=0xf330c390) at thread.c:66
 #7  0xf7c69fd2 in start_thread (arg=<optimized out>) at pthread_create.c:486
 #8  0xf7b7f6d6 in clone () at ../sysdeps/unix/sysv/linux/i386/clone.S:108

This crash causes an assertion to fail in the test client; checking for
data pending was not expected to return a negative value. In this case,
the negative return value is justified as it is -LTTNG_ERR_NO_SESSIOND.

Cause
=====

Equipped with coffee, a debugger, and a healthy dose of print
statements, it appeared that the following was taking place:

- Register a trigger (T1): high buffer usage (0.99) -> notify (succeeds)
- Subscribe to high buffer usage (0.99) notifications (succeeds)
- Subscribe to high buffer usage (0.99) notifications
  (fails duplicate, expected)
- Unregister trigger (fails unexpectedly)
- Notification client destroys its channel, causing the condition to be
  unsubscribed-from

- Another test registers a trigger (T2): high buffer usage (0.90) ->
  notify (succeeds)
- Session daemon evaluates a channel sample against T1's condition,
  which evaluates to true and produces an "evaluation" to send to
  clients
- The client list associated to T1's condition is not found (but this
  isn't checked)
- An action executor work item is queued to run T1's actions (notify),
  but without a client list, resulting in the crash when it is executed.

We could confirm that the client list associated to T1's condition was
created and never destroyed making the failure to find it rather
puzzling.

It turns out that the hash of T1's condition did not match the hash of
the client list's condition. This is unexpected as both conditions are
copies of one another.

It turns out that, on x86, the scheme being used to transmit the
condition's buffer usage threshold floating point value is not compiled
to numerically stable code. Serializing such a buffer condition and
creating it from the resulting payload in a loop showed that the
threshold value gradually drifted. This isn't the case on the other
architectures we support.

On x86-64, gcc makes use of SSE instructions to perform the conversion
to an integral value (with double precision). However, on x86, it makes
use of the x87 fpu stack instructions which carry 80-bit of precision
internally, resulting in a loss of precision as the value is
transformed, back and forth, between 80-bit to double precision
representations.

Solution
========

Since conditions are not carried between hosts (only between clients
and the session daemon), a fixed-point conversion scheme is unnecessary.
The 'double' value provided by the client is carried directly which
bypasses the problem completely.

Drawbacks
=========

None.

Signed-off-by: Francis Deslauriers <francis.deslauriers@efficios.com>
Signed-off-by: Jérémie Galarneau <jeremie.galarneau@efficios.com>
Change-Id: Ie524e7362626406327f4f56e1dba5c8cf469df31

include/lttng/condition/buffer-usage-internal.h
src/bin/lttng-sessiond/condition-internal.c
src/common/conditions/buffer-usage.c

index 8ddce350576ff5a5bf536d2e533f3d48a6032a10..be4eade5e5ceaeb3522ade0eaab318b26c9a6485 100644 (file)
@@ -35,13 +35,8 @@ struct lttng_condition_buffer_usage {
 
 struct lttng_condition_buffer_usage_comm {
        uint8_t threshold_set_in_bytes;
-       /*
-        * Expressed in bytes if "threshold_set_in_bytes" is not 0.
-        * Otherwise, it is expressed a ratio in the interval [0.0, 1.0]
-        * that is mapped to the range on a 32-bit unsigned integer.
-        * The ratio is obtained by (threshold / UINT32_MAX).
-        */
-       uint32_t threshold;
+       uint64_t threshold_bytes;
+       double threshold_ratio;
        /* Both lengths include the trailing \0. */
        uint32_t session_name_len;
        uint32_t channel_name_len;
index c86bf540106031b674dde19c7931ac681f719be8..fb9a5c7540fe6bd63aef8525561b25d2b61bd2a2 100644 (file)
@@ -44,10 +44,7 @@ unsigned long lttng_condition_buffer_usage_hash(
                                lttng_ht_seed);
        }
        if (condition->threshold_ratio.set) {
-               uint64_t val;
-
-               val = condition->threshold_ratio.value * (double) UINT32_MAX;
-               hash ^= hash_key_u64(&val, lttng_ht_seed);
+               hash ^= hash_key_u64(&condition->threshold_ratio.value, lttng_ht_seed);
        } else if (condition->threshold_bytes.set) {
                uint64_t val;
 
index 5f68608001a43d3559d527541c974e8edc0eef73..54affe765d682378649d68fb3e20c2721bd5c4aa 100644 (file)
        lttng_condition_get_type(condition) == LTTNG_CONDITION_TYPE_BUFFER_USAGE_HIGH   \
        )
 
-static
-double fixed_to_double(uint32_t val)
-{
-       return (double) val / (double) UINT32_MAX;
-}
-
-static
-uint64_t double_to_fixed(double val)
-{
-       return (val * (double) UINT32_MAX);
-}
-
 static
 bool is_usage_evaluation(const struct lttng_evaluation *evaluation)
 {
@@ -121,17 +109,9 @@ int lttng_condition_buffer_usage_serialize(
        usage_comm.domain_type = (int8_t) usage->domain.type;
 
        if (usage->threshold_bytes.set) {
-               usage_comm.threshold = usage->threshold_bytes.value;
+               usage_comm.threshold_bytes = usage->threshold_bytes.value;
        } else {
-               uint64_t val = double_to_fixed(
-                               usage->threshold_ratio.value);
-
-               if (val > UINT32_MAX) {
-                       /* overflow. */
-                       ret = -1;
-                       goto end;
-               }
-               usage_comm.threshold = val;
+               usage_comm.threshold_ratio = usage->threshold_ratio.value;
        }
 
        ret = lttng_dynamic_buffer_append(&payload->buffer, &usage_comm,
@@ -285,11 +265,10 @@ ssize_t init_condition_from_payload(struct lttng_condition *condition,
 
        if (condition_comm->threshold_set_in_bytes) {
                status = lttng_condition_buffer_usage_set_threshold(condition,
-                               condition_comm->threshold);
+                               condition_comm->threshold_bytes);
        } else {
                status = lttng_condition_buffer_usage_set_threshold_ratio(
-                               condition,
-                               fixed_to_double(condition_comm->threshold));
+                               condition, condition_comm->threshold_ratio);
        }
 
        if (status != LTTNG_CONDITION_STATUS_OK) {
This page took 0.027288 seconds and 4 git commands to generate.