We noticed that the following kind of scenario:
- an application using urcu-mb, urcu-membarrier, urcu-signal, or urcu-bp,
- long RCU read-side critical sections, caused by e.g. long network I/O
  system calls,
- other short-lived RCU read-side critical sections running in other
  threads,
- very frequent invocation of call_rcu to enqueue callbacks,
leads to abnormally high CPU usage within synchronize_rcu() in the
call_rcu worker threads.
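For instance, a reader blocked in a network receive holds its read-side
critical section open for the whole system call (a purely illustrative
sketch; reader_thread_work, sockfd, buf and len are hypothetical names,
not taken from the application):

	#include <urcu.h>		/* flavor selected at build time */
	#include <sys/types.h>
	#include <sys/socket.h>

	static void reader_thread_work(int sockfd, char *buf, size_t len)
	{
		ssize_t ret;

		rcu_read_lock();
		ret = recv(sockfd, buf, len, 0);  /* may block for a long time */
		if (ret > 0) {
			/* ... dereference RCU-protected data ... */
		}
		rcu_read_unlock();
	}

As long as recv() blocks, the grace period cannot complete.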
Inspection of the code gives us the answer: in urcu.c, whenever we need
to wait on a futex (wait_gp()), we expect to be able to end the grace
period within the next loop iteration, having been notified by a
rcu_read_unlock(). However, this is not always the case: we can very
well be awakened by a rcu_read_unlock() executed on a thread running
short-lived RCU read-side critical sections while the long-running RCU
read-side C.S. is still active. We then end up busy-waiting for a very
long time, because the counter stays != RCU_QS_ACTIVE_ATTEMPTS until a
32-bit overflow happens (or, more likely, until we complete the grace
period). We need to change the wait_loops == RCU_QS_ACTIVE_ATTEMPTS
check into an inequality, so that wait_gp() is used for every attempt
beyond RCU_QS_ACTIVE_ATTEMPTS loops.
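For illustration, a minimal sketch of the fixed wait loop (simplified;
readers_quiescent() is a hypothetical stand-in for the real per-reader
state checks, not an actual urcu function):

	for (;;) {
		if (wait_loops < RCU_QS_ACTIVE_ATTEMPTS)
			wait_loops++;
		if (readers_quiescent())
			break;			/* grace period may end */
		if (wait_loops >= RCU_QS_ACTIVE_ATTEMPTS)
			wait_gp();		/* block on the futex */
		else
			caa_cpu_relax();	/* brief busy-wait */
	}

With the former == check, a single spurious futex wakeup was enough to
move wait_loops past RCU_QS_ACTIVE_ATTEMPTS, after which every
iteration fell through to caa_cpu_relax() and busy-waited until the
grace period finally completed.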
urcu-bp.c also has this issue. Moreover, it uses usleep() rather than
poll() when dealing with long-running RCU read-side critical sections.
Turn the 1000us (1ms) usleep into a 10ms poll. One of the advantages
of using poll() rather than usleep() is that it does not interact with
SIGALRM.
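The poll()-based delay relies on the standard idiom of calling poll()
with no file descriptors, which simply blocks until the timeout
expires (the 10ms value below matches the RCU_SLEEP_DELAY_MS constant
introduced by this patch):

	#include <poll.h>

	/*
	 * With nfds == 0, poll() just waits for the timeout, given in
	 * milliseconds. Unlike usleep(), whose interaction with
	 * SIGALRM is unspecified by POSIX, poll() does not involve
	 * interval timers.
	 */
	(void) poll(NULL, 0, 10);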
urcu-qsbr.c already checks for wait_loops >= RCU_QS_ACTIVE_ATTEMPTS, so
it is not affected by this issue.
Looking into these loops, however, shows that an overflow of the loop
counter, although unlikely, would bring us back to a situation of high
CPU usage (a negative value well below RCU_QS_ACTIVE_ATTEMPTS).
Therefore, change the counter behavior so it stops incrementing when it
reaches RCU_QS_ACTIVE_ATTEMPTS, eliminating the overflow.
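In other words, the increment saturates at the threshold, as in this
excerpt from the patch below:

	if (wait_loops < RCU_QS_ACTIVE_ATTEMPTS)
		wait_loops++;

so wait_loops can never wrap around and drop back below
RCU_QS_ACTIVE_ATTEMPTS, no matter how long the grace period takes.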
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
-/* Sleep delay in us */
-#define RCU_SLEEP_DELAY 1000
+/* Sleep delay in ms */
+#define RCU_SLEEP_DELAY_MS 10
#define INIT_NR_THREADS 8
#define ARENA_INIT_ALLOC \
sizeof(struct registry_chunk) \
struct cds_list_head *cur_snap_readers,
struct cds_list_head *qsreaders)
{
- int wait_loops = 0;
+ unsigned int wait_loops = 0;
struct rcu_reader *index, *tmp;
/*
* rcu_gp.ctr value.
*/
for (;;) {
- wait_loops++;
+ if (wait_loops < RCU_QS_ACTIVE_ATTEMPTS)
+ wait_loops++;
+
cds_list_for_each_entry_safe(index, tmp, input_readers, node) {
switch (rcu_reader_state(&index->ctr)) {
case RCU_READER_ACTIVE_CURRENT:
if (cds_list_empty(input_readers)) {
break;
} else {
- if (wait_loops == RCU_QS_ACTIVE_ATTEMPTS)
- usleep(RCU_SLEEP_DELAY);
+ if (wait_loops >= RCU_QS_ACTIVE_ATTEMPTS)
+ (void) poll(NULL, 0, RCU_SLEEP_DELAY_MS);
struct cds_list_head *cur_snap_readers,
struct cds_list_head *qsreaders)
{
- int wait_loops = 0;
+ unsigned int wait_loops = 0;
struct rcu_reader *index, *tmp;
/*
* current rcu_gp.ctr value.
*/
for (;;) {
- wait_loops++;
if (wait_loops >= RCU_QS_ACTIVE_ATTEMPTS) {
uatomic_set(&rcu_gp.futex, -1);
/*
}
/* Write futex before read reader_gp */
cmm_smp_mb();
+ } else {
+ wait_loops++;
}
cds_list_for_each_entry_safe(index, tmp, input_readers, node) {
switch (rcu_reader_state(&index->ctr)) {
/*
* If a reader is really non-cooperative and refuses to commit its
* rcu_active_readers count to memory (there is no barrier in the reader
- * per-se), kick it after a few loops waiting for it.
+ * per-se), kick it after 10 loops waiting for it.
-#define KICK_READER_LOOPS 10000
+#define KICK_READER_LOOPS 10
/*
* Active attempts to check for reader Q.S. before calling futex().
struct cds_list_head *cur_snap_readers,
struct cds_list_head *qsreaders)
{
- int wait_loops = 0;
+ unsigned int wait_loops = 0;
struct rcu_reader *index, *tmp;
+#ifdef HAS_INCOHERENT_CACHES
+ unsigned int wait_gp_loops = 0;
+#endif /* HAS_INCOHERENT_CACHES */
/*
* Wait for each thread URCU_TLS(rcu_reader).ctr to either
* rcu_gp.ctr value.
*/
for (;;) {
- wait_loops++;
- if (wait_loops == RCU_QS_ACTIVE_ATTEMPTS) {
+ if (wait_loops >= RCU_QS_ACTIVE_ATTEMPTS) {
uatomic_dec(&rcu_gp.futex);
/* Write futex before read reader_gp */
smp_mb_master(RCU_MB_GROUP);
+ } else {
+ wait_loops++;
}
cds_list_for_each_entry_safe(index, tmp, input_readers, node) {
#ifndef HAS_INCOHERENT_CACHES
if (cds_list_empty(input_readers)) {
- if (wait_loops == RCU_QS_ACTIVE_ATTEMPTS) {
+ if (wait_loops >= RCU_QS_ACTIVE_ATTEMPTS) {
/* Read reader_gp before write futex */
smp_mb_master(RCU_MB_GROUP);
uatomic_set(&rcu_gp.futex, 0);
}
break;
} else {
- if (wait_loops == RCU_QS_ACTIVE_ATTEMPTS)
+ if (wait_loops >= RCU_QS_ACTIVE_ATTEMPTS)
wait_gp();
else
caa_cpu_relax();
* for too long.
*/
if (cds_list_empty(input_readers)) {
- if (wait_loops == RCU_QS_ACTIVE_ATTEMPTS) {
+ if (wait_loops >= RCU_QS_ACTIVE_ATTEMPTS) {
/* Read reader_gp before write futex */
smp_mb_master(RCU_MB_GROUP);
uatomic_set(&rcu_gp.futex, 0);
}
break;
} else {
- switch (wait_loops) {
- case RCU_QS_ACTIVE_ATTEMPTS:
- wait_gp();
- break; /* only escape switch */
- case KICK_READER_LOOPS:
+ if (wait_gp_loops == KICK_READER_LOOPS) {
smp_mb_master(RCU_MB_GROUP);
- wait_loops = 0;
- break; /* only escape switch */
- default:
+ wait_gp_loops = 0;
+ }
+ if (wait_loops >= RCU_QS_ACTIVE_ATTEMPTS) {
+ wait_gp();
+ wait_gp_loops++;
+ } else {