I noticed that urcu makes exactly _one_ attempt at using futexes
to avoid busy looping on synchronize_rcu. The attached patch instead
switches from busy waiting to futexes after RCU_QS_ACTIVE_ATTEMPTS.
To limit the amount of system calls, reading threads remember whether
they already had a quiescent state in this grace period; if so they were
already removed from the list, and can avoid signaling the futex.
Performance measured with rcutorture (nreaders: 10, nupdaters: 1,
duration: 10, median of nine runs):
RCU_QS_ACTIVE_ATTEMPTS == 100, no patch n_updates = 292
RCU_QS_ACTIVE_ATTEMPTS == 1, no patch n_updates = 290
RCU_QS_ACTIVE_ATTEMPTS == 100, with patch n_updates = 408
RCU_QS_ACTIVE_ATTEMPTS == 1, with patch n_updates = 404
(the first two cases are obviously the same; the only change is
when the futex is used, but over many calls there is no difference).
This patch matches the update to the Promela model.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
*/
for (;;) {
wait_loops++;
*/
for (;;) {
wait_loops++;
- if (wait_loops == RCU_QS_ACTIVE_ATTEMPTS) {
- uatomic_dec(&gp_futex);
+ if (wait_loops >= RCU_QS_ACTIVE_ATTEMPTS) {
+ uatomic_set(&gp_futex, -1);
+ /*
+ * Write futex before write waiting (the other side
+ * reads them in the opposite order).
+ */
+ cmm_smp_wmb();
+ cds_list_for_each_entry(index, ®istry, node) {
+ _CMM_STORE_SHARED(index->waiting, 1);
+ }
/* Write futex before read reader_gp */
cmm_smp_mb();
}
/* Write futex before read reader_gp */
cmm_smp_mb();
}
cds_list_for_each_entry_safe(index, tmp, ®istry, node) {
if (!rcu_gp_ongoing(&index->ctr))
cds_list_move(&index->node, &qsreaders);
}
if (cds_list_empty(®istry)) {
cds_list_for_each_entry_safe(index, tmp, ®istry, node) {
if (!rcu_gp_ongoing(&index->ctr))
cds_list_move(&index->node, &qsreaders);
}
if (cds_list_empty(®istry)) {
- if (wait_loops == RCU_QS_ACTIVE_ATTEMPTS) {
+ if (wait_loops >= RCU_QS_ACTIVE_ATTEMPTS) {
/* Read reader_gp before write futex */
cmm_smp_mb();
uatomic_set(&gp_futex, 0);
}
break;
} else {
/* Read reader_gp before write futex */
cmm_smp_mb();
uatomic_set(&gp_futex, 0);
}
break;
} else {
- if (wait_loops == RCU_QS_ACTIVE_ATTEMPTS) {
+ if (wait_loops >= RCU_QS_ACTIVE_ATTEMPTS) {
wait_gp();
} else {
#ifndef HAS_INCOHERENT_CACHES
wait_gp();
} else {
#ifndef HAS_INCOHERENT_CACHES
unsigned long ctr;
/* Data used for registry */
struct cds_list_head node __attribute__((aligned(CAA_CACHE_LINE_SIZE)));
unsigned long ctr;
/* Data used for registry */
struct cds_list_head node __attribute__((aligned(CAA_CACHE_LINE_SIZE)));
*/
static inline void wake_up_gp(void)
{
*/
static inline void wake_up_gp(void)
{
- if (unlikely(uatomic_read(&gp_futex) == -1)) {
+ if (unlikely(_CMM_LOAD_SHARED(rcu_reader.waiting))) {
+ _CMM_STORE_SHARED(rcu_reader.waiting, 0);
+ cmm_smp_mb();
+ if (uatomic_read(&gp_futex) != -1)
+ return;
uatomic_set(&gp_futex, 0);
futex_noasync(&gp_futex, FUTEX_WAKE, 1,
NULL, NULL, 0);
uatomic_set(&gp_futex, 0);
futex_noasync(&gp_futex, FUTEX_WAKE, 1,
NULL, NULL, 0);