X-Git-Url: https://git.lttng.org/?p=urcu.git;a=blobdiff_plain;f=rculfhash.c;h=d22b44d25135b9a69a8faa7377ed8fb6fb72df16;hp=d04451fece7ac744cb0b894b1f2b5e3803b9785f;hb=0f5543cb1780acef35878646e6cdc966f1406c18;hpb=1644042068ae8de26c9f6b85dcb2a5cd0b0f7608 diff --git a/rculfhash.c b/rculfhash.c index d04451f..d22b44d 100644 --- a/rculfhash.c +++ b/rculfhash.c @@ -71,18 +71,18 @@ * (not visible to lookups anymore) before the RCU read-side critical * section held across removal ends. Furthermore, this ensures that * the node with "removed" flag set is removed from the linked-list - * before its memory is reclaimed. Only the thread which removal - * successfully set the "removed" flag (with a cmpxchg) into a node's - * next pointer is considered to have succeeded its removal (and thus - * owns the node to reclaim). Because we garbage-collect starting from - * an invariant node (the start-of-bucket bucket node) up to the - * "removed" node (or find a reverse-hash that is higher), we are sure - * that a successful traversal of the chain leads to a chain that is - * present in the linked-list (the start node is never removed) and - * that is does not contain the "removed" node anymore, even if - * concurrent delete/add operations are changing the structure of the - * list concurrently. - * - The add operation performs gargage collection of buckets if it + * before its memory is reclaimed. After setting the "removal" flag, + * only the thread which removal is the first to set the "removal + * owner" flag (with an xchg) into a node's next pointer is considered + * to have succeeded its removal (and thus owns the node to reclaim). + * Because we garbage-collect starting from an invariant node (the + * start-of-bucket bucket node) up to the "removed" node (or find a + * reverse-hash that is higher), we are sure that a successful + * traversal of the chain leads to a chain that is present in the + * linked-list (the start node is never removed) and that is does not + * contain the "removed" node anymore, even if concurrent delete/add + * operations are changing the structure of the list concurrently. + * - The add operation performs garbage collection of buckets if it * encounters nodes with removed flag set in the bucket where it wants * to add its new node. This ensures lock-freedom of add operation by * helping the remover unlink nodes from the list rather than to wait @@ -98,6 +98,33 @@ * hash table nodes. These tables are invariant after they are * populated into the hash table. * + * Linearizability Guarantees: + * + * To discuss these guarantees, we first define "read" operations as any + * of the following operations surrounded by an RCU read-side lock/unlock + * pair: + * - cds_lfht_lookup + * - cds_lfht_lookup followed by iteration with cds_lfht_next_duplicate + * - cds_lfht_first followed iteration with cds_lfht_next + * + * We define "write" operations as any of cds_lfht_add, + * cds_lfht_add_unique, cds_lfht_add_replace, cds_lfht_del. + * + * The following guarantees are offered by this hash table: + * + * A) "read" after "write" will always return the result of the latest + * write. + * B) "write" after "read" will never be returned by the read. + * C) It is guaranteed that after a grace period following a "del" and + * "replace" operation, no reference to the removed items exists in + * the hash table. + * D) Uniqueness guarantee: when using add_unique and/or add_replace to + * insert nodes into the table, if there was previously one node or + * less with the same key being inserted by one or more concurrent + * add_unique and/or add_replace, all concurrent "read" performed on + * the hash table are guaranteed to find one, and only one node with + * that key. + * * Bucket node tables: * * hash table hash table the last all bucket node tables @@ -150,12 +177,14 @@ */ #define _LGPL_SOURCE +#define _GNU_SOURCE #include #include #include #include #include #include +#include #include "config.h" #include @@ -201,7 +230,8 @@ */ #define REMOVED_FLAG (1UL << 0) #define BUCKET_FLAG (1UL << 1) -#define FLAGS_MASK ((1UL << 2) - 1) +#define REMOVAL_OWNER_FLAG (1UL << 2) +#define FLAGS_MASK ((1UL << 3) - 1) /* Value of the end pointer. Should not interact with flags. */ #define END_VALUE NULL @@ -405,7 +435,7 @@ unsigned int fls_u32(uint32_t x) } #endif -unsigned int fls_ulong(unsigned long x) +unsigned int cds_lfht_fls_ulong(unsigned long x) { #if (CAA_BITS_PER_LONG == 32) return fls_u32(x); @@ -418,7 +448,7 @@ unsigned int fls_ulong(unsigned long x) * Return the minimum order for which x <= (1UL << order). * Return -1 if x is 0. */ -int get_count_order_u32(uint32_t x) +int cds_lfht_get_count_order_u32(uint32_t x) { if (!x) return -1; @@ -430,12 +460,12 @@ int get_count_order_u32(uint32_t x) * Return the minimum order for which x <= (1UL << order). * Return -1 if x is 0. */ -int get_count_order_ulong(unsigned long x) +int cds_lfht_get_count_order_ulong(unsigned long x) { if (!x) return -1; - return fls_ulong(x - 1); + return cds_lfht_fls_ulong(x - 1); } static @@ -462,7 +492,7 @@ static void ht_init_nr_cpus_mask(void) * round up number of CPUs to next power of two, so we * can use & for modulo. */ - maxcpus = 1UL << get_count_order_ulong(maxcpus); + maxcpus = 1UL << cds_lfht_get_count_order_ulong(maxcpus); nr_cpus_mask = maxcpus - 1; } #else /* #if defined(HAVE_SYSCONF) */ @@ -605,7 +635,7 @@ void check_resize(struct cds_lfht *ht, unsigned long size, uint32_t chain_len) chain_len); if (chain_len >= CHAIN_LEN_RESIZE_THRESHOLD) cds_lfht_resize_lazy_grow(ht, size, - get_count_order_u32(chain_len - (CHAIN_LEN_TARGET - 1))); + cds_lfht_get_count_order_u32(chain_len - (CHAIN_LEN_TARGET - 1))); } static @@ -638,6 +668,18 @@ struct cds_lfht_node *flag_bucket(struct cds_lfht_node *node) return (struct cds_lfht_node *) (((unsigned long) node) | BUCKET_FLAG); } +static +int is_removal_owner(struct cds_lfht_node *node) +{ + return ((unsigned long) node) & REMOVAL_OWNER_FLAG; +} + +static +struct cds_lfht_node *flag_removal_owner(struct cds_lfht_node *node) +{ + return (struct cds_lfht_node *) (((unsigned long) node) | REMOVAL_OWNER_FLAG); +} + static struct cds_lfht_node *get_end(void) { @@ -739,7 +781,6 @@ void _cds_lfht_gc_bucket(struct cds_lfht_node *bucket, struct cds_lfht_node *nod new_next = clear_flag(next); (void) uatomic_cmpxchg(&iter_prev->next, iter, new_next); } - return; } static @@ -767,9 +808,9 @@ int _cds_lfht_replace(struct cds_lfht *ht, unsigned long size, */ return -ENOENT; } - assert(!is_bucket(old_next)); - assert(new_node != clear_flag(old_next)); - new_node->next = clear_flag(old_next); + assert(old_next == clear_flag(old_next)); + assert(new_node != old_next); + new_node->next = old_next; /* * Here is the whole trick for lock-free replace: we add * the replacement node _after_ the node we want to @@ -779,6 +820,9 @@ int _cds_lfht_replace(struct cds_lfht *ht, unsigned long size, * next pointer, they will either skip the old node due * to the removal flag and see the new node, or use * the old node, but will not see the new one. + * This is a replacement of a node with another node + * that has the same value: we are therefore not + * removing a value from the hash table. */ ret_next = uatomic_cmpxchg(&old_node->next, old_next, flag_removed(new_node)); @@ -805,6 +849,7 @@ int _cds_lfht_replace(struct cds_lfht *ht, unsigned long size, */ static void _cds_lfht_add(struct cds_lfht *ht, + unsigned long hash, cds_lfht_match_fct match, const void *key, unsigned long size, @@ -818,7 +863,7 @@ void _cds_lfht_add(struct cds_lfht *ht, assert(!is_bucket(node)); assert(!is_removed(node)); - bucket = lookup_bucket(ht, size, bit_reverse_ulong(node->reverse_hash)); + bucket = lookup_bucket(ht, size, hash); for (;;) { uint32_t chain_len = 0; @@ -914,10 +959,9 @@ end: static int _cds_lfht_del(struct cds_lfht *ht, unsigned long size, - struct cds_lfht_node *node, - int bucket_removal) + struct cds_lfht_node *node) { - struct cds_lfht_node *bucket, *next, *old; + struct cds_lfht_node *bucket, *next; if (!node) /* Return -ENOENT if asked to delete NULL node */ return -ENOENT; @@ -925,20 +969,25 @@ int _cds_lfht_del(struct cds_lfht *ht, unsigned long size, /* logically delete the node */ assert(!is_bucket(node)); assert(!is_removed(node)); - old = rcu_dereference(node->next); - do { - struct cds_lfht_node *new_next; + assert(!is_removal_owner(node)); - next = old; - if (caa_unlikely(is_removed(next))) - return -ENOENT; - if (bucket_removal) - assert(is_bucket(next)); - else - assert(!is_bucket(next)); - new_next = flag_removed(next); - old = uatomic_cmpxchg(&node->next, next, new_next); - } while (old != next); + /* + * We are first checking if the node had previously been + * logically removed (this check is not atomic with setting the + * logical removal flag). Return -ENOENT if the node had + * previously been removed. + */ + next = rcu_dereference(node->next); + if (caa_unlikely(is_removed(next))) + return -ENOENT; + assert(!is_bucket(next)); + /* + * We set the REMOVED_FLAG unconditionally. Note that there may + * be more than one concurrent thread setting this flag. + * Knowing which wins the race will be known after the garbage + * collection phase, stay tuned! + */ + uatomic_or(&node->next, REMOVED_FLAG); /* We performed the (logical) deletion. */ /* @@ -950,7 +999,23 @@ int _cds_lfht_del(struct cds_lfht *ht, unsigned long size, _cds_lfht_gc_bucket(bucket, node); assert(is_removed(rcu_dereference(node->next))); - return 0; + /* + * Last phase: atomically exchange node->next with a version + * having "REMOVAL_OWNER_FLAG" set. If the returned node->next + * pointer did _not_ have "REMOVAL_OWNER_FLAG" set, we now own + * the node and win the removal race. + * It is interesting to note that all "add" paths are forbidden + * to change the next pointer starting from the point where the + * REMOVED_FLAG is set, so here using a read, followed by a + * xchg() suffice to guarantee that the xchg() will ever only + * set the "REMOVAL_OWNER_FLAG" (or change nothing if the flag + * was already set). + */ + if (!is_removal_owner(uatomic_xchg(&node->next, + flag_removal_owner(node->next)))) + return 0; + else + return -ENOENT; } static @@ -986,7 +1051,7 @@ void partition_resize_helper(struct cds_lfht *ht, unsigned long i, } else { nr_threads = 1; } - partition_len = len >> get_count_order_ulong(nr_threads); + partition_len = len >> cds_lfht_get_count_order_ulong(nr_threads); work = calloc(nr_threads, sizeof(*work)); assert(work); for (thread = 0; thread < nr_threads; thread++) { @@ -1032,7 +1097,7 @@ void init_table_populate_partition(struct cds_lfht *ht, unsigned long i, dbg_printf("init populate: order %lu index %lu hash %lu\n", i, j, j); new_node->reverse_hash = bit_reverse_ulong(j); - _cds_lfht_add(ht, NULL, NULL, size, new_node, NULL, 1); + _cds_lfht_add(ht, j, NULL, NULL, size, new_node, NULL, 1); } ht->flavor->read_unlock(); } @@ -1124,13 +1189,15 @@ void remove_table_partition(struct cds_lfht *ht, unsigned long i, assert(i > MIN_TABLE_ORDER); ht->flavor->read_lock(); for (j = size + start; j < size + start + len; j++) { - struct cds_lfht_node *fini_node = bucket_at(ht, j); + struct cds_lfht_node *fini_bucket = bucket_at(ht, j); + struct cds_lfht_node *parent_bucket = bucket_at(ht, j - size); assert(j >= size && j < (size << 1)); dbg_printf("remove entry: order %lu index %lu hash %lu\n", i, j, j); - fini_node->reverse_hash = bit_reverse_ulong(j); - (void) _cds_lfht_del(ht, size, fini_node, 1); + /* Set the REMOVED_FLAG to freeze the ->next for gc */ + uatomic_or(&fini_bucket->next, REMOVED_FLAG); + _cds_lfht_gc_bucket(parent_bucket, fini_bucket); } ht->flavor->read_unlock(); } @@ -1221,7 +1288,7 @@ void cds_lfht_create_bucket(struct cds_lfht *ht, unsigned long size) node->next = flag_bucket(get_end()); node->reverse_hash = 0; - for (order = 1; order < get_count_order_ulong(size) + 1; order++) { + for (order = 1; order < cds_lfht_get_count_order_ulong(size) + 1; order++) { len = 1UL << (order - 1); cds_lfht_alloc_bucket_table(ht, order); @@ -1319,7 +1386,7 @@ struct cds_lfht *_cds_lfht_new(unsigned long init_size, alloc_split_items_count(ht); /* this mutex should not nest in read-side C.S. */ pthread_mutex_init(&ht->resize_mutex, NULL); - order = get_count_order_ulong(init_size); + order = cds_lfht_get_count_order_ulong(init_size); ht->resize_target = 1UL << order; cds_lfht_create_bucket(ht, 1UL << order); ht->size = 1UL << order; @@ -1434,9 +1501,9 @@ void cds_lfht_add(struct cds_lfht *ht, unsigned long hash, { unsigned long size; - node->reverse_hash = bit_reverse_ulong((unsigned long) hash); + node->reverse_hash = bit_reverse_ulong(hash); size = rcu_dereference(ht->size); - _cds_lfht_add(ht, NULL, NULL, size, node, NULL, 0); + _cds_lfht_add(ht, hash, NULL, NULL, size, node, NULL, 0); ht_count_add(ht, size, hash); } @@ -1449,9 +1516,9 @@ struct cds_lfht_node *cds_lfht_add_unique(struct cds_lfht *ht, unsigned long size; struct cds_lfht_iter iter; - node->reverse_hash = bit_reverse_ulong((unsigned long) hash); + node->reverse_hash = bit_reverse_ulong(hash); size = rcu_dereference(ht->size); - _cds_lfht_add(ht, match, key, size, node, &iter, 0); + _cds_lfht_add(ht, hash, match, key, size, node, &iter, 0); if (iter.node == node) ht_count_add(ht, size, hash); return iter.node; @@ -1466,10 +1533,10 @@ struct cds_lfht_node *cds_lfht_add_replace(struct cds_lfht *ht, unsigned long size; struct cds_lfht_iter iter; - node->reverse_hash = bit_reverse_ulong((unsigned long) hash); + node->reverse_hash = bit_reverse_ulong(hash); size = rcu_dereference(ht->size); for (;;) { - _cds_lfht_add(ht, match, key, size, node, &iter, 0); + _cds_lfht_add(ht, hash, match, key, size, node, &iter, 0); if (iter.node == node) { ht_count_add(ht, size, hash); return NULL; @@ -1480,30 +1547,46 @@ struct cds_lfht_node *cds_lfht_add_replace(struct cds_lfht *ht, } } -int cds_lfht_replace(struct cds_lfht *ht, struct cds_lfht_iter *old_iter, +int cds_lfht_replace(struct cds_lfht *ht, + struct cds_lfht_iter *old_iter, + unsigned long hash, + cds_lfht_match_fct match, + const void *key, struct cds_lfht_node *new_node) { unsigned long size; + new_node->reverse_hash = bit_reverse_ulong(hash); + if (!old_iter->node) + return -ENOENT; + if (caa_unlikely(old_iter->node->reverse_hash != new_node->reverse_hash)) + return -EINVAL; + if (caa_unlikely(!match(old_iter->node, key))) + return -EINVAL; size = rcu_dereference(ht->size); return _cds_lfht_replace(ht, size, old_iter->node, old_iter->next, new_node); } -int cds_lfht_del(struct cds_lfht *ht, struct cds_lfht_iter *iter) +int cds_lfht_del(struct cds_lfht *ht, struct cds_lfht_node *node) { unsigned long size, hash; int ret; size = rcu_dereference(ht->size); - ret = _cds_lfht_del(ht, size, iter->node, 0); + ret = _cds_lfht_del(ht, size, node); if (!ret) { - hash = bit_reverse_ulong(iter->node->reverse_hash); + hash = bit_reverse_ulong(node->reverse_hash); ht_count_del(ht, size, hash); } return ret; } +int cds_lfht_is_node_deleted(struct cds_lfht_node *node) +{ + return is_removed(rcu_dereference(node->next)); +} + static int cds_lfht_delete_bucket(struct cds_lfht *ht) { @@ -1531,7 +1614,7 @@ int cds_lfht_delete_bucket(struct cds_lfht *ht) assert(is_bucket(node->next)); } - for (order = get_count_order_ulong(size); (long)order >= 0; order--) + for (order = cds_lfht_get_count_order_ulong(size); (long)order >= 0; order--) cds_lfht_free_bucket_table(ht, order); return 0; @@ -1563,11 +1646,10 @@ int cds_lfht_destroy(struct cds_lfht *ht, pthread_attr_t **attr) void cds_lfht_count_nodes(struct cds_lfht *ht, long *approx_before, unsigned long *count, - unsigned long *removed, long *approx_after) { struct cds_lfht_node *node, *next; - unsigned long nr_bucket = 0; + unsigned long nr_bucket = 0, nr_removed = 0; *approx_before = 0; if (ht->split_count) { @@ -1580,7 +1662,6 @@ void cds_lfht_count_nodes(struct cds_lfht *ht, } *count = 0; - *removed = 0; /* Count non-bucket nodes in the table */ node = bucket_at(ht, 0); @@ -1588,7 +1669,7 @@ void cds_lfht_count_nodes(struct cds_lfht *ht, next = rcu_dereference(node->next); if (is_removed(next)) { if (!is_bucket(next)) - (*removed)++; + (nr_removed)++; else (nr_bucket)++; } else if (!is_bucket(next)) @@ -1597,6 +1678,7 @@ void cds_lfht_count_nodes(struct cds_lfht *ht, (nr_bucket)++; node = clear_flag(next); } while (!is_end(node)); + dbg_printf("number of logically removed nodes: %lu\n", nr_removed); dbg_printf("number of bucket nodes: %lu\n", nr_bucket); *approx_after = 0; if (ht->split_count) { @@ -1616,8 +1698,8 @@ void _do_cds_lfht_grow(struct cds_lfht *ht, { unsigned long old_order, new_order; - old_order = get_count_order_ulong(old_size); - new_order = get_count_order_ulong(new_size); + old_order = cds_lfht_get_count_order_ulong(old_size); + new_order = cds_lfht_get_count_order_ulong(new_size); dbg_printf("resize from %lu (order %lu) to %lu (order %lu) buckets\n", old_size, old_order, new_size, new_order); assert(new_size > old_size); @@ -1632,8 +1714,8 @@ void _do_cds_lfht_shrink(struct cds_lfht *ht, unsigned long old_order, new_order; new_size = max(new_size, MIN_TABLE_SIZE); - old_order = get_count_order_ulong(old_size); - new_order = get_count_order_ulong(new_size); + old_order = cds_lfht_get_count_order_ulong(old_size); + new_order = cds_lfht_get_count_order_ulong(new_size); dbg_printf("resize from %lu (order %lu) to %lu (order %lu) buckets\n", old_size, old_order, new_size, new_order); assert(new_size < old_size); @@ -1727,6 +1809,11 @@ void __cds_lfht_resize_lazy_launch(struct cds_lfht *ht) return; } work = malloc(sizeof(*work)); + if (work == NULL) { + dbg_printf("error allocating resize work, bailing out\n"); + uatomic_dec(&ht->in_progress_resize); + return; + } work->ht = ht; ht->flavor->update_call_rcu(&work->head, do_resize_cb); CMM_STORE_SHARED(ht->resize_initiated, 1);