radix-tree: improve multiorder iterators

This fixes several interlinked problems with the iterators in the
presence of multiorder entries.

1. radix_tree_iter_next() would only advance by one slot, which would
   result in the iterators returning the same entry more than once if
   there were sibling entries.

2. radix_tree_next_slot() could return an internal pointer instead of
   a user pointer if a tagged multiorder entry was immediately followed by
   an entry of lower order.

3. radix_tree_next_slot() expanded to a lot more code than it used to
   when multiorder support was compiled in.  And I wasn't comfortable with
   entry_to_node() being in a header file.

Fixing radix_tree_iter_next() for the presence of sibling entries
necessarily involves examining the contents of the radix tree, so we now
need to pass 'slot' to radix_tree_iter_next(), and we need to change the
calling convention so it is called *before* dropping the lock which
protects the tree.  Also rename it to radix_tree_iter_resume(), as some
people thought it was necessary to call radix_tree_iter_next() each time
around the loop.

radix_tree_next_slot() becomes closer to how it looked before multiorder
support was introduced.  It only checks to see if the next entry in the
chunk is a sibling entry or a pointer to a node; this should be rare
enough that handling this case out of line is not a performance impact
(and such impact is amortised by the fact that the entry we just
processed was a multiorder entry).  Also, radix_tree_next_slot() used to
force a new chunk lookup for untagged entries, which is more expensive
than the out of line sibling entry skipping.

Link: http://lkml.kernel.org/r/1480369871-5271-55-git-send-email-mawilcox@linuxonhyperv.com
Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
Tested-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
Matthew Wilcox 2016-12-14 15:08:49 -08:00 committed by Linus Torvalds
parent b35df27a39
commit 148deab223
9 changed files with 188 additions and 85 deletions

View file

@ -69,6 +69,11 @@ struct radix_tree_preload {
};
static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
static inline struct radix_tree_node *entry_to_node(void *ptr)
{
return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE);
}
static inline void *node_to_entry(void *ptr)
{
return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE);
@ -1104,6 +1109,120 @@ static inline void __set_iter_shift(struct radix_tree_iter *iter,
#endif
}
/* Construct iter->tags bit-mask from node->tags[tag] array */
static void set_iter_tags(struct radix_tree_iter *iter,
struct radix_tree_node *node, unsigned offset,
unsigned tag)
{
unsigned tag_long = offset / BITS_PER_LONG;
unsigned tag_bit = offset % BITS_PER_LONG;
iter->tags = node->tags[tag][tag_long] >> tag_bit;
/* This never happens if RADIX_TREE_TAG_LONGS == 1 */
if (tag_long < RADIX_TREE_TAG_LONGS - 1) {
/* Pick tags from next element */
if (tag_bit)
iter->tags |= node->tags[tag][tag_long + 1] <<
(BITS_PER_LONG - tag_bit);
/* Clip chunk size, here only BITS_PER_LONG tags */
iter->next_index = __radix_tree_iter_add(iter, BITS_PER_LONG);
}
}
#ifdef CONFIG_RADIX_TREE_MULTIORDER
static void **skip_siblings(struct radix_tree_node **nodep,
void **slot, struct radix_tree_iter *iter)
{
void *sib = node_to_entry(slot - 1);
while (iter->index < iter->next_index) {
*nodep = rcu_dereference_raw(*slot);
if (*nodep && *nodep != sib)
return slot;
slot++;
iter->index = __radix_tree_iter_add(iter, 1);
iter->tags >>= 1;
}
*nodep = NULL;
return NULL;
}
void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter,
unsigned flags)
{
unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK;
struct radix_tree_node *node = rcu_dereference_raw(*slot);
slot = skip_siblings(&node, slot, iter);
while (radix_tree_is_internal_node(node)) {
unsigned offset;
unsigned long next_index;
if (node == RADIX_TREE_RETRY)
return slot;
node = entry_to_node(node);
iter->shift = node->shift;
if (flags & RADIX_TREE_ITER_TAGGED) {
offset = radix_tree_find_next_bit(node, tag, 0);
if (offset == RADIX_TREE_MAP_SIZE)
return NULL;
slot = &node->slots[offset];
iter->index = __radix_tree_iter_add(iter, offset);
set_iter_tags(iter, node, offset, tag);
node = rcu_dereference_raw(*slot);
} else {
offset = 0;
slot = &node->slots[0];
for (;;) {
node = rcu_dereference_raw(*slot);
if (node)
break;
slot++;
offset++;
if (offset == RADIX_TREE_MAP_SIZE)
return NULL;
}
iter->index = __radix_tree_iter_add(iter, offset);
}
if ((flags & RADIX_TREE_ITER_CONTIG) && (offset > 0))
goto none;
next_index = (iter->index | shift_maxindex(iter->shift)) + 1;
if (next_index < iter->next_index)
iter->next_index = next_index;
}
return slot;
none:
iter->next_index = 0;
return NULL;
}
EXPORT_SYMBOL(__radix_tree_next_slot);
#else
static void **skip_siblings(struct radix_tree_node **nodep,
void **slot, struct radix_tree_iter *iter)
{
return slot;
}
#endif
void **radix_tree_iter_resume(void **slot, struct radix_tree_iter *iter)
{
struct radix_tree_node *node;
slot++;
iter->index = __radix_tree_iter_add(iter, 1);
node = rcu_dereference_raw(*slot);
skip_siblings(&node, slot, iter);
iter->next_index = iter->index;
iter->tags = 0;
return NULL;
}
EXPORT_SYMBOL(radix_tree_iter_resume);
/**
* radix_tree_next_chunk - find next chunk of slots for iteration
*
@ -1191,23 +1310,8 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
iter->next_index = (index | node_maxindex(node)) + 1;
__set_iter_shift(iter, node->shift);
/* Construct iter->tags bit-mask from node->tags[tag] array */
if (flags & RADIX_TREE_ITER_TAGGED) {
unsigned tag_long, tag_bit;
tag_long = offset / BITS_PER_LONG;
tag_bit = offset % BITS_PER_LONG;
iter->tags = node->tags[tag][tag_long] >> tag_bit;
/* This never happens if RADIX_TREE_TAG_LONGS == 1 */
if (tag_long < RADIX_TREE_TAG_LONGS - 1) {
/* Pick tags from next element */
if (tag_bit)
iter->tags |= node->tags[tag][tag_long + 1] <<
(BITS_PER_LONG - tag_bit);
/* Clip chunk size, here only BITS_PER_LONG tags */
iter->next_index = index + BITS_PER_LONG;
}
}
if (flags & RADIX_TREE_ITER_TAGGED)
set_iter_tags(iter, node, offset, tag);
return node->slots + offset;
}