From 6f9c07be9d020489326098801f0661f754c7c865 Mon Sep 17 00:00:00 2001
From: Yury Norov
Date: Mon, 5 Sep 2022 16:08:20 -0700
Subject: lib/cpumask: add FORCE_NR_CPUS config option

The size of cpumasks is hard-limited by the compile-time parameter NR_CPUS,
but the actual number of CPUs is determined at boot time, when the kernel
parses ACPI/DT tables, and is stored in nr_cpu_ids. In many practical cases,
the number of CPUs for a target is known at compile time and can be
provided with NR_CPUS.

In that case, the compiler may be instructed to rely on NR_CPUS as the
actual number of CPUs, not an upper limit. This allows optimizing many
cpumask routines and significantly shrinking the kernel image.

This patch adds the FORCE_NR_CPUS option to teach the compiler to rely on
NR_CPUS and enable the corresponding optimizations. If FORCE_NR_CPUS=y, the
kernel will not set nr_cpu_ids at boot, but only check that the actual
number of possible CPUs is equal to NR_CPUS, and WARN if that doesn't hold.

The new option is especially useful in embedded applications, where kernel
configurations are unique for each SoC, the number of CPUs is constant and
well known, and memory constraints are typically tighter.

For my 4-CPU ARM64 build with NR_CPUS=4, FORCE_NR_CPUS=y saves 46KB:

  add/remove: 3/4 grow/shrink: 46/729 up/down: 652/-46952 (-46300)

Signed-off-by: Yury Norov
---
 lib/Kconfig | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/lib/Kconfig b/lib/Kconfig
index dc1ab2ed1dc6..77ead982c8b9 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -527,6 +527,15 @@ config CPUMASK_OFFSTACK
 	  them on the stack. This is a bit more expensive, but avoids
 	  stack overflow.
 
+config FORCE_NR_CPUS
+	bool "NR_CPUS is set to an actual number of CPUs"
+	depends on SMP
+	help
+	  Say Yes if you have NR_CPUS set to an actual number of possible
+	  CPUs in your system, not to a default value. This forces the core
+	  code to rely on compile-time value and optimize kernel routines
+	  better.
+
 config CPU_RMAP
 	bool
 	depends on SMP

From 58414bbb58a8b49af20e3accae56f6f8344b2424 Mon Sep 17 00:00:00 2001
From: Yury Norov
Date: Wed, 14 Sep 2022 19:07:27 -0700
Subject: lib/find_bit: introduce FIND_FIRST_BIT() macro

Now that we have many flavors of find_first_bit(), and expect even more,
it's better to have one macro that generates optimal code for all of them
and makes maintaining slightly different functions simpler.

The logic common to all versions is moved to the new macro, and all the
flavors are generated by providing a FETCH macro parameter, like in this
example:

	#define FIND_FIRST_BIT(FETCH, MUNGE, size) ...

	find_first_ornot_and_bit(addr1, addr2, addr3, size)
	{
		return FIND_FIRST_BIT(addr1[idx] | ~addr2[idx] & addr3[idx],
					/* nop */, size);
	}

The FETCH may be of any complexity, as long as it only refers to the
bitmap(s) and the iterator idx. MUNGE is here to support _le code
generation for BE builds. It may be empty.
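For instance, on a big-endian build an _le flavor can pass swab as MUNGE,
so that __ffs() computes the bit offset on a byte-swapped copy of the word
containing the found bit. A sketch of such a flavor (a later patch in this
series adds exactly this function):

	unsigned long _find_first_zero_bit_le(const unsigned long *addr, unsigned long size)
	{
		/* FETCH inverts each word; MUNGE = swab fixes bit order for __ffs() */
		return FIND_FIRST_BIT(~addr[idx], swab, size);
	}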
Suggested-by: Linus Torvalds
Reviewed-by: Valentin Schneider
Signed-off-by: Yury Norov
---
 lib/find_bit.c | 49 ++++++++++++++++++++++++-------------------------
 1 file changed, 24 insertions(+), 25 deletions(-)

diff --git a/lib/find_bit.c b/lib/find_bit.c
index 1b8e4b2a9cba..894b656f6836 100644
--- a/lib/find_bit.c
+++ b/lib/find_bit.c
@@ -19,6 +19,27 @@
 #include
 #include
 
+/*
+ * Common helper for find_bit() function family
+ * @FETCH: The expression that fetches and pre-processes each word of bitmap(s)
+ * @MUNGE: The expression that post-processes a word containing found bit (may be empty)
+ * @size: The bitmap size in bits
+ */
+#define FIND_FIRST_BIT(FETCH, MUNGE, size)					\
+({										\
+	unsigned long idx, val, sz = (size);					\
+										\
+	for (idx = 0; idx * BITS_PER_LONG < sz; idx++) {			\
+		val = (FETCH);							\
+		if (val) {							\
+			sz = min(idx * BITS_PER_LONG + __ffs(MUNGE(val)), sz);	\
+			break;							\
+		}								\
+	}									\
+										\
+	sz;									\
+})
+
 #if !defined(find_next_bit) || !defined(find_next_zero_bit) || \
     !defined(find_next_bit_le) || !defined(find_next_zero_bit_le) || \
     !defined(find_next_and_bit)
@@ -77,14 +98,7 @@ EXPORT_SYMBOL(_find_next_bit);
  */
 unsigned long _find_first_bit(const unsigned long *addr, unsigned long size)
 {
-	unsigned long idx;
-
-	for (idx = 0; idx * BITS_PER_LONG < size; idx++) {
-		if (addr[idx])
-			return min(idx * BITS_PER_LONG + __ffs(addr[idx]), size);
-	}
-
-	return size;
+	return FIND_FIRST_BIT(addr[idx], /* nop */, size);
 }
 EXPORT_SYMBOL(_find_first_bit);
 #endif
@@ -97,15 +111,7 @@ unsigned long _find_first_and_bit(const unsigned long *addr1,
 		const unsigned long *addr2,
 		unsigned long size)
 {
-	unsigned long idx, val;
-
-	for (idx = 0; idx * BITS_PER_LONG < size; idx++) {
-		val = addr1[idx] & addr2[idx];
-		if (val)
-			return min(idx * BITS_PER_LONG + __ffs(val), size);
-	}
-
-	return size;
+	return FIND_FIRST_BIT(addr1[idx] & addr2[idx], /* nop */, size);
 }
 EXPORT_SYMBOL(_find_first_and_bit);
 #endif
@@ -116,14 +122,7 @@ EXPORT_SYMBOL(_find_first_and_bit);
  */
 unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size)
 {
-	unsigned long idx;
-
-	for (idx = 0; idx * BITS_PER_LONG < size; idx++) {
-		if (addr[idx] != ~0UL)
-			return min(idx * BITS_PER_LONG + ffz(addr[idx]), size);
-	}
-
-	return size;
+	return FIND_FIRST_BIT(~addr[idx], /* nop */, size);
 }
 EXPORT_SYMBOL(_find_first_zero_bit);
 #endif

From 14a99e130f2758bc826a7e7a8bdf6f7400b54f0f Mon Sep 17 00:00:00 2001
From: Yury Norov
Date: Wed, 14 Sep 2022 19:07:28 -0700
Subject: lib/find_bit: create find_first_zero_bit_le()

find_first_zero_bit_le() is an alias to find_next_zero_bit_le(), even
though the 'next' flavor is known to be slower than the 'first' one.

Now that we have the common FIND_FIRST_BIT() macro helper, it's trivial to
implement find_first_zero_bit_le() as a real function.

Reviewed-by: Valentin Schneider
Signed-off-by: Yury Norov
---
 lib/find_bit.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/lib/find_bit.c b/lib/find_bit.c
index 894b656f6836..61ec2992938b 100644
--- a/lib/find_bit.c
+++ b/lib/find_bit.c
@@ -160,3 +160,19 @@ unsigned long find_next_clump8(unsigned long *clump, const unsigned long *addr,
 	return offset;
 }
 EXPORT_SYMBOL(find_next_clump8);
+
+#ifdef __BIG_ENDIAN
+
+#ifndef find_first_zero_bit_le
+/*
+ * Find the first cleared bit in an LE memory region.
+ */ +unsigned long _find_first_zero_bit_le(const unsigned long *addr, unsigned long size) +{ + return FIND_FIRST_BIT(~addr[idx], swab, size); +} +EXPORT_SYMBOL(_find_first_zero_bit_le); + +#endif + +#endif /* __BIG_ENDIAN */ -- cgit From e79864f3164c573afce09ec4b72b75ebe061c14d Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Wed, 14 Sep 2022 19:07:29 -0700 Subject: lib/find_bit: optimize find_next_bit() functions Over the past couple years, the function _find_next_bit() was extended with parameters that modify its behavior to implement and- zero- and le- flavors. The parameters are passed at compile time, but current design prevents a compiler from optimizing out the conditionals. As find_next_bit() API grows, I expect that more parameters will be added. Current design would require more conditional code in _find_next_bit(), which would bloat the helper even more and make it barely readable. This patch replaces _find_next_bit() with a macro FIND_NEXT_BIT, and adds a set of wrappers, so that the compile-time optimizations become possible. The common logic is moved to the new macro, and all flavors may be generated by providing a FETCH macro parameter, like in this example: #define FIND_NEXT_BIT(FETCH, MUNGE, size, start) ... find_next_xornot_and_bit(addr1, addr2, addr3, size, start) { return FIND_NEXT_BIT(addr1[idx] ^ ~addr2[idx] & addr3[idx], /* nop */, size, start); } The FETCH may be of any complexity, as soon as it only refers the bitmap(s) and an iterator idx. MUNGE is here to support _le code generation for BE builds. May be empty. I ran find_bit_benchmark 16 times on top of 6.0-rc2 and 16 times on top of 6.0-rc2 + this series. The results for kvm/x86_64 are: v6.0-rc2 Optimized Difference Z-score Random dense bitmap ns ns ns % find_next_bit: 787735 670546 117189 14.9 3.97 find_next_zero_bit: 777492 664208 113284 14.6 10.51 find_last_bit: 830925 687573 143352 17.3 2.35 find_first_bit: 3874366 3306635 567731 14.7 1.84 find_first_and_bit: 40677125 37739887 2937238 7.2 1.36 find_next_and_bit: 347865 304456 43409 12.5 1.35 Random sparse bitmap find_next_bit: 19816 14021 5795 29.2 6.10 find_next_zero_bit: 1318901 1223794 95107 7.2 1.41 find_last_bit: 14573 13514 1059 7.3 6.92 find_first_bit: 1313321 1249024 64297 4.9 1.53 find_first_and_bit: 8921 8098 823 9.2 4.56 find_next_and_bit: 9796 7176 2620 26.7 5.39 Where the statistics is significant (z-score > 3), the improvement is ~15%. According to the bloat-o-meter, the Image size is 10-11K less: x86_64/defconfig: add/remove: 32/14 grow/shrink: 61/782 up/down: 6344/-16521 (-10177) arm64/defconfig: add/remove: 3/2 grow/shrink: 50/714 up/down: 608/-11556 (-10948) Suggested-by: Linus Torvalds Signed-off-by: Yury Norov --- lib/find_bit.c | 119 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 70 insertions(+), 49 deletions(-) (limited to 'lib') diff --git a/lib/find_bit.c b/lib/find_bit.c index 61ec2992938b..d00ee23ab657 100644 --- a/lib/find_bit.c +++ b/lib/find_bit.c @@ -40,57 +40,33 @@ sz; \ }) -#if !defined(find_next_bit) || !defined(find_next_zero_bit) || \ - !defined(find_next_bit_le) || !defined(find_next_zero_bit_le) || \ - !defined(find_next_and_bit) /* - * This is a common helper function for find_next_bit, find_next_zero_bit, and - * find_next_and_bit. The differences are: - * - The "invert" argument, which is XORed with each fetched word before - * searching it for one bits. - * - The optional "addr2", which is anded with "addr1" if present. 
+ * Common helper for find_next_bit() function family
+ * @FETCH: The expression that fetches and pre-processes each word of bitmap(s)
+ * @MUNGE: The expression that post-processes a word containing found bit (may be empty)
+ * @size: The bitmap size in bits
+ * @start: The bitnumber to start searching at
  */
-unsigned long _find_next_bit(const unsigned long *addr1,
-		const unsigned long *addr2, unsigned long nbits,
-		unsigned long start, unsigned long invert, unsigned long le)
-{
-	unsigned long tmp, mask;
-
-	if (unlikely(start >= nbits))
-		return nbits;
-
-	tmp = addr1[start / BITS_PER_LONG];
-	if (addr2)
-		tmp &= addr2[start / BITS_PER_LONG];
-	tmp ^= invert;
-
-	/* Handle 1st word. */
-	mask = BITMAP_FIRST_WORD_MASK(start);
-	if (le)
-		mask = swab(mask);
-
-	tmp &= mask;
-
-	start = round_down(start, BITS_PER_LONG);
-
-	while (!tmp) {
-		start += BITS_PER_LONG;
-		if (start >= nbits)
-			return nbits;
-
-		tmp = addr1[start / BITS_PER_LONG];
-		if (addr2)
-			tmp &= addr2[start / BITS_PER_LONG];
-		tmp ^= invert;
-	}
-
-	if (le)
-		tmp = swab(tmp);
-
-	return min(start + __ffs(tmp), nbits);
-}
-EXPORT_SYMBOL(_find_next_bit);
-#endif
+#define FIND_NEXT_BIT(FETCH, MUNGE, size, start)				\
+({										\
+	unsigned long mask, idx, tmp, sz = (size), __start = (start);		\
+										\
+	if (unlikely(__start >= sz))						\
+		goto out;							\
+										\
+	mask = MUNGE(BITMAP_FIRST_WORD_MASK(__start));				\
+	idx = __start / BITS_PER_LONG;						\
+										\
+	for (tmp = (FETCH) & mask; !tmp; tmp = (FETCH)) {			\
+		if ((idx + 1) * BITS_PER_LONG >= sz)				\
+			goto out;						\
+		idx++;								\
+	}									\
+										\
+	sz = min(idx * BITS_PER_LONG + __ffs(MUNGE(tmp)), sz);			\
+out:										\
+	sz;									\
+})
 
 #ifndef find_first_bit
 /*
@@ -127,6 +103,32 @@ unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size
 EXPORT_SYMBOL(_find_first_zero_bit);
 #endif
 
+#ifndef find_next_bit
+unsigned long _find_next_bit(const unsigned long *addr, unsigned long nbits, unsigned long start)
+{
+	return FIND_NEXT_BIT(addr[idx], /* nop */, nbits, start);
+}
+EXPORT_SYMBOL(_find_next_bit);
+#endif
+
+#ifndef find_next_and_bit
+unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2,
+					unsigned long nbits, unsigned long start)
+{
+	return FIND_NEXT_BIT(addr1[idx] & addr2[idx], /* nop */, nbits, start);
+}
+EXPORT_SYMBOL(_find_next_and_bit);
+#endif
+
+#ifndef find_next_zero_bit
+unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits,
+					unsigned long start)
+{
+	return FIND_NEXT_BIT(~addr[idx], /* nop */, nbits, start);
+}
+EXPORT_SYMBOL(_find_next_zero_bit);
+#endif
+
 #ifndef find_last_bit
 unsigned long _find_last_bit(const unsigned long *addr, unsigned long size)
 {
@@ -175,4 +177,23 @@ EXPORT_SYMBOL(_find_first_zero_bit_le);
 
 #endif
 
+#ifndef find_next_zero_bit_le
+unsigned long _find_next_zero_bit_le(const unsigned long *addr,
+				unsigned long size, unsigned long offset)
+{
+	return FIND_NEXT_BIT(~addr[idx], swab, size, offset);
+}
+EXPORT_SYMBOL(_find_next_zero_bit_le);
+#endif
+
+#ifndef find_next_bit_le
+unsigned long _find_next_bit_le(const unsigned long *addr,
+				unsigned long size, unsigned long offset)
+{
+	return FIND_NEXT_BIT(addr[idx], swab, size, offset);
+}
+EXPORT_SYMBOL(_find_next_bit_le);
+
+#endif
+
 #endif /* __BIG_ENDIAN */

From 70a1cb106d9410f1f37e0261728e46722b74c29f Mon Sep 17 00:00:00 2001
From: Yury Norov
Date: Sat, 17 Sep 2022 20:07:11 -0700
Subject: lib/bitmap: don't call __bitmap_weight() in kernel code

__bitmap_weight() is not to be used directly in kernel code because it's
a helper for bitmap_weight().
Switch everything to bitmap_weight().

Signed-off-by: Yury Norov
---
 lib/bitmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/bitmap.c b/lib/bitmap.c
index 488e6c3e5acc..d56e275db73e 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -953,7 +953,7 @@ static int bitmap_pos_to_ord(const unsigned long *buf, unsigned int pos, unsigne
 	if (pos >= nbits || !test_bit(pos, buf))
 		return -1;
 
-	return __bitmap_weight(buf, pos);
+	return bitmap_weight(buf, pos);
 }
 
 /**

From 24291caf8447f6fc060c8d00136bdc30ee207f38 Mon Sep 17 00:00:00 2001
From: Yury Norov
Date: Sat, 17 Sep 2022 20:07:12 -0700
Subject: lib/bitmap: add bitmap_weight_and()

The function calculates the Hamming weight of (bitmap1 & bitmap2).
Currently, we have to do it like this:

	tmp = bitmap_alloc(nbits);
	bitmap_and(tmp, map1, map2, nbits);
	weight = bitmap_weight(tmp, nbits);
	bitmap_free(tmp);

This requires additional memory, adds pressure on the alloc subsystem,
and is far less cache-friendly than simply:

	weight = bitmap_weight_and(map1, map2, nbits);

The following patches apply it to cpumask functions.

Signed-off-by: Yury Norov
---
 lib/bitmap.c | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/lib/bitmap.c b/lib/bitmap.c
index d56e275db73e..3fc2e338ec30 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -333,20 +333,32 @@ bool __bitmap_subset(const unsigned long *bitmap1,
 }
 EXPORT_SYMBOL(__bitmap_subset);
 
+#define BITMAP_WEIGHT(FETCH, bits)						\
+({										\
+	unsigned int __bits = (bits), idx, w = 0;				\
+										\
+	for (idx = 0; idx < __bits / BITS_PER_LONG; idx++)			\
+		w += hweight_long(FETCH);					\
+										\
+	if (__bits % BITS_PER_LONG)						\
+		w += hweight_long((FETCH) & BITMAP_LAST_WORD_MASK(__bits));	\
+										\
+	w;									\
+})
+
 unsigned int __bitmap_weight(const unsigned long *bitmap, unsigned int bits)
 {
-	unsigned int k, lim = bits/BITS_PER_LONG, w = 0;
-
-	for (k = 0; k < lim; k++)
-		w += hweight_long(bitmap[k]);
-
-	if (bits % BITS_PER_LONG)
-		w += hweight_long(bitmap[k] & BITMAP_LAST_WORD_MASK(bits));
-
-	return w;
+	return BITMAP_WEIGHT(bitmap[idx], bits);
 }
 EXPORT_SYMBOL(__bitmap_weight);
 
+unsigned int __bitmap_weight_and(const unsigned long *bitmap1,
+				const unsigned long *bitmap2, unsigned int bits)
+{
+	return BITMAP_WEIGHT(bitmap1[idx] & bitmap2[idx], bits);
+}
+EXPORT_SYMBOL(__bitmap_weight_and);
+
 void __bitmap_set(unsigned long *map, unsigned int start, int len)
 {
 	unsigned long *p = map + BIT_WORD(start);

From 3cea8d475327756066e2a54f0b651bb7284dd448 Mon Sep 17 00:00:00 2001
From: Yury Norov
Date: Sat, 17 Sep 2022 20:07:13 -0700
Subject: lib: add find_nth{,_and,_andnot}_bit()

The kernel lacks a function that searches for the Nth set bit in a
bitmap. Usually people do it like this:

	for_each_set_bit(bit, mask, size)
		if (n-- == 0)
			return bit;

We can do it more efficiently if we:
1. find the word containing the Nth bit, using hweight(); and
2. find the bit inside that word, using a helper fns() that works
   similarly to __ffs() and ffz().

fns() is implemented as a simple loop. For x86_64, there's the PDEP
instruction to do that: ret = clz(pdep(1 << idx, num)). However, for
large bitmaps most of the improvement comes from using hweight(), so I
kept fns() simple.

The new find_nth_bit() is ~70 times faster on x86_64/kvm in the find_bit
benchmark:

	find_nth_bit:   7154190 ns, 16411 iterations
	for_each_bit: 505493126 ns, 16315 iterations

With all that, a family of 3 new functions is added, and used where
appropriate in the following patches.
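For reference, a minimal sketch of what such a fns() helper can look like,
per the description above: a loop that pops set bits until the n-th one is
reached (the in-tree version may differ in details):

	/*
	 * fns: find N'th set bit in a word.
	 * Returns the bit number of the n-th (0-based) set bit of 'word',
	 * or BITS_PER_LONG if there are fewer than n+1 set bits.
	 */
	static inline unsigned long fns(unsigned long word, unsigned int n)
	{
		unsigned int bit;

		while (word) {
			bit = __ffs(word);		/* lowest set bit */
			if (n-- == 0)
				return bit;
			__clear_bit(bit, &word);	/* pop it and continue */
		}

		return BITS_PER_LONG;
	}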
Signed-off-by: Yury Norov
---
 lib/find_bit.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/lib/find_bit.c b/lib/find_bit.c
index d00ee23ab657..25609974cbe4 100644
--- a/lib/find_bit.c
+++ b/lib/find_bit.c
@@ -68,6 +68,30 @@ out:									\
 	sz;									\
 })
 
+#define FIND_NTH_BIT(FETCH, size, num)						\
+({										\
+	unsigned long sz = (size), nr = (num), idx, w, tmp;			\
+										\
+	for (idx = 0; (idx + 1) * BITS_PER_LONG <= sz; idx++) {		\
+		if (idx * BITS_PER_LONG + nr >= sz)				\
+			goto out;						\
+										\
+		tmp = (FETCH);							\
+		w = hweight_long(tmp);						\
+		if (w > nr)							\
+			goto found;						\
+										\
+		nr -= w;							\
+	}									\
+										\
+	if (sz % BITS_PER_LONG)							\
+		tmp = (FETCH) & BITMAP_LAST_WORD_MASK(sz);			\
+found:										\
+	sz = min(idx * BITS_PER_LONG + fns(tmp, nr), sz);			\
+out:										\
+	sz;									\
+})
+
 #ifndef find_first_bit
 /*
  * Find the first set bit in a memory region.
@@ -111,6 +135,26 @@ unsigned long _find_next_bit(const unsigned long *addr, unsigned long nbits, uns
 EXPORT_SYMBOL(_find_next_bit);
 #endif
 
+unsigned long __find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n)
+{
+	return FIND_NTH_BIT(addr[idx], size, n);
+}
+EXPORT_SYMBOL(__find_nth_bit);
+
+unsigned long __find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2,
+				unsigned long size, unsigned long n)
+{
+	return FIND_NTH_BIT(addr1[idx] & addr2[idx], size, n);
+}
+EXPORT_SYMBOL(__find_nth_and_bit);
+
+unsigned long __find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
+				unsigned long size, unsigned long n)
+{
+	return FIND_NTH_BIT(addr1[idx] & ~addr2[idx], size, n);
+}
+EXPORT_SYMBOL(__find_nth_andnot_bit);
+
 #ifndef find_next_and_bit
 unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2,
 					unsigned long nbits, unsigned long start)
 {
 	return FIND_NEXT_BIT(addr1[idx] & addr2[idx], /* nop */, nbits, start);
 }
 EXPORT_SYMBOL(_find_next_and_bit);
 #endif

From e3783c805db29c8cb3e8dcc8160f6449da1100e3 Mon Sep 17 00:00:00 2001
From: Yury Norov
Date: Sat, 17 Sep 2022 20:07:14 -0700
Subject: lib/bitmap: add tests for find_nth_bit()

Add functional and performance tests for find_nth_bit().
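For reference, a usage sketch of the semantics the tests below verify:
n is 0-based, and if the bitmap contains fewer than n+1 set bits, the
bitmap size is returned (hypothetical stand-alone snippet):

	DECLARE_BITMAP(bmap, 64);

	bitmap_zero(bmap, 64);
	__set_bit(10, bmap);
	__set_bit(20, bmap);

	find_nth_bit(bmap, 64, 0);	/* returns 10: 0th set bit */
	find_nth_bit(bmap, 64, 1);	/* returns 20: 1st set bit */
	find_nth_bit(bmap, 64, 2);	/* returns 64: fewer than 3 bits set */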
Signed-off-by: Yury Norov
---
 lib/find_bit_benchmark.c | 18 ++++++++++++++++++
 lib/test_bitmap.c        | 47 +++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 63 insertions(+), 2 deletions(-)

diff --git a/lib/find_bit_benchmark.c b/lib/find_bit_benchmark.c
index db904b57d4b8..10754586403b 100644
--- a/lib/find_bit_benchmark.c
+++ b/lib/find_bit_benchmark.c
@@ -115,6 +115,22 @@ static int __init test_find_last_bit(const void *bitmap, unsigned long len)
 	return 0;
 }
 
+static int __init test_find_nth_bit(const unsigned long *bitmap, unsigned long len)
+{
+	unsigned long l, n, w = bitmap_weight(bitmap, len);
+	ktime_t time;
+
+	time = ktime_get();
+	for (n = 0; n < w; n++) {
+		l = find_nth_bit(bitmap, len, n);
+		WARN_ON(l >= len);
+	}
+	time = ktime_get() - time;
+	pr_err("find_nth_bit: %18llu ns, %6ld iterations\n", time, w);
+
+	return 0;
+}
+
 static int __init test_find_next_and_bit(const void *bitmap,
 		const void *bitmap2, unsigned long len)
 {
@@ -142,6 +158,7 @@ static int __init find_bit_test(void)
 	test_find_next_bit(bitmap, BITMAP_LEN);
 	test_find_next_zero_bit(bitmap, BITMAP_LEN);
 	test_find_last_bit(bitmap, BITMAP_LEN);
+	test_find_nth_bit(bitmap, BITMAP_LEN / 10);
 
 	/*
 	 * test_find_first_bit() may take some time, so
@@ -164,6 +181,7 @@ static int __init find_bit_test(void)
 	test_find_next_bit(bitmap, BITMAP_LEN);
 	test_find_next_zero_bit(bitmap, BITMAP_LEN);
 	test_find_last_bit(bitmap, BITMAP_LEN);
+	test_find_nth_bit(bitmap, BITMAP_LEN);
 	test_find_first_bit(bitmap, BITMAP_LEN);
 	test_find_first_and_bit(bitmap, bitmap2, BITMAP_LEN);
 	test_find_next_and_bit(bitmap, bitmap2, BITMAP_LEN);

diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c
index 98754ff9fe68..da52dc759c95 100644
--- a/lib/test_bitmap.c
+++ b/lib/test_bitmap.c
@@ -16,6 +16,8 @@
 
 #include "../tools/testing/selftests/kselftest_module.h"
 
+#define EXP1_IN_BITS	(sizeof(exp1) * 8)
+
 KSTM_MODULE_GLOBALS();
 
 static char pbl_buffer[PAGE_SIZE] __initdata;
@@ -219,6 +221,47 @@ static void __init test_zero_clear(void)
 	expect_eq_pbl("", bmap, 1024);
 }
 
+static void __init test_find_nth_bit(void)
+{
+	unsigned long b, bit, cnt = 0;
+	DECLARE_BITMAP(bmap, 64 * 3);
+
+	bitmap_zero(bmap, 64 * 3);
+	__set_bit(10, bmap);
+	__set_bit(20, bmap);
+	__set_bit(30, bmap);
+	__set_bit(40, bmap);
+	__set_bit(50, bmap);
+	__set_bit(60, bmap);
+	__set_bit(80, bmap);
+	__set_bit(123, bmap);
+
+	expect_eq_uint(10, find_nth_bit(bmap, 64 * 3, 0));
+	expect_eq_uint(20, find_nth_bit(bmap, 64 * 3, 1));
+	expect_eq_uint(30, find_nth_bit(bmap, 64 * 3, 2));
+	expect_eq_uint(40, find_nth_bit(bmap, 64 * 3, 3));
+	expect_eq_uint(50, find_nth_bit(bmap, 64 * 3, 4));
+	expect_eq_uint(60, find_nth_bit(bmap, 64 * 3, 5));
+	expect_eq_uint(80, find_nth_bit(bmap, 64 * 3, 6));
+	expect_eq_uint(123, find_nth_bit(bmap, 64 * 3, 7));
+	expect_eq_uint(64 * 3, find_nth_bit(bmap, 64 * 3, 8));
+
+	expect_eq_uint(10, find_nth_bit(bmap, 64 * 3 - 1, 0));
+	expect_eq_uint(20, find_nth_bit(bmap, 64 * 3 - 1, 1));
+	expect_eq_uint(30, find_nth_bit(bmap, 64 * 3 - 1, 2));
+	expect_eq_uint(40, find_nth_bit(bmap, 64 * 3 - 1, 3));
+	expect_eq_uint(50, find_nth_bit(bmap, 64 * 3 - 1, 4));
+	expect_eq_uint(60, find_nth_bit(bmap, 64 * 3 - 1, 5));
+	expect_eq_uint(80, find_nth_bit(bmap, 64 * 3 - 1, 6));
+	expect_eq_uint(123, find_nth_bit(bmap, 64 * 3 - 1, 7));
+	expect_eq_uint(64 * 3 - 1, find_nth_bit(bmap, 64 * 3 - 1, 8));
+
+	for_each_set_bit(bit, exp1, EXP1_IN_BITS) {
+		b = find_nth_bit(exp1, EXP1_IN_BITS, cnt++);
+		expect_eq_uint(b, bit);
+	}
+}
+
 static void __init test_fill_set(void)
 {
 	DECLARE_BITMAP(bmap, 1024);
@@ -557,8 +600,6 @@ static void __init test_bitmap_parse(void)
 	}
 }
 
-#define EXP1_IN_BITS	(sizeof(exp1) * 8)
-
 static void __init test_bitmap_arr32(void)
 {
 	unsigned int nbits, next_bit;
@@ -952,6 +993,8 @@ static void __init selftest(void)
 	test_bitmap_cut();
 	test_bitmap_print_buf();
 	test_bitmap_const_eval();
+
+	test_find_nth_bit();
 }
 
 KSTM_MODULE_LOADERS(test_bitmap);

From 97848c10f9f8a8ce4296b149d06cab424eba05b3 Mon Sep 17 00:00:00 2001
From: Yury Norov
Date: Sat, 17 Sep 2022 20:07:15 -0700
Subject: lib/bitmap: remove bitmap_ord_to_pos

Now that we have find_nth_bit(), we can drop bitmap_ord_to_pos().

Signed-off-by: Yury Norov
---
 lib/bitmap.c | 36 +++---------------------------------
 1 file changed, 3 insertions(+), 33 deletions(-)

diff --git a/lib/bitmap.c b/lib/bitmap.c
index 3fc2e338ec30..1c81413c51f8 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -968,36 +968,6 @@ static int bitmap_pos_to_ord(const unsigned long *buf, unsigned int pos, unsigne
 	return bitmap_weight(buf, pos);
 }
 
-/**
- * bitmap_ord_to_pos - find position of n-th set bit in bitmap
- * @buf: pointer to bitmap
- * @ord: ordinal bit position (n-th set bit, n >= 0)
- * @nbits: number of valid bit positions in @buf
- *
- * Map the ordinal offset of bit @ord in @buf to its position in @buf.
- * Value of @ord should be in range 0 <= @ord < weight(buf). If @ord
- * >= weight(buf), returns @nbits.
- *
- * If for example, just bits 4 through 7 are set in @buf, then @ord
- * values 0 through 3 will get mapped to 4 through 7, respectively,
- * and all other @ord values returns @nbits. When @ord value 3
- * gets mapped to (returns) @pos value 7 in this example, that means
- * that the 3rd set bit (starting with 0th) is at position 7 in @buf.
- *
- * The bit positions 0 through @nbits-1 are valid positions in @buf.
- */
-unsigned int bitmap_ord_to_pos(const unsigned long *buf, unsigned int ord, unsigned int nbits)
-{
-	unsigned int pos;
-
-	for (pos = find_first_bit(buf, nbits);
-	     pos < nbits && ord;
-	     pos = find_next_bit(buf, nbits, pos + 1))
-		ord--;
-
-	return pos;
-}
-
 /**
  * bitmap_remap - Apply map defined by a pair of bitmaps to another bitmap
  * @dst: remapped result
@@ -1047,7 +1017,7 @@ void bitmap_remap(unsigned long *dst, const unsigned long *src,
 		if (n < 0 || w == 0)
 			set_bit(oldbit, dst);	/* identity map */
 		else
-			set_bit(bitmap_ord_to_pos(new, n % w, nbits), dst);
+			set_bit(find_nth_bit(new, nbits, n % w), dst);
 	}
 }
 EXPORT_SYMBOL(bitmap_remap);
@@ -1086,7 +1056,7 @@ int bitmap_bitremap(int oldbit, const unsigned long *old,
 	if (n < 0 || w == 0)
 		return oldbit;
 	else
-		return bitmap_ord_to_pos(new, n % w, bits);
+		return find_nth_bit(new, bits, n % w);
 }
 EXPORT_SYMBOL(bitmap_bitremap);
 
@@ -1210,7 +1180,7 @@ void bitmap_onto(unsigned long *dst, const unsigned long *orig,
 	 * The following code is a more efficient, but less
 	 * obvious, equivalent to the loop:
 	 *	for (m = 0; m < bitmap_weight(relmap, bits); m++) {
-	 *		n = bitmap_ord_to_pos(orig, m, bits);
+	 *		n = find_nth_bit(orig, bits, m);
 	 *		if (test_bit(m, orig))
 	 *			set_bit(n, dst);
 	 *	}

From 944c417daeb63fa345fe0f754c57a5a23ca6d701 Mon Sep 17 00:00:00 2001
From: Yury Norov
Date: Sat, 17 Sep 2022 20:07:16 -0700
Subject: cpumask: add cpumask_nth_{,and,andnot}

Add cpumask_nth_{,and,andnot} as wrappers around the corresponding find
functions, and use them in cpumask_local_spread().
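A sketch of what such wrappers can look like (simplified, hypothetical
versions built directly on the find_nth functions added earlier in this
series; the real ones belong in the cpumask header and include the usual
sanity checks):

	/* Simplified sketches; names follow the subject line */
	static inline unsigned int cpumask_nth(unsigned int n, const struct cpumask *srcp)
	{
		return find_nth_bit(cpumask_bits(srcp), nr_cpumask_bits, n);
	}

	static inline unsigned int cpumask_nth_and(unsigned int n, const struct cpumask *srcp1,
						const struct cpumask *srcp2)
	{
		return find_nth_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2),
					nr_cpumask_bits, n);
	}

	static inline unsigned int cpumask_nth_andnot(unsigned int n, const struct cpumask *srcp1,
						const struct cpumask *srcp2)
	{
		return find_nth_andnot_bit(cpumask_bits(srcp1), cpumask_bits(srcp2),
						nr_cpumask_bits, n);
	}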
Signed-off-by: Yury Norov
---
 lib/cpumask.c | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/lib/cpumask.c b/lib/cpumask.c
index f0ae119be8c4..2c4a63b6f03f 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -128,23 +128,21 @@ unsigned int cpumask_local_spread(unsigned int i, int node)
 	i %= num_online_cpus();
 
 	if (node == NUMA_NO_NODE) {
-		for_each_cpu(cpu, cpu_online_mask)
-			if (i-- == 0)
-				return cpu;
+		cpu = cpumask_nth(i, cpu_online_mask);
+		if (cpu < nr_cpu_ids)
+			return cpu;
 	} else {
 		/* NUMA first. */
-		for_each_cpu_and(cpu, cpumask_of_node(node), cpu_online_mask)
-			if (i-- == 0)
-				return cpu;
-
-		for_each_cpu(cpu, cpu_online_mask) {
-			/* Skip NUMA nodes, done above. */
-			if (cpumask_test_cpu(cpu, cpumask_of_node(node)))
-				continue;
-
-			if (i-- == 0)
-				return cpu;
-		}
+		cpu = cpumask_nth_and(i, cpu_online_mask, cpumask_of_node(node));
+		if (cpu < nr_cpu_ids)
+			return cpu;
+
+		i -= cpumask_weight_and(cpu_online_mask, cpumask_of_node(node));
+
+		/* Skip NUMA nodes, done above. */
+		cpu = cpumask_nth_andnot(i, cpu_online_mask, cpumask_of_node(node));
+		if (cpu < nr_cpu_ids)
+			return cpu;
 	}
 	BUG();
 }

From 6cc18331a987c4a29d66b9c4fd292587fba4d7bd Mon Sep 17 00:00:00 2001
From: Yury Norov
Date: Mon, 19 Sep 2022 14:05:56 -0700
Subject: lib/find_bit: add find_next{,_and}_bit_wrap

The helper is better optimized for the worst case: in the case of an
empty cpumask, the current code traverses 2 * size bits:

	next = cpumask_next_and(prev, src1p, src2p);
	if (next >= nr_cpu_ids)
		next = cpumask_first_and(src1p, src2p);

At the bitmap level we can stop earlier, after checking 'size + offset'
bits.

Signed-off-by: Yury Norov
---
 lib/cpumask.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/lib/cpumask.c b/lib/cpumask.c
index 2c4a63b6f03f..c7c392514fd3 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -166,10 +166,8 @@ unsigned int cpumask_any_and_distribute(const struct cpumask *src1p,
 	/* NOTE: our first selection will skip 0. */
 	prev = __this_cpu_read(distribute_cpu_mask_prev);
 
-	next = cpumask_next_and(prev, src1p, src2p);
-	if (next >= nr_cpu_ids)
-		next = cpumask_first_and(src1p, src2p);
-
+	next = find_next_and_bit_wrap(cpumask_bits(src1p), cpumask_bits(src2p),
+					nr_cpumask_bits, prev + 1);
 	if (next < nr_cpu_ids)
 		__this_cpu_write(distribute_cpu_mask_prev, next);
 
@@ -183,11 +181,7 @@ unsigned int cpumask_any_distribute(const struct cpumask *srcp)
 	/* NOTE: our first selection will skip 0. */
 	prev = __this_cpu_read(distribute_cpu_mask_prev);
-
-	next = cpumask_next(prev, srcp);
-	if (next >= nr_cpu_ids)
-		next = cpumask_first(srcp);
-
+	next = find_next_bit_wrap(cpumask_bits(srcp), nr_cpumask_bits, prev + 1);
 	if (next < nr_cpu_ids)
 		__this_cpu_write(distribute_cpu_mask_prev, next);
 

From 8173aa26260e6d0153db0c7135d41a4da612da5b Mon Sep 17 00:00:00 2001
From: Yury Norov
Date: Mon, 19 Sep 2022 14:05:59 -0700
Subject: lib/bitmap: add tests for for_each() loops

We have a test for test_for_each_set_clump8() only. Add basic tests for
the others.
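Among the others is the new for_each_set_bit_wrap(). A sketch of the
semantics the test below relies on (every set bit is visited exactly once,
starting at 'start' and wrapping around; hypothetical stand-alone snippet):

	DECLARE_BITMAP(bmap, 16);
	unsigned int bit;

	bitmap_zero(bmap, 16);
	__set_bit(3, bmap);
	__set_bit(7, bmap);

	/* Starts at bit 5: visits bit 7 first, then wraps and visits bit 3 */
	for_each_set_bit_wrap(bit, bmap, 16, 5)
		pr_info("bit %u\n", bit);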
Signed-off-by: Yury Norov
---
 lib/test_bitmap.c | 244 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 243 insertions(+), 1 deletion(-)

diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c
index da52dc759c95..a8005ad3bd58 100644
--- a/lib/test_bitmap.c
+++ b/lib/test_bitmap.c
@@ -726,6 +726,239 @@ static void __init test_for_each_set_clump8(void)
 		expect_eq_clump8(start, CLUMP_EXP_NUMBITS, clump_exp, &clump);
 }
 
+static void __init test_for_each_set_bit_wrap(void)
+{
+	DECLARE_BITMAP(orig, 500);
+	DECLARE_BITMAP(copy, 500);
+	unsigned int wr, bit;
+
+	bitmap_zero(orig, 500);
+
+	/* Set individual bits */
+	for (bit = 0; bit < 500; bit += 10)
+		bitmap_set(orig, bit, 1);
+
+	/* Set range of bits */
+	bitmap_set(orig, 100, 50);
+
+	for (wr = 0; wr < 500; wr++) {
+		bitmap_zero(copy, 500);
+
+		for_each_set_bit_wrap(bit, orig, 500, wr)
+			bitmap_set(copy, bit, 1);
+
+		expect_eq_bitmap(orig, copy, 500);
+	}
+}
+
+static void __init test_for_each_set_bit(void)
+{
+	DECLARE_BITMAP(orig, 500);
+	DECLARE_BITMAP(copy, 500);
+	unsigned int bit;
+
+	bitmap_zero(orig, 500);
+	bitmap_zero(copy, 500);
+
+	/* Set individual bits */
+	for (bit = 0; bit < 500; bit += 10)
+		bitmap_set(orig, bit, 1);
+
+	/* Set range of bits */
+	bitmap_set(orig, 100, 50);
+
+	for_each_set_bit(bit, orig, 500)
+		bitmap_set(copy, bit, 1);
+
+	expect_eq_bitmap(orig, copy, 500);
+}
+
+static void __init test_for_each_set_bit_from(void)
+{
+	DECLARE_BITMAP(orig, 500);
+	DECLARE_BITMAP(copy, 500);
+	unsigned int wr, bit;
+
+	bitmap_zero(orig, 500);
+
+	/* Set individual bits */
+	for (bit = 0; bit < 500; bit += 10)
+		bitmap_set(orig, bit, 1);
+
+	/* Set range of bits */
+	bitmap_set(orig, 100, 50);
+
+	for (wr = 0; wr < 500; wr++) {
+		DECLARE_BITMAP(tmp, 500);
+
+		bitmap_zero(copy, 500);
+		bit = wr;
+
+		for_each_set_bit_from(bit, orig, 500)
+			bitmap_set(copy, bit, 1);
+
+		bitmap_copy(tmp, orig, 500);
+		bitmap_clear(tmp, 0, wr);
+		expect_eq_bitmap(tmp, copy, 500);
+	}
+}
+
+static void __init test_for_each_clear_bit(void)
+{
+	DECLARE_BITMAP(orig, 500);
+	DECLARE_BITMAP(copy, 500);
+	unsigned int bit;
+
+	bitmap_fill(orig, 500);
+	bitmap_fill(copy, 500);
+
+	/* Set individual bits */
+	for (bit = 0; bit < 500; bit += 10)
+		bitmap_clear(orig, bit, 1);
+
+	/* Set range of bits */
+	bitmap_clear(orig, 100, 50);
+
+	for_each_clear_bit(bit, orig, 500)
+		bitmap_clear(copy, bit, 1);
+
+	expect_eq_bitmap(orig, copy, 500);
+}
+
+static void __init test_for_each_clear_bit_from(void)
+{
+	DECLARE_BITMAP(orig, 500);
+	DECLARE_BITMAP(copy, 500);
+	unsigned int wr, bit;
+
+	bitmap_fill(orig, 500);
+
+	/* Set individual bits */
+	for (bit = 0; bit < 500; bit += 10)
+		bitmap_clear(orig, bit, 1);
+
+	/* Set range of bits */
+	bitmap_clear(orig, 100, 50);
+
+	for (wr = 0; wr < 500; wr++) {
+		DECLARE_BITMAP(tmp, 500);
+
+		bitmap_fill(copy, 500);
+		bit = wr;
+
+		for_each_clear_bit_from(bit, orig, 500)
+			bitmap_clear(copy, bit, 1);
+
+		bitmap_copy(tmp, orig, 500);
+		bitmap_set(tmp, 0, wr);
+		expect_eq_bitmap(tmp, copy, 500);
+	}
+}
+
+static void __init test_for_each_set_bitrange(void)
+{
+	DECLARE_BITMAP(orig, 500);
+	DECLARE_BITMAP(copy, 500);
+	unsigned int s, e;
+
+	bitmap_zero(orig, 500);
+	bitmap_zero(copy, 500);
+
+	/* Set individual bits */
+	for (s = 0; s < 500; s += 10)
+		bitmap_set(orig, s, 1);
+
+	/* Set range of bits */
+	bitmap_set(orig, 100, 50);
+
+	for_each_set_bitrange(s, e, orig, 500)
+		bitmap_set(copy, s, e - s);
+
+	expect_eq_bitmap(orig, copy, 500);
+}
+
+static void __init test_for_each_clear_bitrange(void)
+{
+	DECLARE_BITMAP(orig, 500);
+	DECLARE_BITMAP(copy, 500);
+	unsigned int s, e;
+
+	bitmap_fill(orig, 500);
+	bitmap_fill(copy, 500);
+
+	/* Set individual bits */
+	for (s = 0; s < 500; s += 10)
+		bitmap_clear(orig, s, 1);
+
+	/* Set range of bits */
+	bitmap_clear(orig, 100, 50);
+
+	for_each_clear_bitrange(s, e, orig, 500)
+		bitmap_clear(copy, s, e - s);
+
+	expect_eq_bitmap(orig, copy, 500);
+}
+
+static void __init test_for_each_set_bitrange_from(void)
+{
+	DECLARE_BITMAP(orig, 500);
+	DECLARE_BITMAP(copy, 500);
+	unsigned int wr, s, e;
+
+	bitmap_zero(orig, 500);
+
+	/* Set individual bits */
+	for (s = 0; s < 500; s += 10)
+		bitmap_set(orig, s, 1);
+
+	/* Set range of bits */
+	bitmap_set(orig, 100, 50);
+
+	for (wr = 0; wr < 500; wr++) {
+		DECLARE_BITMAP(tmp, 500);
+
+		bitmap_zero(copy, 500);
+		s = wr;
+
+		for_each_set_bitrange_from(s, e, orig, 500)
+			bitmap_set(copy, s, e - s);
+
+		bitmap_copy(tmp, orig, 500);
+		bitmap_clear(tmp, 0, wr);
+		expect_eq_bitmap(tmp, copy, 500);
+	}
+}
+
+static void __init test_for_each_clear_bitrange_from(void)
+{
+	DECLARE_BITMAP(orig, 500);
+	DECLARE_BITMAP(copy, 500);
+	unsigned int wr, s, e;
+
+	bitmap_fill(orig, 500);
+
+	/* Set individual bits */
+	for (s = 0; s < 500; s += 10)
+		bitmap_clear(orig, s, 1);
+
+	/* Set range of bits */
+	bitmap_set(orig, 100, 50);
+
+	for (wr = 0; wr < 500; wr++) {
+		DECLARE_BITMAP(tmp, 500);
+
+		bitmap_fill(copy, 500);
+		s = wr;
+
+		for_each_clear_bitrange_from(s, e, orig, 500)
+			bitmap_clear(copy, s, e - s);
+
+		bitmap_copy(tmp, orig, 500);
+		bitmap_set(tmp, 0, wr);
+		expect_eq_bitmap(tmp, copy, 500);
+	}
+}
+
 struct test_bitmap_cut {
 	unsigned int first;
 	unsigned int cut;
@@ -989,12 +1222,21 @@ static void __init selftest(void)
 	test_bitmap_parselist();
 	test_bitmap_printlist();
 	test_mem_optimisations();
-	test_for_each_set_clump8();
 	test_bitmap_cut();
 	test_bitmap_print_buf();
 	test_bitmap_const_eval();
 
 	test_find_nth_bit();
+	test_for_each_set_bit();
+	test_for_each_set_bit_from();
+	test_for_each_clear_bit();
+	test_for_each_clear_bit_from();
+	test_for_each_set_bitrange();
+	test_for_each_clear_bitrange();
+	test_for_each_set_bitrange_from();
+	test_for_each_clear_bitrange_from();
+	test_for_each_set_clump8();
+	test_for_each_set_bit_wrap();
 }
 
 KSTM_MODULE_LOADERS(test_bitmap);

From 90d482908eedd56f01a707325aa541cf9c40f936 Mon Sep 17 00:00:00 2001
From: Valentin Schneider
Date: Mon, 3 Oct 2022 16:34:17 +0100
Subject: lib/find_bit: Introduce find_next_andnot_bit()

In preparation for introducing for_each_cpu_andnot(), add a variant of
find_next_bit() that negates the bits in @addr2 when ANDing them with
the bits in @addr1.
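A sketch of how iterators can be built on top of the new primitive
(simplified, hypothetical macros; the actual for_each_cpu_andnot() arrives
in a follow-up patch):

	/* Visit each bit set in *addr1 and clear in *addr2 */
	#define for_each_andnot_bit(bit, addr1, addr2, size)			\
		for ((bit) = 0;							\
		     (bit) = find_next_andnot_bit((addr1), (addr2), (size), (bit)), \
		     (bit) < (size);						\
		     (bit)++)

	/* Visit each CPU present in mask1 but not in mask2 */
	#define for_each_cpu_andnot(cpu, mask1, mask2)				\
		for_each_andnot_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), \
				    nr_cpumask_bits)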
Signed-off-by: Valentin Schneider
---
 lib/find_bit.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/lib/find_bit.c b/lib/find_bit.c
index 25609974cbe4..18bc0a7ac8ee 100644
--- a/lib/find_bit.c
+++ b/lib/find_bit.c
@@ -164,6 +164,15 @@ unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long
 EXPORT_SYMBOL(_find_next_and_bit);
 #endif
 
+#ifndef find_next_andnot_bit
+unsigned long _find_next_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
+					unsigned long nbits, unsigned long start)
+{
+	return FIND_NEXT_BIT(addr1[idx] & ~addr2[idx], /* nop */, nbits, start);
+}
+EXPORT_SYMBOL(_find_next_andnot_bit);
+#endif
+
 #ifndef find_next_zero_bit
 unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits,
 					unsigned long start)

From 49937cd12331cc3966b3f8628253fe62fbbd35a1 Mon Sep 17 00:00:00 2001
From: Valentin Schneider
Date: Mon, 3 Oct 2022 16:34:19 +0100
Subject: lib/test_cpumask: Add for_each_cpu_and(not) tests

Following the recent introduction of for_each_andnot(), add some tests to
ensure that iterating with for_each_cpu_and(not) gives the same result as
iterating over the result of cpumask_and(not)().

Suggested-by: Yury Norov
Signed-off-by: Valentin Schneider
---
 lib/cpumask_kunit.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/lib/cpumask_kunit.c b/lib/cpumask_kunit.c
index ecbeec72221e..d1fc6ece21f3 100644
--- a/lib/cpumask_kunit.c
+++ b/lib/cpumask_kunit.c
@@ -33,6 +33,19 @@
 		KUNIT_EXPECT_EQ_MSG((test), nr_cpu_ids - mask_weight, iter, MASK_MSG(mask));	\
 	} while (0)
 
+#define EXPECT_FOR_EACH_CPU_OP_EQ(test, op, mask1, mask2)			\
+	do {									\
+		const cpumask_t *m1 = (mask1);					\
+		const cpumask_t *m2 = (mask2);					\
+		int weight;							\
+		int cpu, iter = 0;						\
+		cpumask_##op(&mask_tmp, m1, m2);				\
+		weight = cpumask_weight(&mask_tmp);				\
+		for_each_cpu_##op(cpu, mask1, mask2)				\
+			iter++;							\
+		KUNIT_EXPECT_EQ((test), weight, iter);				\
+	} while (0)
+
 #define EXPECT_FOR_EACH_CPU_WRAP_EQ(test, mask)					\
 	do {									\
 		const cpumask_t *m = (mask);					\
@@ -54,6 +67,7 @@
 
 static cpumask_t mask_empty;
 static cpumask_t mask_all;
+static cpumask_t mask_tmp;
 
 static void test_cpumask_weight(struct kunit *test)
 {
@@ -101,10 +115,15 @@ static void test_cpumask_iterators(struct kunit *test)
 	EXPECT_FOR_EACH_CPU_EQ(test, &mask_empty);
 	EXPECT_FOR_EACH_CPU_NOT_EQ(test, &mask_empty);
 	EXPECT_FOR_EACH_CPU_WRAP_EQ(test, &mask_empty);
+	EXPECT_FOR_EACH_CPU_OP_EQ(test, and, &mask_empty, &mask_empty);
+	EXPECT_FOR_EACH_CPU_OP_EQ(test, and, cpu_possible_mask, &mask_empty);
+	EXPECT_FOR_EACH_CPU_OP_EQ(test, andnot, &mask_empty, &mask_empty);
 
 	EXPECT_FOR_EACH_CPU_EQ(test, cpu_possible_mask);
 	EXPECT_FOR_EACH_CPU_NOT_EQ(test, cpu_possible_mask);
 	EXPECT_FOR_EACH_CPU_WRAP_EQ(test, cpu_possible_mask);
+	EXPECT_FOR_EACH_CPU_OP_EQ(test, and, cpu_possible_mask, cpu_possible_mask);
+	EXPECT_FOR_EACH_CPU_OP_EQ(test, andnot, cpu_possible_mask, &mask_empty);
 }
 
 static void test_cpumask_iterators_builtin(struct kunit *test)