diff options
author | Michael Brown <mcb30@ipxe.org> | 2024-09-19 16:23:32 +0100 |
---|---|---|
committer | Michael Brown <mcb30@ipxe.org> | 2024-09-23 13:19:58 +0100 |
commit | 3def13265d9475c861eed1a101584b761e97ae33 (patch) | |
tree | 5ce095cf73fa7fde7baf8ef65fb826a0881d80e2 /src/crypto | |
parent | 59d123658bfe25402c4e89bbaf6eea83140d3c1f (diff) | |
download | ipxe-3def13265d9475c861eed1a101584b761e97ae33.tar.gz |
[crypto] Use constant-time big integer multiplication
Big integer multiplication currently performs immediate carry
propagation from each step of the long multiplication, relying on the
fact that the overall result has a known maximum value to minimise the
number of carries performed without ever needing to explicitly check
against the result buffer size.
This is not a constant-time algorithm, since the number of carries
performed will be a function of the input values. We could make it
constant-time by always continuing to propagate the carry until
reaching the end of the result buffer, but this would introduce a
large number of redundant zero carries.
Require callers of bigint_multiply() to provide a temporary carry
storage buffer, of the same size as the result buffer. This allows
the carry-out from the accumulation of each double-element product to
be accumulated in the temporary carry space, and then added in via a
single call to bigint_add() after the multiplication is complete.
Since the structure of big integer multiplication is identical across
all current CPU architectures, provide a single shared implementation
of bigint_multiply(). The architecture-specific operation then
becomes the multiplication of two big integer elements and the
accumulation of the double-element product.
Note that any intermediate carry arising from accumulating the lower
half of the double-element product may be added to the upper half of
the double-element product without risk of overflow, since the result
of multiplying two n-bit integers can never have all n bits set in its
upper half. This simplifies the carry calculations for architectures
such as RISC-V and LoongArch64 that do not have a carry flag.
Signed-off-by: Michael Brown <mcb30@ipxe.org>
Diffstat (limited to 'src/crypto')
-rw-r--r-- | src/crypto/bigint.c | 117 | ||||
-rw-r--r-- | src/crypto/x25519.c | 83 |
2 files changed, 169 insertions, 31 deletions
diff --git a/src/crypto/bigint.c b/src/crypto/bigint.c index 656f979e5..5b7116e28 100644 --- a/src/crypto/bigint.c +++ b/src/crypto/bigint.c @@ -76,6 +76,115 @@ void bigint_swap_raw ( bigint_element_t *first0, bigint_element_t *second0, } /** + * Multiply big integers + * + * @v multiplicand0 Element 0 of big integer to be multiplied + * @v multiplicand_size Number of elements in multiplicand + * @v multiplier0 Element 0 of big integer to be multiplied + * @v multiplier_size Number of elements in multiplier + * @v result0 Element 0 of big integer to hold result + * @v carry0 Element 0 of big integer to hold temporary carry + */ +void bigint_multiply_raw ( const bigint_element_t *multiplicand0, + unsigned int multiplicand_size, + const bigint_element_t *multiplier0, + unsigned int multiplier_size, + bigint_element_t *result0, + bigint_element_t *carry0 ) { + unsigned int result_size = ( multiplicand_size + multiplier_size ); + const bigint_t ( multiplicand_size ) __attribute__ (( may_alias )) + *multiplicand = ( ( const void * ) multiplicand0 ); + const bigint_t ( multiplier_size ) __attribute__ (( may_alias )) + *multiplier = ( ( const void * ) multiplier0 ); + bigint_t ( result_size ) __attribute__ (( may_alias )) + *result = ( ( void * ) result0 ); + bigint_t ( result_size ) __attribute__ (( may_alias )) + *carry = ( ( void * ) carry0 ); + bigint_element_t multiplicand_element; + const bigint_element_t *multiplier_element; + bigint_element_t *result_elements; + bigint_element_t *carry_element; + unsigned int i; + unsigned int j; + + /* Zero result and temporary carry space */ + memset ( result, 0, sizeof ( *result ) ); + memset ( carry, 0, sizeof ( *carry ) ); + + /* Multiply integers one element at a time, adding the double + * element directly into the result and accumulating any + * overall carry out from this double-element addition into + * the temporary carry space. + * + * We could propagate the carry immediately instead of using a + * temporary carry space. However, this would cause the + * multiplication to run in non-constant time, which is + * undesirable. + * + * The carry elements can never overflow, provided that the + * element size is large enough to accommodate any plausible + * big integer. The total number of potential carries (across + * all elements) is the sum of the number of elements in the + * multiplicand and multiplier. With a 16-bit element size, + * this therefore allows for up to a 1Mbit multiplication + * result (e.g. a 512kbit integer multiplied by another + * 512kbit integer), which is around 100x higher than could be + * needed in practice. With a more realistic 32-bit element + * size, the limit becomes a totally implausible 128Gbit + * multiplication result. + */ + for ( i = 0 ; i < multiplicand_size ; i++ ) { + multiplicand_element = multiplicand->element[i]; + multiplier_element = &multiplier->element[0]; + result_elements = &result->element[i]; + carry_element = &carry->element[i]; + for ( j = 0 ; j < multiplier_size ; j++ ) { + bigint_multiply_one ( multiplicand_element, + *(multiplier_element++), + result_elements++, + carry_element++ ); + } + } + + /* Add the temporary carry into the result. The least + * significant element of the carry represents the carry out + * from multiplying the least significant elements of the + * multiplicand and multiplier, and therefore must be added to + * the third-least significant element of the result (i.e. the + * carry needs to be shifted left by two elements before being + * adding to the result). + * + * The most significant two elements of the carry are + * guaranteed to be zero, since: + * + * a < 2^{n}, b < 2^{m} => ab < 2^{n+m} + * + * and the overall result of the multiplication (including + * adding in the shifted carries) is therefore guaranteed not + * to overflow beyond the end of the result. + * + * We could avoid this shifting by writing the carry directly + * into the "correct" element during the element-by-element + * multiplication stage above. However, this would add + * complexity to the loop since we would have to arrange for + * the (provably zero) most significant two carry out results + * to be discarded, in order to avoid writing beyond the end + * of the temporary carry space. + * + * Performing the logical shift is essentially free, since we + * simply adjust the element pointers. + * + * To avoid requiring additional checks in each architecture's + * implementation of bigint_add_raw(), we explicitly avoid + * calling bigint_add_raw() with a size of zero. + */ + if ( result_size > 2 ) { + bigint_add_raw ( &carry->element[0], &result->element[2], + ( result_size - 2 ) ); + } +} + +/** * Perform modular multiplication of big integers * * @v multiplicand0 Element 0 of big integer to be multiplied @@ -100,7 +209,10 @@ void bigint_mod_multiply_raw ( const bigint_element_t *multiplicand0, ( ( void * ) result0 ); struct { bigint_t ( size * 2 ) result; - bigint_t ( size * 2 ) modulus; + union { + bigint_t ( size * 2 ) modulus; + bigint_t ( size * 2 ) carry; + }; } *temp = tmp; int rotation; int i; @@ -113,7 +225,8 @@ void bigint_mod_multiply_raw ( const bigint_element_t *multiplicand0, /* Perform multiplication */ profile_start ( &bigint_mod_multiply_multiply_profiler ); - bigint_multiply ( multiplicand, multiplier, &temp->result ); + bigint_multiply ( multiplicand, multiplier, &temp->result, + &temp->carry ); profile_stop ( &bigint_mod_multiply_multiply_profiler ); /* Rescale modulus to match result */ diff --git a/src/crypto/x25519.c b/src/crypto/x25519.c index d58f7168c..553f43d34 100644 --- a/src/crypto/x25519.c +++ b/src/crypto/x25519.c @@ -43,7 +43,7 @@ FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL ); * Storage size of each big integer 128 40 * (in bytes) * - * Stack usage for key exchange 1144 360 + * Stack usage for key exchange 1144 424 * (in bytes, large objects only) * * Cost of big integer addition 16 5 @@ -207,35 +207,60 @@ union x25519_multiply_step3 { * We overlap the buffers used by each step of the multiplication * calculation to reduce the total stack space required: * - * |--------------------------------------------------------| - * | <- pad -> | <------------ step 1 result -------------> | - * | | <- low 256 bits -> | <-- high 260 bits --> | - * | <------- step 2 result ------> | <-- step 3 result --> | - * |--------------------------------------------------------| + * |--------------------------------------------------------------------------| + * | <------- step 1 carry ------> | <----------- step 1 result ------------> | + * | | <- low 256 bits -> | <- high 260 bits -> | + * | <- step 2 carry -> | <-- step 2 result --> | <pad> | | + * | <- s3 carry -> | <--------- pad ---------> | <- step 3 result -> | | + * |--------------------------------------------------------------------------| */ union x25519_multiply_workspace { - /** Step 1 result */ + /** Step 1 */ struct { - /** Padding to avoid collision between steps 1 and 2 - * - * The step 2 multiplication consumes the high 260 - * bits of step 1, and so the step 2 multiplication - * result must not overlap this portion of the step 1 - * result. - */ - uint8_t pad[ sizeof ( union x25519_multiply_step2 ) - - offsetof ( union x25519_multiply_step1, - parts.high_260bit ) ]; + /** Step 1 temporary carry workspace */ + union x25519_multiply_step1 carry; /** Step 1 result */ - union x25519_multiply_step1 step1; - } __attribute__ (( packed )); - /** Steps 2 and 3 results */ + union x25519_multiply_step1 result; + } __attribute__ (( packed )) step1; + /** Step 2 + * + * The step 2 multiplication consumes the high 260 bits of + * step 1, and so the step 2 multiplication result (and + * temporary carry workspace) must not overlap this portion of + * the step 1 result. + */ struct { + /** Step 2 temporary carry workspace */ + union x25519_multiply_step2 carry; /** Step 2 result */ - union x25519_multiply_step2 step2; + union x25519_multiply_step2 result; + /** Avoid collision between step 1 result and step 2 result */ + uint8_t pad[ ( int ) + ( sizeof ( union x25519_multiply_step1 ) + + offsetof ( union x25519_multiply_step1, + parts.high_260bit ) - + sizeof ( union x25519_multiply_step2 ) - + sizeof ( union x25519_multiply_step2 ) ) ]; + } __attribute__ (( packed )) step2; + /** Step 3 + * + * The step 3 multiplication consumes the high 11 bits of step + * 2, and so the step 3 multiplication result (and temporary + * carry workspace) must not overlap this portion of the step + * 2 result. + */ + struct { + /** Step 3 temporary carry workspace */ + union x25519_multiply_step3 carry; + /** Avoid collision between step 2 result and step 3 carry */ + uint8_t pad1[ ( int ) + ( sizeof ( union x25519_multiply_step2 ) - + sizeof ( union x25519_multiply_step3 ) ) ]; + /** Avoid collision between step 2 result and step 3 result */ + uint8_t pad2[ sizeof ( union x25519_multiply_step2 ) ]; /** Step 3 result */ - union x25519_multiply_step3 step3; - } __attribute__ (( packed )); + union x25519_multiply_step3 result; + } __attribute__ (( packed )) step3; }; /** An X25519 elliptic curve point in projective coordinates @@ -426,9 +451,9 @@ void x25519_multiply ( const union x25519_oct258 *multiplicand, const union x25519_oct258 *multiplier, union x25519_quad257 *result ) { union x25519_multiply_workspace tmp; - union x25519_multiply_step1 *step1 = &tmp.step1; - union x25519_multiply_step2 *step2 = &tmp.step2; - union x25519_multiply_step3 *step3 = &tmp.step3; + union x25519_multiply_step1 *step1 = &tmp.step1.result; + union x25519_multiply_step2 *step2 = &tmp.step2.result; + union x25519_multiply_step3 *step3 = &tmp.step3.result; /* Step 1: perform raw multiplication * @@ -439,7 +464,7 @@ void x25519_multiply ( const union x25519_oct258 *multiplicand, */ static_assert ( sizeof ( step1->product ) >= sizeof ( step1->parts ) ); bigint_multiply ( &multiplicand->value, &multiplier->value, - &step1->product ); + &step1->product, &tmp.step1.carry.product ); /* Step 2: reduce high-order 516-256=260 bits of step 1 result * @@ -465,7 +490,7 @@ void x25519_multiply ( const union x25519_oct258 *multiplicand, static_assert ( sizeof ( step2->product ) >= sizeof ( step2->parts ) ); bigint_grow ( &step1->parts.low_256bit, &result->value ); bigint_multiply ( &step1->parts.high_260bit, &x25519_reduce_256, - &step2->product ); + &step2->product, &tmp.step2.carry.product ); bigint_add ( &result->value, &step2->value ); /* Step 3: reduce high-order 267-256=11 bits of step 2 result @@ -503,7 +528,7 @@ void x25519_multiply ( const union x25519_oct258 *multiplicand, memset ( &step3->value, 0, sizeof ( step3->value ) ); bigint_grow ( &step2->parts.low_256bit, &result->value ); bigint_multiply ( &step2->parts.high_11bit, &x25519_reduce_256, - &step3->product ); + &step3->product, &tmp.step3.carry.product ); bigint_add ( &step3->value, &result->value ); /* Step 1 calculates the product of the input operands, and |