author     Jason A. Donenfeld <Jason@zx2c4.com>        2020-01-05 22:40:48 -0500
committer  Herbert Xu <herbert@gondor.apana.org.au>    2020-01-16 15:18:12 +0800
commit     d7d7b853566254648df59f7ea27ea05952a6cfa8 (patch)
tree       1dd0bf094b188627ba18438f51df753d5d969086 /arch/x86/crypto/poly1305-x86_64-cryptogams.pl
parent     0896ca2a0cb6127e8a129f1f2a680d49b6b0f65c (diff)
download   linux-d7d7b853566254648df59f7ea27ea05952a6cfa8.tar.gz
crypto: x86/poly1305 - wire up faster implementations for kernel
These x86_64 vectorized implementations support AVX, AVX-2, and AVX-512F.
The AVX-512F implementation is disabled on Skylake due to frequency
throttling, but it is quite fast on Cannon Lake and newer.
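The Skylake exclusion itself is not part of this assembly; it belongs in the
C glue that picks a codepath at init time. A minimal, hypothetical sketch of
such gating — the flag name and init function are illustrative stand-ins, not
the actual glue added by this series — could look like:

#include <linux/module.h>
#include <asm/cpufeature.h>
#include <asm/processor.h>
#include <asm/intel-family.h>

/* Illustrative flag; the real glue has its own mechanism for this. */
static bool poly1305_use_avx512 __ro_after_init;

static int __init poly1305_simd_mod_init(void)
{
        /*
         * Using ZMM registers downclocks Skylake-X, so only take the
         * AVX-512F path on parts where it is a clear win (Cannon Lake
         * and newer).
         */
        poly1305_use_avx512 =
                IS_ENABLED(CONFIG_AS_AVX512) &&
                boot_cpu_has(X86_FEATURE_AVX512F) &&
                boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X;
        return 0;
}
module_init(poly1305_simd_mod_init);
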
On the left are cycle counts on a Core i7 6700HQ using the AVX-2
codepath, comparing this implementation ("new") to the implementation in
the current crypto API ("old"). On the right are benchmarks on a Xeon
Gold 5120 using the AVX-512 codepath. The new implementation is faster
on all benchmarks.
AVX-2 AVX-512
--------- -----------
size old new size old new
---- ---- ---- ---- ---- ----
0 70 68 0 74 70
16 92 90 16 96 92
32 134 104 32 136 106
48 172 120 48 184 124
64 218 136 64 218 138
80 254 158 80 260 160
96 298 174 96 300 176
112 342 192 112 342 194
128 388 212 128 384 212
144 428 228 144 420 226
160 466 246 160 464 248
176 510 264 176 504 264
192 550 282 192 544 282
208 594 302 208 582 300
224 628 316 224 624 318
240 676 334 240 662 338
256 716 354 256 708 358
272 764 374 272 748 372
288 802 352 288 788 358
304 420 366 304 422 370
320 428 360 320 432 364
336 484 378 336 486 380
352 426 384 352 434 390
368 478 400 368 480 408
384 488 394 384 490 398
400 542 408 400 542 412
416 486 416 416 492 426
432 534 430 432 538 436
448 544 422 448 546 432
464 600 438 464 600 448
480 540 448 480 548 456
496 594 464 496 594 476
512 602 456 512 606 470
528 656 476 528 656 480
544 600 480 544 606 498
560 650 494 560 652 512
576 664 490 576 662 508
592 714 508 592 716 522
608 656 514 608 664 538
624 708 532 624 710 552
640 716 524 640 720 516
656 770 536 656 772 526
672 716 548 672 722 544
688 770 562 688 768 556
704 774 552 704 778 556
720 826 568 720 832 568
736 768 574 736 780 584
752 822 592 752 826 600
768 830 584 768 836 560
784 884 602 784 888 572
800 828 610 800 838 588
816 884 628 816 884 604
832 888 618 832 894 598
848 942 632 848 946 612
864 884 644 864 896 628
880 936 660 880 942 644
896 948 652 896 952 608
912 1000 664 912 1004 616
928 942 676 928 954 634
944 994 690 944 1000 646
960 1002 680 960 1008 646
976 1054 694 976 1062 658
992 1002 706 992 1012 674
1008 1052 720 1008 1058 690
This commit wires in the prior implementation from Andy, and makes the
following changes so that it is suitable for kernel land:
- Some cosmetic and structural changes, like renaming labels to
.Lname, adjusting how constants are declared, and following other Linux
conventions, as well as making the code easier for us to maintain going
forward.
- CPU feature checking is done in C by the glue code; a sketch of what
such a dispatcher can look like follows this list.
- We avoid jumping into the middle of functions, to appease objtool,
and instead parameterize shared code.
- We maintain frame pointers so that stack traces make sense.
- We remove the dependency on the perl xlate code, which rewrites the
output for assemblers we don't care about.
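As referenced above, runtime selection between the entry points now happens
in C rather than in the assembly. The following is a simplified sketch of
what such a dispatcher can look like — the poly1305_use_* flags and the
dispatch helper are hypothetical, while the asmlinkage prototypes mirror the
renamed symbols in this file:

#include <linux/linkage.h>
#include <linux/types.h>
#include <asm/fpu/api.h>
#include <crypto/internal/simd.h>

/* Renamed assembly entry points emitted by this script. */
asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp,
                                       const size_t len, const u32 padbit);
asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp,
                                    const size_t len, const u32 padbit);
asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp,
                                     const size_t len, const u32 padbit);
asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp,
                                       const size_t len, const u32 padbit);

/* Set once at init time from boot_cpu_has(); names are illustrative. */
static bool poly1305_use_avx, poly1305_use_avx2, poly1305_use_avx512;

static void poly1305_blocks_dispatch(void *ctx, const u8 *inp, size_t len,
                                     u32 padbit)
{
        /* The scalar code needs no FPU state and works in any context. */
        if (!poly1305_use_avx || !crypto_simd_usable()) {
                poly1305_blocks_x86_64(ctx, inp, len, padbit);
                return;
        }

        kernel_fpu_begin();
        if (poly1305_use_avx512)
                poly1305_blocks_avx512(ctx, inp, len, padbit);
        else if (poly1305_use_avx2)
                poly1305_blocks_avx2(ctx, inp, len, padbit);
        else
                poly1305_blocks_avx(ctx, inp, len, padbit);
        kernel_fpu_end();
}

In the non-kernel build the assembly still performs this selection itself via
OPENSSL_ia32cap_P, which is why those paths remain guarded by !$kernel in the
diff below.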
Importantly, none of our changes affect the arithmetic or core code; they
only account for the differing environment of kernel space.
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Co-developed-by: Samuel Neves <sneves@dei.uc.pt>
Signed-off-by: Samuel Neves <sneves@dei.uc.pt>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/x86/crypto/poly1305-x86_64-cryptogams.pl')
-rw-r--r--   arch/x86/crypto/poly1305-x86_64-cryptogams.pl   682
1 file changed, 394 insertions, 288 deletions
diff --git a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl index 342ad7f18aa7..7a6b5380a46f 100644 --- a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl +++ b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl @@ -1,11 +1,14 @@ -#! /usr/bin/env perl -# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved. +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause # -# Licensed under the OpenSSL license (the "License"). You may not use -# this file except in compliance with the License. You can obtain a copy -# in the file LICENSE in the source distribution or at -# https://www.openssl.org/source/license.html - +# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved. +# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. +# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved. +# +# This code is taken from the OpenSSL project but the author, Andy Polyakov, +# has relicensed it under the licenses specified in the SPDX header above. +# The original headers, including the original license headers, are +# included below for completeness. # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -32,7 +35,7 @@ # Skylake-X system performance. Since we are likely to suppress # AVX512F capability flag [at least on Skylake-X], conversion serves # as kind of "investment protection". Note that next *lake processor, -# Cannolake, has AVX512IFMA code path to execute... +# Cannonlake, has AVX512IFMA code path to execute... # # Numbers are cycles per processed byte with poly1305_blocks alone, # measured with rdtsc at fixed clock frequency. @@ -68,39 +71,114 @@ $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` - =~ /GNU assembler version ([2-9]\.[0-9]+)/) { - $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26); +$kernel=0; $kernel=1 if (!$flavour && !$output); + +if (!$kernel) { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or + die "can't locate x86_64-xlate.pl"; + + open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; + *STDOUT=*OUT; + + if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25); + } + + if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { + $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12); + $avx += 1 if ($1==2.11 && $2>=8); + } + + if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=11); + } + + if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { + $avx = ($2>=3.0) + ($2>3.0); + } +} else { + $avx = 4; # The kernel uses ifdefs for this. 
} -if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && - `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { - $avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12); - $avx += 2 if ($1==2.11 && $2>=8); +sub declare_function() { + my ($name, $align, $nargs) = @_; + if($kernel) { + $code .= ".align $align\n"; + $code .= "SYM_FUNC_START($name)\n"; + $code .= ".L$name:\n"; + } else { + $code .= ".globl $name\n"; + $code .= ".type $name,\@function,$nargs\n"; + $code .= ".align $align\n"; + $code .= "$name:\n"; + } } -if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && - `ml64 2>&1` =~ /Version ([0-9]+)\./) { - $avx = ($1>=10) + ($1>=12); +sub end_function() { + my ($name) = @_; + if($kernel) { + $code .= "SYM_FUNC_END($name)\n"; + } else { + $code .= ".size $name,.-$name\n"; + } } -if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { - $avx = ($2>=3.0) + ($2>3.0); -} +$code.=<<___ if $kernel; +#include <linux/linkage.h> +___ + +if ($avx) { +$code.=<<___ if $kernel; +.section .rodata +___ +$code.=<<___; +.align 64 +.Lconst: +.Lmask24: +.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 +.L129: +.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0 +.Lmask26: +.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 +.Lpermd_avx2: +.long 2,2,2,3,2,0,2,1 +.Lpermd_avx512: +.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 + +.L2_44_inp_permd: +.long 0,1,1,2,2,3,7,7 +.L2_44_inp_shift: +.quad 0,12,24,64 +.L2_44_mask: +.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff +.L2_44_shift_rgt: +.quad 44,44,42,64 +.L2_44_shift_lft: +.quad 8,8,10,64 -open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; -*STDOUT=*OUT; +.align 64 +.Lx_mask44: +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff +.Lx_mask42: +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff +___ +} +$code.=<<___ if (!$kernel); +.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>" +.align 16 +___ my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx"); my ($mac,$nonce)=($inp,$len); # *_emit arguments -my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13)); -my ($h0,$h1,$h2)=("%r14","%rbx","%rbp"); +my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13"); +my ($h0,$h1,$h2)=("%r14","%rbx","%r10"); sub poly1305_iteration { # input: copy of $r1 in %rax, $h0-$h2, $r0-$r1 @@ -155,19 +233,19 @@ ___ $code.=<<___; .text - +___ +$code.=<<___ if (!$kernel); .extern OPENSSL_ia32cap_P -.globl poly1305_init -.hidden poly1305_init -.globl poly1305_blocks -.hidden poly1305_blocks -.globl poly1305_emit -.hidden poly1305_emit - -.type poly1305_init,\@function,3 -.align 32 -poly1305_init: +.globl poly1305_init_x86_64 +.hidden poly1305_init_x86_64 +.globl poly1305_blocks_x86_64 +.hidden poly1305_blocks_x86_64 +.globl poly1305_emit_x86_64 +.hidden poly1305_emit_x86_64 +___ +&declare_function("poly1305_init_x86_64", 32, 3); +$code.=<<___; xor %rax,%rax mov %rax,0($ctx) # initialize hash value mov %rax,8($ctx) @@ -175,11 +253,12 @@ poly1305_init: cmp \$0,$inp je .Lno_key - - lea poly1305_blocks(%rip),%r10 - lea poly1305_emit(%rip),%r11 ___ -$code.=<<___ if ($avx); +$code.=<<___ if (!$kernel); + lea poly1305_blocks_x86_64(%rip),%r10 + lea poly1305_emit_x86_64(%rip),%r11 +___ +$code.=<<___ if (!$kernel && $avx); mov OPENSSL_ia32cap_P+4(%rip),%r9 lea poly1305_blocks_avx(%rip),%rax lea poly1305_emit_avx(%rip),%rcx 
@@ -187,12 +266,12 @@ $code.=<<___ if ($avx); cmovc %rax,%r10 cmovc %rcx,%r11 ___ -$code.=<<___ if ($avx>1); +$code.=<<___ if (!$kernel && $avx>1); lea poly1305_blocks_avx2(%rip),%rax bt \$`5+32`,%r9 # AVX2? cmovc %rax,%r10 ___ -$code.=<<___ if ($avx>3); +$code.=<<___ if (!$kernel && $avx>3); mov \$`(1<<31|1<<21|1<<16)`,%rax shr \$32,%r9 and %rax,%r9 @@ -207,11 +286,11 @@ $code.=<<___; mov %rax,24($ctx) mov %rcx,32($ctx) ___ -$code.=<<___ if ($flavour !~ /elf32/); +$code.=<<___ if (!$kernel && $flavour !~ /elf32/); mov %r10,0(%rdx) mov %r11,8(%rdx) ___ -$code.=<<___ if ($flavour =~ /elf32/); +$code.=<<___ if (!$kernel && $flavour =~ /elf32/); mov %r10d,0(%rdx) mov %r11d,4(%rdx) ___ @@ -219,11 +298,11 @@ $code.=<<___; mov \$1,%eax .Lno_key: ret -.size poly1305_init,.-poly1305_init +___ +&end_function("poly1305_init_x86_64"); -.type poly1305_blocks,\@function,4 -.align 32 -poly1305_blocks: +&declare_function("poly1305_blocks_x86_64", 32, 4); +$code.=<<___; .cfi_startproc .Lblocks: shr \$4,$len @@ -231,8 +310,6 @@ poly1305_blocks: push %rbx .cfi_push %rbx - push %rbp -.cfi_push %rbp push %r12 .cfi_push %r12 push %r13 @@ -241,6 +318,8 @@ poly1305_blocks: .cfi_push %r14 push %r15 .cfi_push %r15 + push $ctx +.cfi_push $ctx .Lblocks_body: mov $len,%r15 # reassign $len @@ -265,26 +344,29 @@ poly1305_blocks: lea 16($inp),$inp adc $padbit,$h2 ___ + &poly1305_iteration(); + $code.=<<___; mov $r1,%rax dec %r15 # len-=16 jnz .Loop + mov 0(%rsp),$ctx +.cfi_restore $ctx + mov $h0,0($ctx) # store hash value mov $h1,8($ctx) mov $h2,16($ctx) - mov 0(%rsp),%r15 + mov 8(%rsp),%r15 .cfi_restore %r15 - mov 8(%rsp),%r14 + mov 16(%rsp),%r14 .cfi_restore %r14 - mov 16(%rsp),%r13 + mov 24(%rsp),%r13 .cfi_restore %r13 - mov 24(%rsp),%r12 + mov 32(%rsp),%r12 .cfi_restore %r12 - mov 32(%rsp),%rbp -.cfi_restore %rbp mov 40(%rsp),%rbx .cfi_restore %rbx lea 48(%rsp),%rsp @@ -293,11 +375,11 @@ $code.=<<___; .Lblocks_epilogue: ret .cfi_endproc -.size poly1305_blocks,.-poly1305_blocks +___ +&end_function("poly1305_blocks_x86_64"); -.type poly1305_emit,\@function,3 -.align 32 -poly1305_emit: +&declare_function("poly1305_emit_x86_64", 32, 3); +$code.=<<___; .Lemit: mov 0($ctx),%r8 # load hash value mov 8($ctx),%r9 @@ -318,10 +400,14 @@ poly1305_emit: mov %rcx,8($mac) ret -.size poly1305_emit,.-poly1305_emit ___ +&end_function("poly1305_emit_x86_64"); if ($avx) { +if($kernel) { + $code .= "#ifdef CONFIG_AS_AVX\n"; +} + ######################################################################## # Layout of opaque area is following. 
# @@ -342,15 +428,19 @@ $code.=<<___; .type __poly1305_block,\@abi-omnipotent .align 32 __poly1305_block: + push $ctx ___ &poly1305_iteration(); $code.=<<___; + pop $ctx ret .size __poly1305_block,.-__poly1305_block .type __poly1305_init_avx,\@abi-omnipotent .align 32 __poly1305_init_avx: + push %rbp + mov %rsp,%rbp mov $r0,$h0 mov $r1,$h1 xor $h2,$h2 @@ -507,12 +597,13 @@ __poly1305_init_avx: mov $d1#d,`16*8+8-64`($ctx) lea -48-64($ctx),$ctx # size [de-]optimization + pop %rbp ret .size __poly1305_init_avx,.-__poly1305_init_avx +___ -.type poly1305_blocks_avx,\@function,4 -.align 32 -poly1305_blocks_avx: +&declare_function("poly1305_blocks_avx", 32, 4); +$code.=<<___; .cfi_startproc mov 20($ctx),%r8d # is_base2_26 cmp \$128,$len @@ -532,10 +623,11 @@ poly1305_blocks_avx: test \$31,$len jz .Leven_avx - push %rbx -.cfi_push %rbx push %rbp .cfi_push %rbp + mov %rsp,%rbp + push %rbx +.cfi_push %rbx push %r12 .cfi_push %r12 push %r13 @@ -645,20 +737,18 @@ poly1305_blocks_avx: mov $h2#d,16($ctx) .align 16 .Ldone_avx: - mov 0(%rsp),%r15 + pop %r15 .cfi_restore %r15 - mov 8(%rsp),%r14 + pop %r14 .cfi_restore %r14 - mov 16(%rsp),%r13 + pop %r13 .cfi_restore %r13 - mov 24(%rsp),%r12 + pop %r12 .cfi_restore %r12 - mov 32(%rsp),%rbp -.cfi_restore %rbp - mov 40(%rsp),%rbx + pop %rbx .cfi_restore %rbx - lea 48(%rsp),%rsp -.cfi_adjust_cfa_offset -48 + pop %rbp +.cfi_restore %rbp .Lno_data_avx: .Lblocks_avx_epilogue: ret @@ -667,10 +757,11 @@ poly1305_blocks_avx: .align 32 .Lbase2_64_avx: .cfi_startproc - push %rbx -.cfi_push %rbx push %rbp .cfi_push %rbp + mov %rsp,%rbp + push %rbx +.cfi_push %rbx push %r12 .cfi_push %r12 push %r13 @@ -736,22 +827,18 @@ poly1305_blocks_avx: .Lproceed_avx: mov %r15,$len - - mov 0(%rsp),%r15 + pop %r15 .cfi_restore %r15 - mov 8(%rsp),%r14 + pop %r14 .cfi_restore %r14 - mov 16(%rsp),%r13 + pop %r13 .cfi_restore %r13 - mov 24(%rsp),%r12 + pop %r12 .cfi_restore %r12 - mov 32(%rsp),%rbp -.cfi_restore %rbp - mov 40(%rsp),%rbx + pop %rbx .cfi_restore %rbx - lea 48(%rsp),%rax - lea 48(%rsp),%rsp -.cfi_adjust_cfa_offset -48 + pop %rbp +.cfi_restore %rbp .Lbase2_64_avx_epilogue: jmp .Ldo_avx .cfi_endproc @@ -768,8 +855,11 @@ poly1305_blocks_avx: .Ldo_avx: ___ $code.=<<___ if (!$win64); + lea 8(%rsp),%r10 +.cfi_def_cfa_register %r10 + and \$-32,%rsp + sub \$-8,%rsp lea -0x58(%rsp),%r11 -.cfi_def_cfa %r11,0x60 sub \$0x178,%rsp ___ $code.=<<___ if ($win64); @@ -1361,18 +1451,18 @@ $code.=<<___ if ($win64); .Ldo_avx_epilogue: ___ $code.=<<___ if (!$win64); - lea 0x58(%r11),%rsp -.cfi_def_cfa %rsp,8 + lea -8(%r10),%rsp +.cfi_def_cfa_register %rsp ___ $code.=<<___; vzeroupper ret .cfi_endproc -.size poly1305_blocks_avx,.-poly1305_blocks_avx +___ +&end_function("poly1305_blocks_avx"); -.type poly1305_emit_avx,\@function,3 -.align 32 -poly1305_emit_avx: +&declare_function("poly1305_emit_avx", 32, 3); +$code.=<<___; cmpl \$0,20($ctx) # is_base2_26? je .Lemit @@ -1423,41 +1513,51 @@ poly1305_emit_avx: mov %rcx,8($mac) ret -.size poly1305_emit_avx,.-poly1305_emit_avx ___ +&end_function("poly1305_emit_avx"); + +if ($kernel) { + $code .= "#endif\n"; +} if ($avx>1) { + +if ($kernel) { + $code .= "#ifdef CONFIG_AS_AVX2\n"; +} + my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) = map("%ymm$_",(0..15)); my $S4=$MASK; +sub poly1305_blocks_avxN { + my ($avx512) = @_; + my $suffix = $avx512 ? 
"_avx512" : ""; $code.=<<___; -.type poly1305_blocks_avx2,\@function,4 -.align 32 -poly1305_blocks_avx2: .cfi_startproc mov 20($ctx),%r8d # is_base2_26 cmp \$128,$len - jae .Lblocks_avx2 + jae .Lblocks_avx2$suffix test %r8d,%r8d jz .Lblocks -.Lblocks_avx2: +.Lblocks_avx2$suffix: and \$-16,$len - jz .Lno_data_avx2 + jz .Lno_data_avx2$suffix vzeroupper test %r8d,%r8d - jz .Lbase2_64_avx2 + jz .Lbase2_64_avx2$suffix test \$63,$len - jz .Leven_avx2 + jz .Leven_avx2$suffix - push %rbx -.cfi_push %rbx push %rbp .cfi_push %rbp + mov %rsp,%rbp + push %rbx +.cfi_push %rbx push %r12 .cfi_push %r12 push %r13 @@ -1466,7 +1566,7 @@ poly1305_blocks_avx2: .cfi_push %r14 push %r15 .cfi_push %r15 -.Lblocks_avx2_body: +.Lblocks_avx2_body$suffix: mov $len,%r15 # reassign $len @@ -1513,7 +1613,7 @@ poly1305_blocks_avx2: shr \$2,$s1 add $r1,$s1 # s1 = r1 + (r1 >> 2) -.Lbase2_26_pre_avx2: +.Lbase2_26_pre_avx2$suffix: add 0($inp),$h0 # accumulate input adc 8($inp),$h1 lea 16($inp),$inp @@ -1524,10 +1624,10 @@ poly1305_blocks_avx2: mov $r1,%rax test \$63,%r15 - jnz .Lbase2_26_pre_avx2 + jnz .Lbase2_26_pre_avx2$suffix test $padbit,$padbit # if $padbit is zero, - jz .Lstore_base2_64_avx2 # store hash in base 2^64 format + jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format ################################# base 2^64 -> base 2^26 mov $h0,%rax @@ -1548,57 +1648,56 @@ poly1305_blocks_avx2: or $r1,$h2 # h[4] test %r15,%r15 - jz .Lstore_base2_26_avx2 + jz .Lstore_base2_26_avx2$suffix vmovd %rax#d,%x#$H0 vmovd %rdx#d,%x#$H1 vmovd $h0#d,%x#$H2 vmovd $h1#d,%x#$H3 vmovd $h2#d,%x#$H4 - jmp .Lproceed_avx2 + jmp .Lproceed_avx2$suffix .align 32 -.Lstore_base2_64_avx2: +.Lstore_base2_64_avx2$suffix: mov $h0,0($ctx) mov $h1,8($ctx) mov $h2,16($ctx) # note that is_base2_26 is zeroed - jmp .Ldone_avx2 + jmp .Ldone_avx2$suffix .align 16 -.Lstore_base2_26_avx2: +.Lstore_base2_26_avx2$suffix: mov %rax#d,0($ctx) # store hash value base 2^26 mov %rdx#d,4($ctx) mov $h0#d,8($ctx) mov $h1#d,12($ctx) mov $h2#d,16($ctx) .align 16 -.Ldone_avx2: - mov 0(%rsp),%r15 +.Ldone_avx2$suffix: + pop %r15 .cfi_restore %r15 - mov 8(%rsp),%r14 + pop %r14 .cfi_restore %r14 - mov 16(%rsp),%r13 + pop %r13 .cfi_restore %r13 - mov 24(%rsp),%r12 + pop %r12 .cfi_restore %r12 - mov 32(%rsp),%rbp -.cfi_restore %rbp - mov 40(%rsp),%rbx + pop %rbx .cfi_restore %rbx - lea 48(%rsp),%rsp -.cfi_adjust_cfa_offset -48 -.Lno_data_avx2: -.Lblocks_avx2_epilogue: + pop %rbp +.cfi_restore %rbp +.Lno_data_avx2$suffix: +.Lblocks_avx2_epilogue$suffix: ret .cfi_endproc .align 32 -.Lbase2_64_avx2: +.Lbase2_64_avx2$suffix: .cfi_startproc - push %rbx -.cfi_push %rbx push %rbp .cfi_push %rbp + mov %rsp,%rbp + push %rbx +.cfi_push %rbx push %r12 .cfi_push %r12 push %r13 @@ -1607,7 +1706,7 @@ poly1305_blocks_avx2: .cfi_push %r14 push %r15 .cfi_push %r15 -.Lbase2_64_avx2_body: +.Lbase2_64_avx2_body$suffix: mov $len,%r15 # reassign $len @@ -1624,9 +1723,9 @@ poly1305_blocks_avx2: add $r1,$s1 # s1 = r1 + (r1 >> 2) test \$63,$len - jz .Linit_avx2 + jz .Linit_avx2$suffix -.Lbase2_64_pre_avx2: +.Lbase2_64_pre_avx2$suffix: add 0($inp),$h0 # accumulate input adc 8($inp),$h1 lea 16($inp),$inp @@ -1637,9 +1736,9 @@ poly1305_blocks_avx2: mov $r1,%rax test \$63,%r15 - jnz .Lbase2_64_pre_avx2 + jnz .Lbase2_64_pre_avx2$suffix -.Linit_avx2: +.Linit_avx2$suffix: ################################# base 2^64 -> base 2^26 mov $h0,%rax mov $h0,%rdx @@ -1667,69 +1766,77 @@ poly1305_blocks_avx2: call __poly1305_init_avx -.Lproceed_avx2: +.Lproceed_avx2$suffix: mov %r15,$len # restore $len - mov 
OPENSSL_ia32cap_P+8(%rip),%r10d +___ +$code.=<<___ if (!$kernel); + mov OPENSSL_ia32cap_P+8(%rip),%r9d mov \$`(1<<31|1<<30|1<<16)`,%r11d - - mov 0(%rsp),%r15 +___ +$code.=<<___; + pop %r15 .cfi_restore %r15 - mov 8(%rsp),%r14 + pop %r14 .cfi_restore %r14 - mov 16(%rsp),%r13 + pop %r13 .cfi_restore %r13 - mov 24(%rsp),%r12 + pop %r12 .cfi_restore %r12 - mov 32(%rsp),%rbp -.cfi_restore %rbp - mov 40(%rsp),%rbx + pop %rbx .cfi_restore %rbx - lea 48(%rsp),%rax - lea 48(%rsp),%rsp -.cfi_adjust_cfa_offset -48 -.Lbase2_64_avx2_epilogue: - jmp .Ldo_avx2 + pop %rbp +.cfi_restore %rbp +.Lbase2_64_avx2_epilogue$suffix: + jmp .Ldo_avx2$suffix .cfi_endproc .align 32 -.Leven_avx2: +.Leven_avx2$suffix: .cfi_startproc - mov OPENSSL_ia32cap_P+8(%rip),%r10d +___ +$code.=<<___ if (!$kernel); + mov OPENSSL_ia32cap_P+8(%rip),%r9d +___ +$code.=<<___; vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26 vmovd 4*1($ctx),%x#$H1 vmovd 4*2($ctx),%x#$H2 vmovd 4*3($ctx),%x#$H3 vmovd 4*4($ctx),%x#$H4 -.Ldo_avx2: +.Ldo_avx2$suffix: ___ -$code.=<<___ if ($avx>2); +$code.=<<___ if (!$kernel && $avx>2); cmp \$512,$len jb .Lskip_avx512 - and %r11d,%r10d - test \$`1<<16`,%r10d # check for AVX512F + and %r11d,%r9d + test \$`1<<16`,%r9d # check for AVX512F jnz .Lblocks_avx512 -.Lskip_avx512: +.Lskip_avx512$suffix: +___ +$code.=<<___ if ($avx > 2 && $avx512 && $kernel); + cmp \$512,$len + jae .Lblocks_avx512 ___ $code.=<<___ if (!$win64); - lea -8(%rsp),%r11 -.cfi_def_cfa %r11,16 + lea 8(%rsp),%r10 +.cfi_def_cfa_register %r10 sub \$0x128,%rsp ___ $code.=<<___ if ($win64); - lea -0xf8(%rsp),%r11 + lea 8(%rsp),%r10 sub \$0x1c8,%rsp - vmovdqa %xmm6,0x50(%r11) - vmovdqa %xmm7,0x60(%r11) - vmovdqa %xmm8,0x70(%r11) - vmovdqa %xmm9,0x80(%r11) - vmovdqa %xmm10,0x90(%r11) - vmovdqa %xmm11,0xa0(%r11) - vmovdqa %xmm12,0xb0(%r11) - vmovdqa %xmm13,0xc0(%r11) - vmovdqa %xmm14,0xd0(%r11) - vmovdqa %xmm15,0xe0(%r11) -.Ldo_avx2_body: + vmovdqa %xmm6,-0xb0(%r10) + vmovdqa %xmm7,-0xa0(%r10) + vmovdqa %xmm8,-0x90(%r10) + vmovdqa %xmm9,-0x80(%r10) + vmovdqa %xmm10,-0x70(%r10) + vmovdqa %xmm11,-0x60(%r10) + vmovdqa %xmm12,-0x50(%r10) + vmovdqa %xmm13,-0x40(%r10) + vmovdqa %xmm14,-0x30(%r10) + vmovdqa %xmm15,-0x20(%r10) +.Ldo_avx2_body$suffix: ___ $code.=<<___; lea .Lconst(%rip),%rcx @@ -1794,11 +1901,11 @@ $code.=<<___; vpaddq $H2,$T2,$H2 # accumulate input sub \$64,$len - jz .Ltail_avx2 - jmp .Loop_avx2 + jz .Ltail_avx2$suffix + jmp .Loop_avx2$suffix .align 32 -.Loop_avx2: +.Loop_avx2$suffix: ################################################################ # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4 # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3 @@ -1946,10 +2053,10 @@ $code.=<<___; vpor 32(%rcx),$T4,$T4 # padbit, yes, always sub \$64,$len - jnz .Loop_avx2 + jnz .Loop_avx2$suffix .byte 0x66,0x90 -.Ltail_avx2: +.Ltail_avx2$suffix: ################################################################ # while above multiplications were by r^4 in all lanes, in last # iteration we multiply least significant lane by r^4 and most @@ -2087,37 +2194,29 @@ $code.=<<___; vmovd %x#$H4,`4*4-48-64`($ctx) ___ $code.=<<___ if ($win64); - vmovdqa 0x50(%r11),%xmm6 - vmovdqa 0x60(%r11),%xmm7 - vmovdqa 0x70(%r11),%xmm8 - vmovdqa 0x80(%r11),%xmm9 - vmovdqa 0x90(%r11),%xmm10 - vmovdqa 0xa0(%r11),%xmm11 - vmovdqa 0xb0(%r11),%xmm12 - vmovdqa 0xc0(%r11),%xmm13 - vmovdqa 0xd0(%r11),%xmm14 - vmovdqa 0xe0(%r11),%xmm15 - lea 0xf8(%r11),%rsp -.Ldo_avx2_epilogue: + vmovdqa -0xb0(%r10),%xmm6 + vmovdqa -0xa0(%r10),%xmm7 + vmovdqa -0x90(%r10),%xmm8 + vmovdqa -0x80(%r10),%xmm9 + vmovdqa 
-0x70(%r10),%xmm10 + vmovdqa -0x60(%r10),%xmm11 + vmovdqa -0x50(%r10),%xmm12 + vmovdqa -0x40(%r10),%xmm13 + vmovdqa -0x30(%r10),%xmm14 + vmovdqa -0x20(%r10),%xmm15 + lea -8(%r10),%rsp +.Ldo_avx2_epilogue$suffix: ___ $code.=<<___ if (!$win64); - lea 8(%r11),%rsp -.cfi_def_cfa %rsp,8 + lea -8(%r10),%rsp +.cfi_def_cfa_register %rsp ___ $code.=<<___; vzeroupper ret .cfi_endproc -.size poly1305_blocks_avx2,.-poly1305_blocks_avx2 ___ -####################################################################### -if ($avx>2) { -# On entry we have input length divisible by 64. But since inner loop -# processes 128 bytes per iteration, cases when length is not divisible -# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this -# reason stack layout is kept identical to poly1305_blocks_avx2. If not -# for this tail, we wouldn't have to even allocate stack frame... - +if($avx > 2 && $avx512) { my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24)); my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29)); my $PADBIT="%zmm30"; @@ -2128,32 +2227,29 @@ map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4)); map(s/%y/%z/,($MASK)); $code.=<<___; -.type poly1305_blocks_avx512,\@function,4 -.align 32 -poly1305_blocks_avx512: .cfi_startproc .Lblocks_avx512: mov \$15,%eax kmovw %eax,%k2 ___ $code.=<<___ if (!$win64); - lea -8(%rsp),%r11 -.cfi_def_cfa %r11,16 + lea 8(%rsp),%r10 +.cfi_def_cfa_register %r10 sub \$0x128,%rsp ___ $code.=<<___ if ($win64); - lea -0xf8(%rsp),%r11 + lea 8(%rsp),%r10 sub \$0x1c8,%rsp - vmovdqa %xmm6,0x50(%r11) - vmovdqa %xmm7,0x60(%r11) - vmovdqa %xmm8,0x70(%r11) - vmovdqa %xmm9,0x80(%r11) - vmovdqa %xmm10,0x90(%r11) - vmovdqa %xmm11,0xa0(%r11) - vmovdqa %xmm12,0xb0(%r11) - vmovdqa %xmm13,0xc0(%r11) - vmovdqa %xmm14,0xd0(%r11) - vmovdqa %xmm15,0xe0(%r11) + vmovdqa %xmm6,-0xb0(%r10) + vmovdqa %xmm7,-0xa0(%r10) + vmovdqa %xmm8,-0x90(%r10) + vmovdqa %xmm9,-0x80(%r10) + vmovdqa %xmm10,-0x70(%r10) + vmovdqa %xmm11,-0x60(%r10) + vmovdqa %xmm12,-0x50(%r10) + vmovdqa %xmm13,-0x40(%r10) + vmovdqa %xmm14,-0x30(%r10) + vmovdqa %xmm15,-0x20(%r10) .Ldo_avx512_body: ___ $code.=<<___; @@ -2679,7 +2775,7 @@ $code.=<<___; lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2 add \$64,$len - jnz .Ltail_avx2 + jnz .Ltail_avx2$suffix vpsubq $T2,$H2,$H2 # undo input accumulation vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced @@ -2690,29 +2786,61 @@ $code.=<<___; vzeroall ___ $code.=<<___ if ($win64); - movdqa 0x50(%r11),%xmm6 - movdqa 0x60(%r11),%xmm7 - movdqa 0x70(%r11),%xmm8 - movdqa 0x80(%r11),%xmm9 - movdqa 0x90(%r11),%xmm10 - movdqa 0xa0(%r11),%xmm11 - movdqa 0xb0(%r11),%xmm12 - movdqa 0xc0(%r11),%xmm13 - movdqa 0xd0(%r11),%xmm14 - movdqa 0xe0(%r11),%xmm15 - lea 0xf8(%r11),%rsp + movdqa -0xb0(%r10),%xmm6 + movdqa -0xa0(%r10),%xmm7 + movdqa -0x90(%r10),%xmm8 + movdqa -0x80(%r10),%xmm9 + movdqa -0x70(%r10),%xmm10 + movdqa -0x60(%r10),%xmm11 + movdqa -0x50(%r10),%xmm12 + movdqa -0x40(%r10),%xmm13 + movdqa -0x30(%r10),%xmm14 + movdqa -0x20(%r10),%xmm15 + lea -8(%r10),%rsp .Ldo_avx512_epilogue: ___ $code.=<<___ if (!$win64); - lea 8(%r11),%rsp -.cfi_def_cfa %rsp,8 + lea -8(%r10),%rsp +.cfi_def_cfa_register %rsp ___ $code.=<<___; ret .cfi_endproc -.size poly1305_blocks_avx512,.-poly1305_blocks_avx512 ___ -if ($avx>3) { + +} + +} + +&declare_function("poly1305_blocks_avx2", 32, 4); +poly1305_blocks_avxN(0); +&end_function("poly1305_blocks_avx2"); + +if($kernel) { + $code .= "#endif\n"; +} + +####################################################################### +if ($avx>2) { +# On entry we have input 
length divisible by 64. But since inner loop +# processes 128 bytes per iteration, cases when length is not divisible +# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this +# reason stack layout is kept identical to poly1305_blocks_avx2. If not +# for this tail, we wouldn't have to even allocate stack frame... + +if($kernel) { + $code .= "#ifdef CONFIG_AS_AVX512\n"; +} + +&declare_function("poly1305_blocks_avx512", 32, 4); +poly1305_blocks_avxN(1); +&end_function("poly1305_blocks_avx512"); + +if ($kernel) { + $code .= "#endif\n"; +} + +if (!$kernel && $avx>3) { ######################################################################## # VPMADD52 version using 2^44 radix. # @@ -3753,45 +3881,9 @@ poly1305_emit_base2_44: .size poly1305_emit_base2_44,.-poly1305_emit_base2_44 ___ } } } -$code.=<<___; -.align 64 -.Lconst: -.Lmask24: -.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 -.L129: -.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0 -.Lmask26: -.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 -.Lpermd_avx2: -.long 2,2,2,3,2,0,2,1 -.Lpermd_avx512: -.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 - -.L2_44_inp_permd: -.long 0,1,1,2,2,3,7,7 -.L2_44_inp_shift: -.quad 0,12,24,64 -.L2_44_mask: -.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff -.L2_44_shift_rgt: -.quad 44,44,42,64 -.L2_44_shift_lft: -.quad 8,8,10,64 - -.align 64 -.Lx_mask44: -.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff -.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff -.Lx_mask42: -.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff -.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff -___ } -$code.=<<___; -.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>" -.align 16 -___ +if (!$kernel) { # chacha20-poly1305 helpers my ($out,$inp,$otp,$len)=$win64 ? 
("%rcx","%rdx","%r8", "%r9") : # Win64 order ("%rdi","%rsi","%rdx","%rcx"); # Unix order @@ -4038,17 +4130,17 @@ avx_handler: .section .pdata .align 4 - .rva .LSEH_begin_poly1305_init - .rva .LSEH_end_poly1305_init - .rva .LSEH_info_poly1305_init + .rva .LSEH_begin_poly1305_init_x86_64 + .rva .LSEH_end_poly1305_init_x86_64 + .rva .LSEH_info_poly1305_init_x86_64 - .rva .LSEH_begin_poly1305_blocks - .rva .LSEH_end_poly1305_blocks - .rva .LSEH_info_poly1305_blocks + .rva .LSEH_begin_poly1305_blocks_x86_64 + .rva .LSEH_end_poly1305_blocks_x86_64 + .rva .LSEH_info_poly1305_blocks_x86_64 - .rva .LSEH_begin_poly1305_emit - .rva .LSEH_end_poly1305_emit - .rva .LSEH_info_poly1305_emit + .rva .LSEH_begin_poly1305_emit_x86_64 + .rva .LSEH_end_poly1305_emit_x86_64 + .rva .LSEH_info_poly1305_emit_x86_64 ___ $code.=<<___ if ($avx); .rva .LSEH_begin_poly1305_blocks_avx @@ -4088,20 +4180,20 @@ ___ $code.=<<___; .section .xdata .align 8 -.LSEH_info_poly1305_init: +.LSEH_info_poly1305_init_x86_64: .byte 9,0,0,0 .rva se_handler - .rva .LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init + .rva .LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64 -.LSEH_info_poly1305_blocks: +.LSEH_info_poly1305_blocks_x86_64: .byte 9,0,0,0 .rva se_handler .rva .Lblocks_body,.Lblocks_epilogue -.LSEH_info_poly1305_emit: +.LSEH_info_poly1305_emit_x86_64: .byte 9,0,0,0 .rva se_handler - .rva .LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit + .rva .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64 ___ $code.=<<___ if ($avx); .LSEH_info_poly1305_blocks_avx_1: @@ -4148,12 +4240,26 @@ $code.=<<___ if ($avx>2); ___ } +open SELF,$0; +while(<SELF>) { + next if (/^#!/); + last if (!s/^#/\/\// and !/^$/); + print; +} +close SELF; + foreach (split('\n',$code)) { s/\`([^\`]*)\`/eval($1)/ge; s/%r([a-z]+)#d/%e$1/g; s/%r([0-9]+)#d/%r$1d/g; s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g; + if ($kernel) { + s/(^\.type.*),[0-9]+$/\1/; + s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/; + next if /^\.cfi.*/; + } + print $_,"\n"; } close STDOUT; |