1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
|
;------------------------------------------------------------------------------
;
; Copyright (c) 2022, Intel Corporation. All rights reserved.<BR>
; SPDX-License-Identifier: BSD-2-Clause-Patent
;
; Abstract:
;
; Provide macros for register save/restore using SSE and AVX (XMM/YMM) registers
;
;------------------------------------------------------------------------------
;
; Define SSE and AVX instruction set
;
;
; Define SSE macros using SSE 4.1 instructions
;
; SXMMN - store a 64-bit general purpose register into one qword of an
;         XMM register (PINSRQ).
; args 1:XMM (destination), 2:IDX (qword index, masked with 3 to form the
;      immediate), 3:REG (64-bit source register)
; Modified: %1
;
%macro SXMMN 3
            pinsrq      %1, %3, (%2 & 3)
%endmacro
;
; LXMMN - load one qword of an XMM register into a 64-bit general purpose
;         register (PEXTRQ).
; args 1:XMM (source), 2:REG (64-bit destination register), 3:IDX (qword
;      index, masked with 3 to form the immediate)
; Modified: %2
;
%macro LXMMN 3
            pextrq      %2, %1, (%3 & 3)
%endmacro
;
; Define AVX macros using AVX instructions
;
; SYMMN - save an XMM register into one 128-bit half of a YMM register
;         (VINSERTF128). The YMM register is both source and destination,
;         so its other half is carried over unchanged.
; args 1:YMM, 2:IDX (0 - lower 128bits, 1 - upper 128bits), 3:XMM
; Modified: %1
;
%macro SYMMN 3
            vinsertf128 %1, %1, %3, %2
%endmacro
;
; LYMMN - restore an XMM register from one 128-bit half of a YMM register
;         (VEXTRACTF128).
; args 1:YMM, 2:XMM, 3:IDX (0 - lower 128bits, 1 - upper 128bits)
; Modified: %2
;
%macro LYMMN 3
            vextractf128 %2, %1, %3
%endmacro
;
; SAVE_REGS - save RBP, RBX, RSI, RDI and RSP into the upper halves of YMM registers.
;   YMM7[128:191] = RBP, YMM7[192:255] = RBX
;   YMM8[128:191] = RSI, YMM8[192:255] = RDI
;   RSP is stored by SAVE_RSP (upper half of YMM6)
; Modified: XMM5, YMM6, YMM7 and YMM8
;
%macro SAVE_REGS 0
            SXMMN       xmm5, 0, rbp            ; XMM5[0:63]   = RBP
            SXMMN       xmm5, 1, rbx            ; XMM5[64:127] = RBX
            SYMMN       ymm7, 1, xmm5           ; YMM7 upper half = XMM5

            SXMMN       xmm5, 0, rsi            ; XMM5[0:63]   = RSI
            SXMMN       xmm5, 1, rdi            ; XMM5[64:127] = RDI
            SYMMN       ymm8, 1, xmm5           ; YMM8 upper half = XMM5

            SAVE_RSP
%endmacro
;
; LOAD_REGS - restore RBP, RBX, RSI, RDI and RSP from the upper halves of YMM registers.
;   RBP = YMM7[128:191], RBX = YMM7[192:255]
;   RSI = YMM8[128:191], RDI = YMM8[192:255]
;   RSP is reloaded by LOAD_RSP (upper half of YMM6)
; Modified: XMM5, RBP, RBX, RSI, RDI and RSP
;
%macro LOAD_REGS 0
            LYMMN       ymm7, xmm5, 1           ; XMM5 = YMM7 upper half
            LXMMN       xmm5, rbp, 0            ; RBP  = XMM5[0:63]
            LXMMN       xmm5, rbx, 1            ; RBX  = XMM5[64:127]

            LYMMN       ymm8, xmm5, 1           ; XMM5 = YMM8 upper half
            LXMMN       xmm5, rsi, 0            ; RSI  = XMM5[0:63]
            LXMMN       xmm5, rdi, 1            ; RDI  = XMM5[64:127]

            LOAD_RSP
%endmacro
;
; LOAD_RBP - restore RBP from YMM7[128:191]
; Modified: XMM5 and RBP
;
%macro LOAD_RBP 0
            LYMMN       ymm7, xmm5, 1           ; XMM5 = YMM7 upper half
            movq        rbp, xmm5               ; RBP  = XMM5[0:63]
%endmacro
;
; LOAD_RBX - restore RBX from YMM7[192:255]
; Modified: XMM5 and RBX
;
%macro LOAD_RBX 0
            LYMMN       ymm7, xmm5, 1           ; XMM5 = YMM7 upper half
            LXMMN       xmm5, rbx, 1            ; RBX  = XMM5[64:127]
%endmacro
;
; Upper half of YMM6 to save/restore Time Stamp, RSP
;
;
; SAVE_TS - save Time Stamp to YMM6[192:255]
; The upper half of YMM6 is read, patched and written back, so the value
; held in YMM6[128:191] is kept.
; arg 1:general purpose register which holds time stamp
; Modified: XMM5 and YMM6
;
%macro SAVE_TS 1
            LYMMN       ymm6, xmm5, 1           ; XMM5 = YMM6 upper half
            SXMMN       xmm5, 1, %1             ; XMM5[64:127] = time stamp
            SYMMN       ymm6, 1, xmm5           ; write the half back
%endmacro
;
; LOAD_TS - restore Time Stamp from YMM6[192:255]
; arg 1:general purpose register where to save time stamp
; Modified: XMM5 and %1
;
%macro LOAD_TS 1
            LYMMN       ymm6, xmm5, 1           ; XMM5 = YMM6 upper half
            LXMMN       xmm5, %1, 1             ; %1   = XMM5[64:127]
%endmacro
;
; SAVE_RSP - save RSP to YMM6[128:191]
; The upper half of YMM6 is read, patched and written back, so the value
; held in YMM6[192:255] is kept.
; Modified: XMM5 and YMM6
;
%macro SAVE_RSP 0
            LYMMN       ymm6, xmm5, 1           ; XMM5 = YMM6 upper half
            SXMMN       xmm5, 0, rsp            ; XMM5[0:63] = RSP
            SYMMN       ymm6, 1, xmm5           ; write the half back
%endmacro
;
; LOAD_RSP - restore RSP from YMM6[128:191]
; Modified: XMM5 and RSP
;
%macro LOAD_RSP 0
            LYMMN       ymm6, xmm5, 1           ; XMM5 = YMM6 upper half
            movq        rsp, xmm5               ; RSP  = XMM5[0:63]
%endmacro
;
; Upper half of YMM9 to save/restore UCODE status, BFV address
;
;
; SAVE_UCODE_STATUS - save uCode status to YMM9[128:191]
; The status is written to qword 0 of the upper half of YMM9; the upper
; half is read, patched and written back, so YMM9[192:255] is kept.
; arg 1:general purpose register which holds uCode status
; Modified: XMM5 and YMM9
;
%macro SAVE_UCODE_STATUS 1
            LYMMN       ymm9, xmm5, 1           ; XMM5 = YMM9 upper half
            SXMMN       xmm5, 0, %1             ; XMM5[0:63] = uCode status
            SYMMN       ymm9, 1, xmm5           ; write the half back
%endmacro
;
; LOAD_UCODE_STATUS - restore uCode status from YMM9[128:191]
; Reads qword 0 of the upper half of YMM9, the slot SAVE_UCODE_STATUS writes.
; arg 1:general purpose register where to save uCode status
; Modified: XMM5 and %1
;
%macro LOAD_UCODE_STATUS 1
            LYMMN       ymm9, xmm5, 1           ; XMM5 = YMM9 upper half
            movq        %1, xmm5                ; %1   = XMM5[0:63]
%endmacro
;
; SAVE_BFV - save BFV address to YMM9[192:255]
; The address is written to qword 1 of the upper half of YMM9; the upper
; half is read, patched and written back, so YMM9[128:191] is kept.
; arg 1:general purpose register which holds BFV address
; Modified: XMM5 and YMM9
;
%macro SAVE_BFV 1
            LYMMN       ymm9, xmm5, 1           ; XMM5 = YMM9 upper half
            SXMMN       xmm5, 1, %1             ; XMM5[64:127] = BFV address
            SYMMN       ymm9, 1, xmm5           ; write the half back
%endmacro
;
; LOAD_BFV - restore BFV address from YMM9[192:255]
; Reads qword 1 of the upper half of YMM9, the slot SAVE_BFV writes.
; arg 1:general purpose register where to save BFV address
; Modified: XMM5 and %1
;
%macro LOAD_BFV 1
            LYMMN       ymm9, xmm5, 1           ; XMM5 = YMM9 upper half
            LXMMN       xmm5, %1, 1             ; %1   = XMM5[64:127]
%endmacro
;
; Upper half of YMM10 to save/restore RCX, TemporaryRamSize
;
;
; SAVE_RCX - save RCX to YMM10[128:191]
; The upper half of YMM10 is read, patched and written back, so the value
; held in YMM10[192:255] is kept.
; Modified: XMM5 and YMM10
;
%macro SAVE_RCX 0
            LYMMN       ymm10, xmm5, 1          ; XMM5 = YMM10 upper half
            SXMMN       xmm5, 0, rcx            ; XMM5[0:63] = RCX
            SYMMN       ymm10, 1, xmm5          ; write the half back
%endmacro
;
; LOAD_RCX - restore RCX from YMM10[128:191]
; Modified: XMM5 and RCX
;
%macro LOAD_RCX 0
            LYMMN       ymm10, xmm5, 1          ; XMM5 = YMM10 upper half
            movq        rcx, xmm5               ; RCX  = XMM5[0:63]
%endmacro
;
; SAVE_TEMPORARY_RAM_SIZE - save TemporaryRamSize to YMM10[192:255]
; The upper half of YMM10 is read, patched and written back, so the value
; held in YMM10[128:191] is kept.
; arg 1:general purpose register which holds TemporaryRamSize
; Modified: XMM5 and YMM10[192:255]
;
%macro SAVE_TEMPORARY_RAM_SIZE 1
            LYMMN       ymm10, xmm5, 1          ; XMM5 = YMM10 upper half
            SXMMN       xmm5, 1, %1             ; XMM5[64:127] = TemporaryRamSize
            SYMMN       ymm10, 1, xmm5          ; write the half back
%endmacro
;
; LOAD_TEMPORARY_RAM_SIZE - restore TemporaryRamSize from YMM10[192:255]
; arg 1:general purpose register where to save TemporaryRamSize
; Modified: XMM5 and %1
;
%macro LOAD_TEMPORARY_RAM_SIZE 1
            LYMMN       ymm10, xmm5, 1          ; XMM5 = YMM10 upper half
            LXMMN       xmm5, %1, 1             ; %1   = XMM5[64:127]
%endmacro
;
; CALL_YMM - stack-less call: the return address is kept in YMM7[128:191]
; and control is transferred to the entry with a jump. RET_YMM jumps back
; to the address held in that slot.
; NOTE(review): YMM7[128:191] is also the slot SAVE_REGS uses for RBP, so a
; CALL_YMM issued after SAVE_REGS replaces the saved RBP - confirm callers
; account for this.
; arg 1:Entry
; Modified: RSI, XMM5, YMM7
;
%macro CALL_YMM 1
            mov         rsi, %%Resume           ; RSI = address following the jump
            LYMMN       ymm7, xmm5, 1           ; XMM5 = YMM7 upper half
            SXMMN       xmm5, 0, rsi            ; XMM5[0:63] = return address
            SYMMN       ymm7, 1, xmm5           ; write the half back
            mov         rsi, %1
            jmp         rsi                     ; enter the callee
%%Resume:
%endmacro
;
; RET_YMM - return to the address held in YMM7[128:191] (the slot written
; by CALL_YMM)
; Modified: RSI, XMM5
;
%macro RET_YMM 0
            LYMMN       ymm7, xmm5, 1           ; XMM5 = YMM7 upper half
            movq        rsi, xmm5               ; RSI  = XMM5[0:63]
            jmp         rsi
%endmacro
;
; ENABLE_SSE - initialize the floating point unit and enable SSE
;   1. finit, then load the FPU control word
;   2. spin forever unless CPUID.01H reports SSE (EDX bit 25) and
;      SSE 4.1 (ECX bit 19)
;   3. set CR4.OSFXSR (bit 9) and CR4.OSXMMEXCPT (bit 10)
;   4. load MXCSR
; RBX and RCX are preserved across CPUID through R11 and R10.
; Modified: RAX, RDX, R10, R11, CR4, MXCSR, FPU state and flags
;
; All labels are macro-local (%%) so that the macro can be expanded more
; than once and does not end the local-label scope of the code around the
; expansion.
;
%macro ENABLE_SSE 0
            ;
            ; Initialize floating point units
            ;
            jmp     %%NextAddress               ; skip the inline constants
            align   4
            ;
            ; Float control word initial value:
            ; all exceptions masked, double-precision, round-to-nearest
            ;
%%FpuControlWord:   DW      027Fh
            ;
            ; Multimedia-extensions control word:
            ; all exceptions masked, round-to-nearest
            ; NOTE(review): 01F80h sets only the exception mask bits (7-12);
            ; the flush-to-zero bit (15) is clear although this comment used
            ; to mention flush to zero - confirm the intended value.
            ;
%%MmxControlWord:   DQ      01F80h
%%SseError:
            ;
            ; Processor has to support SSE
            ;
            jmp     %%SseError
%%NextAddress:
            finit
            mov     rax, %%FpuControlWord
            fldcw   [rax]
            ;
            ; Use CpuId instruction (CPUID.01H:EDX.SSE[bit 25] = 1) to test
            ; whether the processor supports SSE instruction.
            ;
            ; Save RBX to R11
            ; Save RCX to R10
            ;
            mov     r11, rbx
            mov     r10, rcx
            mov     eax, 1
            cpuid
            bt      edx, 25
            jnc     %%SseError
            ;
            ; SSE 4.1 support
            ;
            bt      ecx, 19
            jnc     %%SseError
            ;
            ; Restore RBX from R11
            ; Restore RCX from R10
            ;
            mov     rbx, r11
            mov     rcx, r10
            ;
            ; Set OSFXSR bit (bit #9) & OSXMMEXCPT bit (bit #10)
            ;
            mov     rax, cr4
            or      rax, 00000600h
            mov     cr4, rax
            ;
            ; The processor should support SSE instruction and we can use
            ; ldmxcsr instruction
            ;
            mov     rax, %%MmxControlWord
            ldmxcsr [rax]
%endmacro
;
; ENABLE_AVX - enable AVX state
;   1. spin forever unless CPUID.01H reports AVX (ECX bit 28)
;   2. set CR4.OSXSAVE (bit 18) so that xgetbv/xsetbv can be used
;   3. set XCR0 bit 1 and bit 2 to enable SSE state and AVX state
; RBX and RCX are preserved across CPUID through R11 and R10.
; Modified: RAX, RDX, R10, R11, CR4, XCR0 and flags
;
; All labels are macro-local (%%) so that the macro can be expanded more
; than once and does not end the local-label scope of the code around the
; expansion.
;
%macro ENABLE_AVX 0
            ;
            ; Save RBX to R11
            ; Save RCX to R10
            ;
            mov     r11, rbx
            mov     r10, rcx
            mov     eax, 1
            cpuid
            and     ecx, 10000000h
            cmp     ecx, 10000000h              ; check AVX feature flag
            je      %%EnableAvx
%%AvxError:
            ;
            ; Processor has to support AVX
            ;
            jmp     %%AvxError
%%EnableAvx:
            ;
            ; Set OSXSAVE bit (bit #18) to enable xgetbv/xsetbv instruction
            ;
            mov     rax, cr4
            or      rax, 00040000h
            mov     cr4, rax
            xor     ecx, ecx                    ; index 0 (XCR0)
            xgetbv                              ; result in edx:eax
            or      eax, 00000006h              ; Set XCR0 bit #1 and bit #2 to enable SSE state and AVX state
            xsetbv
            ;
            ; Restore RBX from R11
            ; Restore RCX from R10
            ;
            mov     rbx, r11
            mov     rcx, r10
%endmacro
|