00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063 #include "pch.h"
00064
00065 #ifndef CRYPTOPP_IMPORTS
00066 #ifndef CRYPTOPP_GENERATE_X64_MASM
00067
00068 #include "rijndael.h"
00069 #include "misc.h"
00070 #include "cpu.h"
00071
00072 #ifdef __sun
00073 #include <alloca.h>
00074 #endif
00075
00076 #ifdef __MINGW32__
00077 #include <malloc.h>
00078 #endif
00079
00080 NAMESPACE_BEGIN(CryptoPP)
00081
// AES lookup tables.  Te (encryption) and Td (decryption) fold the S-box
// together with the MixColumns multiplication so that each quarter of a
// round is one table lookup plus an XOR.  The layout depends on whether
// unaligned loads are allowed:
//  - unaligned: word64 entries packing the column twice, offset by one
//    byte, so a misaligned 4-byte read picks out the rotation needed;
//  - aligned: four pre-rotated word32 tables of 256 entries each.
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
// The SSE2/MASM code indexes Te directly; it needs 16-byte alignment and
// two extra entries (zeroed in FillEncTable, used as a block of zeros).
namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
using namespace rdtable;
#else
static word64 Te[256];
#endif
static word64 Td[256];
#else
static word32 Te[256*4], Td[256*4];
#endif
// Lazily-filled flags; tables are built on first key setup.
// NOTE(review): 'volatile' alone does not make the lazy fill thread-safe;
// concurrent first-time key setups could race — confirm callers serialize.
static volatile bool s_TeFilled = false, s_TdFilled = false;

// Generic quarter round: split t into its four bytes and XOR each byte's
// table entry (via lookup macro L, table T) into a different output word.
#define QUARTER_ROUND(L, T, t, a, b, c, d) \
	a ^= L(T, 3, byte(t)); t >>= 8;\
	b ^= L(T, 2, byte(t)); t >>= 8;\
	c ^= L(T, 1, byte(t)); t >>= 8;\
	d ^= L(T, 0, t);

// Final encryption round (little-endian): byte 1 of each 8-byte Te entry
// is the raw S-box value Se[t] (see FillEncTable), so no MixColumns is
// applied; results are scattered into tempBlock in ShiftRows order.
#define QUARTER_ROUND_LE(t, a, b, c, d) \
	tempBlock[a] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
	tempBlock[b] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
	tempBlock[c] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
	tempBlock[d] = ((byte *)(Te+t))[1];

#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
// Final decryption round, unaligned variant: byte offset 0 (little-endian)
// or 7 (big-endian) of a Td entry is the entry's least-significant byte,
// which FillDecTable sets to the plain inverse S-box value Sd[t].
#define QUARTER_ROUND_LD(t, a, b, c, d) \
	tempBlock[a] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
	tempBlock[b] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
	tempBlock[c] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
	tempBlock[d] = ((byte *)(Td+t))[GetNativeByteOrder()*7];
#else
// Aligned variant: use the inverse S-box table directly.
#define QUARTER_ROUND_LD(t, a, b, c, d) \
	tempBlock[a] = Sd[byte(t)]; t >>= 8;\
	tempBlock[b] = Sd[byte(t)]; t >>= 8;\
	tempBlock[c] = Sd[byte(t)]; t >>= 8;\
	tempBlock[d] = Sd[t];
#endif

// Middle rounds of encryption/decryption.
#define QUARTER_ROUND_E(t, a, b, c, d) QUARTER_ROUND(TL_M, Te, t, a, b, c, d)
#define QUARTER_ROUND_D(t, a, b, c, d) QUARTER_ROUND(TL_M, Td, t, a, b, c, d)

// TL_F / TL_M are the table-lookup primitives; F appears to be the
// first-round variant (input words loaded in native byte order) and M the
// middle-round variant.  On little-endian with unaligned access, a word32
// is read at a byte offset 1..4 inside the 8-byte entry to obtain the
// rotation implied by index i.
#ifdef IS_LITTLE_ENDIAN
#define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, d, c, b, a)
#define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, d, c, b, a)
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
#define TL_F(T, i, x) (*(word32 *)((byte *)T + x*8 + (6-i)%4+1))
#define TL_M(T, i, x) (*(word32 *)((byte *)T + x*8 + (i+3)%4+1))
#else
#define TL_F(T, i, x) rotrFixed(T[x], (3-i)*8)
#define TL_M(T, i, x) T[i*256 + x]
#endif
#else
#define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, a, b, c, d)
#define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, a, b, c, d)
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
#define TL_F(T, i, x) (*(word32 *)((byte *)T + x*8 + (4-i)%4))
#define TL_M TL_F
#else
#define TL_F(T, i, x) rotrFixed(T[x], i*8)
#define TL_M(T, i, x) T[i*256 + x]
#endif
#endif

// GF(2^8) multiplication by 2, 4 and 8 modulo the AES polynomial 0x11b;
// the overflow bits are reduced by conditional XORs with multiples of 0x11b.
#define f2(x) ((x<<1)^(((x>>7)&1)*0x11b))
#define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
#define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))

// The remaining multipliers used by MixColumns (03) and InvMixColumns
// (09, 0b, 0d, 0e), built from the powers of two above.
#define f3(x) (f2(x) ^ x)
#define f9(x) (f8(x) ^ x)
#define fb(x) (f8(x) ^ f2(x) ^ x)
#define fd(x) (f8(x) ^ f4(x) ^ x)
#define fe(x) (f8(x) ^ f4(x) ^ f2(x))
00158
// Build the encryption table Te from the forward S-box Se.  Each entry
// combines Se with the MixColumns column (02,01,01,03)*Se[i].
void Rijndael::Base::FillEncTable()
{
	for (int i=0; i<256; i++)
	{
		byte x = Se[i];
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
		// Pack the column twice into a word64, offset by one byte, so a
		// misaligned 4-byte read at offsets 1..4 of the entry yields any
		// byte rotation (see TL_F/TL_M).  Byte 1 of the entry is the raw
		// Se[i], which QUARTER_ROUND_LE uses for the final round.
		word32 y = word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
		Te[i] = word64(y | f3(x))<<32 | y;
#else
		// Four separately pre-rotated 256-entry word32 tables.
		word32 y = f3(x) | word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
		for (int j=0; j<4; j++)
		{
			Te[i+j*256] = y;
			y = rotrFixed(y, 8);
		}
#endif
	}
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
	// Zero the two sentinel entries past the table; the SSE2 code path
	// uses Te+256 as a 16-byte block of zeros.
	Te[256] = Te[257] = 0;
#endif
	s_TeFilled = true;
}
00181
00182 void Rijndael::Base::FillDecTable()
00183 {
00184 for (int i=0; i<256; i++)
00185 {
00186 byte x = Sd[i];
00187 #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
00188 word32 y = word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
00189 Td[i] = word64(y | fb(x))<<32 | y | x;
00190 #else
00191 word32 y = fb(x) | word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;;
00192 for (int j=0; j<4; j++)
00193 {
00194 Td[i+j*256] = y;
00195 y = rotrFixed(y, 8);
00196 }
00197 #endif
00198 }
00199 s_TdFilled = true;
00200 }
00201
// Expand the user key into round keys (FIPS 197 key schedule).  For the
// inverse (decryption) direction, the schedule is converted in place to
// the equivalent-inverse-cipher form: round-key order reversed and
// InvMixColumns applied to the inner round keys.
void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, const NameValuePairs &)
{
	AssertValidKeyLength(keylen);

	// 10/12/14 rounds for 16/24/32-byte keys.
	m_rounds = keylen/4 + 6;
	m_key.New(4*(m_rounds+1));

	word32 temp, *rk = m_key;
	const word32 *rc = rcon;

	// Load the user key as big-endian words into the first Nk slots.
	GetUserKey(BIG_ENDIAN_ORDER, rk, keylen/4, userKey, keylen);

	while (true)
	{
		// g-function on the previous word: RotWord (via the rotated
		// GETBYTE picks), SubWord (Se), and the round constant.
		temp = rk[keylen/4-1];
		rk[keylen/4] = rk[0] ^
			(word32(Se[GETBYTE(temp, 2)]) << 24) ^
			(word32(Se[GETBYTE(temp, 1)]) << 16) ^
			(word32(Se[GETBYTE(temp, 0)]) << 8) ^
			Se[GETBYTE(temp, 3)] ^
			*(rc++);
		rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
		rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
		rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];

		// Stop once the schedule is full (4*(rounds+1) words).
		if (rk + keylen/4 + 4 == m_key.end())
			break;

		if (keylen == 24)
		{
			// 192-bit keys: two more plain XOR words per iteration.
			rk[10] = rk[ 4] ^ rk[ 9];
			rk[11] = rk[ 5] ^ rk[10];
		}
		else if (keylen == 32)
		{
			// 256-bit keys: extra SubWord (no rotation) every 8 words.
			temp = rk[11];
			rk[12] = rk[ 4] ^
				(word32(Se[GETBYTE(temp, 3)]) << 24) ^
				(word32(Se[GETBYTE(temp, 2)]) << 16) ^
				(word32(Se[GETBYTE(temp, 1)]) << 8) ^
				Se[GETBYTE(temp, 0)];
			rk[13] = rk[ 5] ^ rk[12];
			rk[14] = rk[ 6] ^ rk[13];
			rk[15] = rk[ 7] ^ rk[14];
		}
		rk += keylen/4;
	}

	if (IsForwardTransformation())
	{
		if (!s_TeFilled)
			FillEncTable();
	}
	else
	{
		if (!s_TdFilled)
			FillDecTable();

		unsigned int i, j;
		rk = m_key;

		// Reverse the order of the round keys (swap 4-word groups from
		// both ends toward the middle).
		for (i = 0, j = 4*m_rounds; i < j; i += 4, j -= 4) {
			temp = rk[i    ]; rk[i    ] = rk[j    ]; rk[j    ] = temp;
			temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
			temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
			temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
		}

// InvMixColumns of a round-key word, computed as Td[Se[...]] per byte
// (Td folds Sd with InvMixColumns, and Se cancels the Sd).
#define InverseMixColumn(x) x = TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])

		// Apply InvMixColumns to all round keys except the first and last.
		for (i = 1; i < m_rounds; i++) {
			rk += 4;
			InverseMixColumn(rk[0]);
			InverseMixColumn(rk[1]);
			InverseMixColumn(rk[2]);
			InverseMixColumn(rk[3]);
		}
	}

	// Convert the first and last round keys to the byte order used by the
	// block load/store code (ProcessAndXorBlock reads and writes blocks in
	// native order); the middle keys are consumed via the table lookups.
	ConditionalByteReverse(BIG_ENDIAN_ORDER, m_key.begin(), m_key.begin(), 16);
	ConditionalByteReverse(BIG_ENDIAN_ORDER, m_key + m_rounds*4, m_key + m_rounds*4, 16);
}
00286
// Encrypt one 16-byte block: out = E(in) ^ xorBlock (if xorBlock given).
void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
	// Prefer the assembly implementation when SSE2 is available.
	if (HasSSE2())
	{
		Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
		return;
	}
#endif

	typedef BlockGetAndPut<word32, NativeByteOrder> Block;

	word32 s0, s1, s2, s3, t0, t1, t2, t3;
	Block::Get(inBlock)(s0)(s1)(s2)(s3);

	// Initial AddRoundKey plus preload of the second round key.
	const word32 *rk = m_key;
	s0 ^= rk[0];
	s1 ^= rk[1];
	s2 ^= rk[2];
	s3 ^= rk[3];
	t0 = rk[4];
	t1 = rk[5];
	t2 = rk[6];
	t3 = rk[7];
	rk += 8;

	// Cache-timing countermeasure: touch one word in every cache line of
	// Te before any secret-dependent lookup, so all later lookups hit the
	// cache.  u stays 0 throughout (0 AND anything), so the ORs below do
	// not change the state -- they only keep the reads from being
	// optimized away.  The table is 2048 or 1024 bytes depending on the
	// entry width; Te[255] covers the tail the strided loop may miss.
	const int cacheLineSize = GetCacheLineSize();
	unsigned int i;
	word32 u = 0;
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
	for (i=0; i<2048; i+=cacheLineSize)
#else
	for (i=0; i<1024; i+=cacheLineSize)
#endif
		u &= *(const word32 *)(((const byte *)Te)+i);
	u &= Te[255];
	s0 |= u; s1 |= u; s2 |= u; s3 |= u;

	// First round (input words are in native byte order).
	QUARTER_ROUND_FE(s3, t0, t1, t2, t3)
	QUARTER_ROUND_FE(s2, t3, t0, t1, t2)
	QUARTER_ROUND_FE(s1, t2, t3, t0, t1)
	QUARTER_ROUND_FE(s0, t1, t2, t3, t0)

	// Middle rounds, two per iteration (s -> t -> s ping-pong).
	unsigned int r = m_rounds/2 - 1;
	do
	{
		s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];

		QUARTER_ROUND_E(t3, s0, s1, s2, s3)
		QUARTER_ROUND_E(t2, s3, s0, s1, s2)
		QUARTER_ROUND_E(t1, s2, s3, s0, s1)
		QUARTER_ROUND_E(t0, s1, s2, s3, s0)

		t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];

		QUARTER_ROUND_E(s3, t0, t1, t2, t3)
		QUARTER_ROUND_E(s2, t3, t0, t1, t2)
		QUARTER_ROUND_E(s1, t2, t3, t0, t1)
		QUARTER_ROUND_E(s0, t1, t2, t3, t0)

		rk += 8;
	} while (--r);

	// Final round: extract the raw S-box bytes stored inside the Te
	// entries (no MixColumns), scattered into tempBlock in ShiftRows
	// order, then AddRoundKey with the last round key.
	word32 tbw[4];
	byte *const tempBlock = (byte *)tbw;

	QUARTER_ROUND_LE(t2, 15, 2, 5, 8)
	QUARTER_ROUND_LE(t1, 11, 14, 1, 4)
	QUARTER_ROUND_LE(t0, 7, 10, 13, 0)
	QUARTER_ROUND_LE(t3, 3, 6, 9, 12)

	Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
}
00362
// Decrypt one 16-byte block: out = D(in) ^ xorBlock (if xorBlock given).
// Mirrors Enc::ProcessAndXorBlock, using Td and the inverted key schedule
// prepared in UncheckedSetKey.
void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
	typedef BlockGetAndPut<word32, NativeByteOrder> Block;

	word32 s0, s1, s2, s3, t0, t1, t2, t3;
	Block::Get(inBlock)(s0)(s1)(s2)(s3);

	// Initial AddRoundKey plus preload of the second round key.
	const word32 *rk = m_key;
	s0 ^= rk[0];
	s1 ^= rk[1];
	s2 ^= rk[2];
	s3 ^= rk[3];
	t0 = rk[4];
	t1 = rk[5];
	t2 = rk[6];
	t3 = rk[7];
	rk += 8;

	// Cache-timing countermeasure: pre-read every cache line of Td (see
	// the identical construct in Enc::ProcessAndXorBlock); u is always 0
	// so the ORs below leave the state unchanged.
	const int cacheLineSize = GetCacheLineSize();
	unsigned int i;
	word32 u = 0;
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
	for (i=0; i<2048; i+=cacheLineSize)
#else
	for (i=0; i<1024; i+=cacheLineSize)
#endif
		u &= *(const word32 *)(((const byte *)Td)+i);
	u &= Td[255];
	s0 |= u; s1 |= u; s2 |= u; s3 |= u;

	// First round (input words are in native byte order); note the
	// reversed output ordering versus encryption (inverse ShiftRows).
	QUARTER_ROUND_FD(s3, t2, t1, t0, t3)
	QUARTER_ROUND_FD(s2, t1, t0, t3, t2)
	QUARTER_ROUND_FD(s1, t0, t3, t2, t1)
	QUARTER_ROUND_FD(s0, t3, t2, t1, t0)

	// Middle rounds, two per iteration (s -> t -> s ping-pong).
	unsigned int r = m_rounds/2 - 1;
	do
	{
		s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];

		QUARTER_ROUND_D(t3, s2, s1, s0, s3)
		QUARTER_ROUND_D(t2, s1, s0, s3, s2)
		QUARTER_ROUND_D(t1, s0, s3, s2, s1)
		QUARTER_ROUND_D(t0, s3, s2, s1, s0)

		t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];

		QUARTER_ROUND_D(s3, t2, t1, t0, t3)
		QUARTER_ROUND_D(s2, t1, t0, t3, t2)
		QUARTER_ROUND_D(s1, t0, t3, t2, t1)
		QUARTER_ROUND_D(s0, t3, t2, t1, t0)

		rk += 8;
	} while (--r);

#ifndef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
	// The aligned build's final round reads the 256-byte Sd table
	// directly, so pre-read its cache lines too (same countermeasure as
	// above; u remains 0 and the ORs are no-ops).
	u = 0;
	for (i=0; i<256; i+=cacheLineSize)
		u &= *(const word32 *)(Sd+i);
	u &= *(const word32 *)(Sd+252);
	t0 |= u; t1 |= u; t2 |= u; t3 |= u;
#endif

	// Final round: raw inverse S-box bytes (no InvMixColumns), scattered
	// in inverse-ShiftRows order, then AddRoundKey with the last key.
	word32 tbw[4];
	byte *const tempBlock = (byte *)tbw;

	QUARTER_ROUND_LD(t2, 7, 2, 13, 8)
	QUARTER_ROUND_LD(t1, 3, 14, 9, 4)
	QUARTER_ROUND_LD(t0, 15, 10, 5, 0)
	QUARTER_ROUND_LD(t3, 11, 6, 1, 12)

	Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
}
00441
00442
00443
00444 #pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
00445
00446 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
00447
00448 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
00449
// SSE2/x64 assembly implementation of AES encryption over a run of
// blocks.  'locals' points to a 256-byte-aligned Locals workspace (see
// Rijndael::Enc::AdvancedProcessBlocks below) whose field offsets are the
// L_* constants here; 'k' points to the expanded round keys.
CRYPTOPP_NAKED void CRYPTOPP_FASTCALL Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k)
{
#if CRYPTOPP_BOOL_X86

// x86: the workspace is addressed through esp (repointed below), and the
// fourth working word is kept in MMX registers.
#define L_REG			esp
#define L_INDEX(i)		(L_REG+512+i)
#define L_INXORBLOCKS	L_INBLOCKS+4
#define L_OUTXORBLOCKS	L_INBLOCKS+8
#define L_OUTBLOCKS		L_INBLOCKS+12
#define L_INCREMENTS	L_INDEX(16*15)
#define L_SP			L_INDEX(16*16)
#define L_LENGTH		L_INDEX(16*16+4)
#define L_KEYS_BEGIN	L_INDEX(16*16+8)

#define MOVD			movd
#define MM(i)			mm##i

// XOR a Te entry (byte index b, column c) into MMX register MM(a).
#define MXOR(a,b,c)	\
	AS2(	movzx	esi, b)\
	AS2(	movd	mm7, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
	AS2(	pxor	MM(a), mm7)\

// Load a Te entry into MMX register MM(a).
#define MMOV(a,b,c)	\
	AS2(	movzx	esi, b)\
	AS2(	movd	MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#else

// x64: the workspace is addressed through r8, and plain 32-bit registers
// stand in for the MMX registers used on x86.
#define L_REG			r8
#define L_INDEX(i)		(L_REG+i)
#define L_INXORBLOCKS	L_INBLOCKS+8
#define L_OUTXORBLOCKS	L_INBLOCKS+16
#define L_OUTBLOCKS		L_INBLOCKS+24
#define L_INCREMENTS	L_INDEX(16*16)
#define L_LENGTH		L_INDEX(16*18+8)
#define L_KEYS_BEGIN	L_INDEX(16*19)

#define MOVD			mov
#define MM_0			r9d
#define MM_1			r12d
#ifdef __GNUC__
#define MM_2			r11d
#else
#define MM_2			r10d
#endif
#define MM(i)			MM_##i

#define MXOR(a,b,c)	\
	AS2(	movzx	esi, b)\
	AS2(	xor		MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#define MMOV(a,b,c)	\
	AS2(	movzx	esi, b)\
	AS2(	mov		MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#endif

// Workspace layout: subkey copy, saved counter-mode state, last-round
// output buffer, and the block pointers.
#define L_SUBKEYS		L_INDEX(0)
#define L_SAVED_X		L_SUBKEYS
#define L_KEY12			L_INDEX(16*12)
#define L_LASTROUND		L_INDEX(16*13)
#define L_INBLOCKS		L_INDEX(16*14)
// Map byte column 0..3 to byte offset 1..4 inside an 8-byte Te entry (the
// word32 rotations are stored one byte into each entry; see FillEncTable).
#define MAP0TO4(i)		(ASM_MOD(i+3,4)+1)

// XOR / load a Te entry into 32-bit register a, indexed by byte b.
#define XOR(a,b,c)	\
	AS2(	movzx	esi, b)\
	AS2(	xor		a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#define MOV(a,b,c)	\
	AS2(	movzx	esi, b)\
	AS2(	mov		a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#ifdef CRYPTOPP_GENERATE_X64_MASM
	ALIGN   8
	Rijndael_Enc_AdvancedProcessBlocks	PROC FRAME
	rex_push_reg rsi
	push_reg rdi
	push_reg rbx
	push_reg r12
	.endprolog
	mov L_REG, rcx
	mov AS_REG_7, ?Te@rdtable@CryptoPP@@3PA_KA
	mov edi, DWORD PTR [?g_cacheLineSize@CryptoPP@@3IA]
#elif defined(__GNUC__)
	__asm__ __volatile__
	(
	".intel_syntax noprefix;"
#if CRYPTOPP_BOOL_X64
	AS2(	mov		L_REG, rcx)
#endif
	AS_PUSH_IF86(bx)
	AS_PUSH_IF86(bp)
	AS2(	mov		AS_REG_7, WORD_REG(si))
#else
	AS_PUSH_IF86(si)
	AS_PUSH_IF86(di)
	AS_PUSH_IF86(bx)
	AS_PUSH_IF86(bp)
	AS2(	lea		AS_REG_7, [Te])
	AS2(	mov		edi, [g_cacheLineSize])
#endif

#if CRYPTOPP_BOOL_X86
	// Save the caller's esp at L_SP, then point esp into the workspace so
	// the L_INDEX (esp+512+i) offsets resolve.
	AS2(	mov		[ecx+16*12+16*4], esp)
	AS2(	lea		esp, [ecx-512])
#endif

	// Copy round keys (from offset keysBegin) into the workspace.
	AS2(	mov		WORD_REG(si), [L_KEYS_BEGIN])
	AS2(	mov		WORD_REG(ax), 16)
	AS2(	and		WORD_REG(ax), WORD_REG(si))
	AS2(	movdqa	xmm3, XMMWORD_PTR [WORD_REG(dx)+16+WORD_REG(ax)])
	AS2(	movdqa	[L_KEY12], xmm3)
	AS2(	lea		WORD_REG(ax), [WORD_REG(dx)+WORD_REG(ax)+2*16])
	AS2(	sub		WORD_REG(ax), WORD_REG(si))
	ASL(0)
	AS2(	movdqa	xmm0, [WORD_REG(ax)+WORD_REG(si)])
	AS2(	movdqa	XMMWORD_PTR [L_SUBKEYS+WORD_REG(si)], xmm0)
	AS2(	add		WORD_REG(si), 16)
	AS2(	cmp		WORD_REG(si), 16*12)
	ASJ(	jl,		0, b)

	// xmm4 = last round key; xmm1 = first round key; MM(1)/ebx/ecx/edx =
	// second round key.
	AS2(	movdqa	xmm4, [WORD_REG(ax)+WORD_REG(si)])
	AS2(	movdqa	xmm1, [WORD_REG(dx)])
	AS2(	MOVD	MM(1), [WORD_REG(dx)+4*4])
	AS2(	mov		ebx, [WORD_REG(dx)+5*4])
	AS2(	mov		ecx, [WORD_REG(dx)+6*4])
	AS2(	mov		edx, [WORD_REG(dx)+7*4])

	// Cache-timing countermeasure: read one word from every cache line of
	// Te (edi holds the cache line size) before secret-dependent lookups.
	AS2(	xor		WORD_REG(ax), WORD_REG(ax))
	ASL(9)
	AS2(	mov		esi, [AS_REG_7+WORD_REG(ax)])
	AS2(	add		WORD_REG(ax), WORD_REG(di))
	AS2(	mov		esi, [AS_REG_7+WORD_REG(ax)])
	AS2(	add		WORD_REG(ax), WORD_REG(di))
	AS2(	mov		esi, [AS_REG_7+WORD_REG(ax)])
	AS2(	add		WORD_REG(ax), WORD_REG(di))
	AS2(	mov		esi, [AS_REG_7+WORD_REG(ax)])
	AS2(	add		WORD_REG(ax), WORD_REG(di))
	AS2(	cmp		WORD_REG(ax), 2048)
	ASJ(	jl,		9, b)
	AS1(	lfence)

	// Odd L_LENGTH (lengthAndCounterFlag) signals counter mode.
	AS2(	test	DWORD PTR [L_LENGTH], 1)
	ASJ(	jz,		8, f)

	// Counter mode one-time setup: xor the counter block with the first
	// round key; keep the last (fast-changing) counter byte in MM(2).
	AS2(	mov		WORD_REG(si), [L_INBLOCKS])
	AS2(	movdqu	xmm2, [WORD_REG(si)])
	AS2(	pxor	xmm2, xmm1)
	AS2(	psrldq	xmm1, 14)
	AS2(	movd	eax, xmm1)
	AS2(	mov		al, BYTE PTR [WORD_REG(si)+15])
	AS2(	MOVD	MM(2), eax)
#if CRYPTOPP_BOOL_X86
	AS2(	mov		eax, 1)
	AS2(	movd	mm3, eax)
#endif

	// First round of the counter block, excluding the final counter byte
	// (folded in per block at label 1).
	AS2(	movd	eax, xmm2)
	AS2(	psrldq	xmm2, 4)
	AS2(	movd	edi, xmm2)
	AS2(	psrldq	xmm2, 4)
	MXOR(	1, al, 0)
	XOR(	edx, ah, 1)
	AS2(	shr		eax, 16)
	XOR(	ecx, al, 2)
	XOR(	ebx, ah, 3)
	AS2(	mov		eax, edi)
	AS2(	movd	edi, xmm2)
	AS2(	psrldq	xmm2, 4)
	XOR(	ebx, al, 0)
	MXOR(	1, ah, 1)
	AS2(	shr		eax, 16)
	XOR(	edx, al, 2)
	XOR(	ecx, ah, 3)
	AS2(	mov		eax, edi)
	AS2(	movd	edi, xmm2)
	XOR(	ecx, al, 0)
	XOR(	ebx, ah, 1)
	AS2(	shr		eax, 16)
	MXOR(	1, al, 2)
	XOR(	edx, ah, 3)
	AS2(	mov		eax, edi)
	XOR(	edx, al, 0)
	XOR(	ecx, ah, 1)
	AS2(	shr		eax, 16)
	XOR(	ebx, al, 2)
	AS2(	psrldq	xmm2, 3)

	// Partial second round using the L_KEY12 round key; the result is the
	// per-key invariant part, cached in L_SAVED_X and reused per block.
	AS2(	mov		eax, [L_KEY12+0*4])
	AS2(	mov		edi, [L_KEY12+2*4])
	AS2(	MOVD	MM(0), [L_KEY12+3*4])
	MXOR(	0, cl, 3)
	XOR(	edi, bl, 3)
	MXOR(	0, bh, 2)
	AS2(	shr		ebx, 16)
	XOR(	eax, bl, 1)
	MOV(	ebx, bh, 0)
	AS2(	xor		ebx, [L_KEY12+1*4])
	XOR(	eax, ch, 2)
	AS2(	shr		ecx, 16)
	XOR(	eax, dl, 3)
	XOR(	ebx, dh, 2)
	AS2(	shr		edx, 16)
	XOR(	edi, ch, 0)
	XOR(	ebx, cl, 1)
	XOR(	edi, dl, 1)
	MXOR(	0, dh, 0)

	AS2(	movd	ecx, xmm2)
	AS2(	MOVD	edx, MM(1))
	AS2(	MOVD	[L_SAVED_X+3*4], MM(0))
	AS2(	mov		[L_SAVED_X+0*4], eax)
	AS2(	mov		[L_SAVED_X+1*4], ebx)
	AS2(	mov		[L_SAVED_X+2*4], edi)
	ASJ(	jmp,	5, f)

	// Non-counter per-block loop entry: reload the L_KEY12 round key.
	ASL(3)
	AS2(	MOVD	MM(1), [L_KEY12+0*4])
	AS2(	mov		ebx, [L_KEY12+1*4])
	AS2(	mov		ecx, [L_KEY12+2*4])
	AS2(	mov		edx, [L_KEY12+3*4])
	ASL(8)
	// Load the input block; xor with the first round key and the
	// input-xor block.
	AS2(	mov		WORD_REG(ax), [L_INBLOCKS])
	AS2(	movdqu	xmm2, [WORD_REG(ax)])
	AS2(	mov		WORD_REG(si), [L_INXORBLOCKS])
	AS2(	movdqu	xmm5, [WORD_REG(si)])
	AS2(	pxor	xmm2, xmm1)
	AS2(	pxor	xmm2, xmm5)

	// First round for this block.
	AS2(	movd	eax, xmm2)
	AS2(	psrldq	xmm2, 4)
	AS2(	movd	edi, xmm2)
	AS2(	psrldq	xmm2, 4)
	MXOR(	1, al, 0)
	XOR(	edx, ah, 1)
	AS2(	shr		eax, 16)
	XOR(	ecx, al, 2)
	XOR(	ebx, ah, 3)
	AS2(	mov		eax, edi)
	AS2(	movd	edi, xmm2)
	AS2(	psrldq	xmm2, 4)
	XOR(	ebx, al, 0)
	MXOR(	1, ah, 1)
	AS2(	shr		eax, 16)
	XOR(	edx, al, 2)
	XOR(	ecx, ah, 3)
	AS2(	mov		eax, edi)
	AS2(	movd	edi, xmm2)
	XOR(	ecx, al, 0)
	XOR(	ebx, ah, 1)
	AS2(	shr		eax, 16)
	MXOR(	1, al, 2)
	XOR(	edx, ah, 3)
	AS2(	mov		eax, edi)
	XOR(	edx, al, 0)
	XOR(	ecx, ah, 1)
	AS2(	shr		eax, 16)
	XOR(	ebx, al, 2)
	MXOR(	1, ah, 3)
	AS2(	MOVD	eax, MM(1))

	AS2(	add		L_REG, [L_KEYS_BEGIN])
	AS2(	add		L_REG, 4*16)
	ASJ(	jmp,	2, f)

	// Counter per-block loop entry: restore the cached invariant state and
	// fold in only the incremented counter byte.
	ASL(1)
	AS2(	MOVD	ecx, MM(2))
	AS2(	MOVD	edx, MM(1))
	AS2(	mov		eax, [L_SAVED_X+0*4])
	AS2(	mov		ebx, [L_SAVED_X+1*4])
	AS2(	xor		cl, ch)
	AS2(	and		WORD_REG(cx), 255)
	ASL(5)
	// Increment the counter byte for the next block.
#if CRYPTOPP_BOOL_X86
	AS2(	paddb	MM(2), mm3)
#else
	AS2(	add		MM(2), 1)
#endif

	// Complete the cached round with the counter-byte contribution.
	AS2(	xor		edx, DWORD PTR [AS_REG_7+WORD_REG(cx)*8+3])
	XOR(	ebx, dl, 3)
	MOV(	ecx, dh, 2)
	AS2(	shr		edx, 16)
	AS2(	xor		ecx, [L_SAVED_X+2*4])
	XOR(	eax, dh, 0)
	MOV(	edx, dl, 1)
	AS2(	xor		edx, [L_SAVED_X+3*4])

	AS2(	add		L_REG, [L_KEYS_BEGIN])
	AS2(	add		L_REG, 3*16)
	ASJ(	jmp,	4, f)

// One full AES round: transforms the state in eax/ebx/ecx/edx (with edi
// and MM(0) as scratch/output words) via four table lookups per word.
#define ROUND()		\
	MXOR(	0, cl, 3)	\
	AS2(	mov	cl, al)	\
	XOR(	edi, ah, 2)	\
	AS2(	shr eax, 16)	\
	XOR(	edi, bl, 3)	\
	MXOR(	0, bh, 2)	\
	AS2(	shr ebx, 16)	\
	MXOR(	0, al, 1)	\
	MOV(	eax, ah, 0)	\
	XOR(	eax, bl, 1)	\
	MOV(	ebx, bh, 0)	\
	XOR(	eax, ch, 2)	\
	XOR(	ebx, cl, 3)	\
	AS2(	shr ecx, 16)	\
	XOR(	eax, dl, 3)	\
	XOR(	ebx, dh, 2)	\
	AS2(	shr edx, 16)	\
	XOR(	edi, ch, 0)	\
	XOR(	ebx, cl, 1)	\
	XOR(	edi, dl, 1)	\
	MXOR(	0, dh, 0)	\

	// Main round loop: two rounds per iteration, advancing L_REG through
	// the subkey copy; exits when L_REG hits a 256-byte boundary (the
	// subkeys were copied at keysBegin so the boundary falls after the
	// last middle round).
	ASL(2)
	AS2(	MOVD	MM(0), [L_SUBKEYS-4*16+3*4])
	AS2(	mov		edi, [L_SUBKEYS-4*16+2*4])
	ROUND()
	AS2(	mov		ecx, edi)
	AS2(	xor		eax, [L_SUBKEYS-4*16+0*4])
	AS2(	xor		ebx, [L_SUBKEYS-4*16+1*4])
	AS2(	MOVD	edx, MM(0))

	ASL(4)
	AS2(	MOVD	MM(0), [L_SUBKEYS-4*16+7*4])
	AS2(	mov		edi, [L_SUBKEYS-4*16+6*4])
	ROUND()
	AS2(	mov		ecx, edi)
	AS2(	xor		eax, [L_SUBKEYS-4*16+4*4])
	AS2(	xor		ebx, [L_SUBKEYS-4*16+5*4])
	AS2(	MOVD	edx, MM(0))

	AS2(	add		L_REG, 32)
	AS2(	test	L_REG, 255)
	ASJ(	jnz,	2, b)
	AS2(	sub		L_REG, 16*16)

// Final round (no MixColumns): the raw S-box byte sits at offset 1 of
// each 8-byte Te entry; each invocation produces two output bytes into
// the L_LASTROUND buffer at offset c.
#define LAST(a, b, c)	\
	AS2(	movzx	esi, a	)\
	AS2(	movzx	edi, BYTE PTR [AS_REG_7+WORD_REG(si)*8+1]	)\
	AS2(	movzx	esi, b	)\
	AS2(	xor		edi, DWORD PTR [AS_REG_7+WORD_REG(si)*8+0]	)\
	AS2(	mov		WORD PTR [L_LASTROUND+c], di	)\

	LAST(ch, dl, 2)
	LAST(dh, al, 6)
	AS2(	shr		edx, 16)
	LAST(ah, bl, 10)
	AS2(	shr		eax, 16)
	LAST(bh, cl, 14)
	AS2(	shr		ebx, 16)
	LAST(dh, al, 12)
	AS2(	shr		ecx, 16)
	LAST(ah, bl, 0)
	LAST(bh, cl, 4)
	LAST(ch, dl, 8)

	// Write the output block: output-xor block ^ last round key ^
	// last-round bytes, then update pointers/length.
	AS2(	mov		WORD_REG(ax), [L_OUTXORBLOCKS])
	AS2(	mov		WORD_REG(bx), [L_OUTBLOCKS])

	AS2(	mov		WORD_REG(cx), [L_LENGTH])
	AS2(	sub		WORD_REG(cx), 16)

	AS2(	movdqu	xmm2, [WORD_REG(ax)])
	AS2(	pxor	xmm2, xmm4)

	// Advance the block pointers by the increments stored in the locals.
#if CRYPTOPP_BOOL_X86
	AS2(	movdqa	xmm0, [L_INCREMENTS])
	AS2(	paddd	xmm0, [L_INBLOCKS])
	AS2(	movdqa	[L_INBLOCKS], xmm0)
#else
	AS2(	movdqa	xmm0, [L_INCREMENTS+16])
	AS2(	paddq	xmm0, [L_INBLOCKS+16])
	AS2(	movdqa	[L_INBLOCKS+16], xmm0)
#endif

	AS2(	pxor	xmm2, [L_LASTROUND])
	AS2(	movdqu	[WORD_REG(bx)], xmm2)

	// Loop: done when the remaining length is <= 0; bit 0 of the length
	// selects the counter-mode (label 1) or plain (label 3) block path.
	ASJ(	jle,	7, f)
	AS2(	mov		[L_LENGTH], WORD_REG(cx))
	AS2(	test	WORD_REG(cx), 1)
	ASJ(	jnz,	1, b)
#if CRYPTOPP_BOOL_X64
	AS2(	movdqa	xmm0, [L_INCREMENTS])
	AS2(	paddq	xmm0, [L_INBLOCKS])
	AS2(	movdqa	[L_INBLOCKS], xmm0)
#endif
	ASJ(	jmp,	3, b)

	// Done: wipe the round-key copy from the workspace, restore the
	// caller's registers/stack and return.
	ASL(7)
	AS2(	xorps	xmm0, xmm0)
	AS2(	lea		WORD_REG(ax), [L_SUBKEYS+7*16])
	AS2(	movaps	[WORD_REG(ax)-7*16], xmm0)
	AS2(	movaps	[WORD_REG(ax)-6*16], xmm0)
	AS2(	movaps	[WORD_REG(ax)-5*16], xmm0)
	AS2(	movaps	[WORD_REG(ax)-4*16], xmm0)
	AS2(	movaps	[WORD_REG(ax)-3*16], xmm0)
	AS2(	movaps	[WORD_REG(ax)-2*16], xmm0)
	AS2(	movaps	[WORD_REG(ax)-1*16], xmm0)
	AS2(	movaps	[WORD_REG(ax)+0*16], xmm0)
	AS2(	movaps	[WORD_REG(ax)+1*16], xmm0)
	AS2(	movaps	[WORD_REG(ax)+2*16], xmm0)
	AS2(	movaps	[WORD_REG(ax)+3*16], xmm0)
	AS2(	movaps	[WORD_REG(ax)+4*16], xmm0)
	AS2(	movaps	[WORD_REG(ax)+5*16], xmm0)
	AS2(	movaps	[WORD_REG(ax)+6*16], xmm0)
#if CRYPTOPP_BOOL_X86
	AS2(	mov		esp, [L_SP])
	AS1(	emms)
#endif
	AS_POP_IF86(bp)
	AS_POP_IF86(bx)
#if defined(_MSC_VER) && CRYPTOPP_BOOL_X86
	AS_POP_IF86(di)
	AS_POP_IF86(si)
	AS1(ret)
#endif
#ifdef CRYPTOPP_GENERATE_X64_MASM
	pop r12
	pop rbx
	pop rdi
	pop rsi
	ret
	Rijndael_Enc_AdvancedProcessBlocks ENDP
#endif
#ifdef __GNUC__
	".att_syntax prefix;"
	:
	: "c" (locals), "d" (k), "S" (Te), "D" (g_cacheLineSize)
	: "memory", "cc", "%eax"
#if CRYPTOPP_BOOL_X64
	, "%rbx", "%r8", "%r9", "%r10", "%r11", "%r12"
#endif
	);
#endif
}
00901
00902 #endif
00903
00904 #ifndef CRYPTOPP_GENERATE_X64_MASM
00905
00906 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
00907 extern "C" {
00908 void Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k);
00909 }
00910 #endif
00911
00912 #if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X86
00913
00914 static inline bool AliasedWithTable(const byte *begin, const byte *end)
00915 {
00916 size_t s0 = size_t(begin)%4096, s1 = size_t(end)%4096;
00917 size_t t0 = size_t(Te)%4096, t1 = (size_t(Te)+sizeof(Te))%4096;
00918 if (t1 > t0)
00919 return (s0 >= t0 && s0 < t1) || (s1 > t0 && s1 <= t1);
00920 else
00921 return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0);
00922 }
00923
// Encrypt a run of blocks, dispatching to the SSE2/x64 assembly routine
// when available; falls back to the generic implementation otherwise.
// Returns the number of unprocessed trailing bytes.
size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
{
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
	if (length < BLOCKSIZE)
		return length;

	if (HasSSE2())
	{
		// Workspace consumed by Rijndael_Enc_AdvancedProcessBlocks.  The
		// field order/offsets must match the L_* constants in the assembly
		// code above -- do not reorder.
		struct Locals
		{
			word32 subkeys[4*12], workspace[8];
			const byte *inBlocks, *inXorBlocks, *outXorBlocks;
			byte *outBlocks;
			size_t inIncrement, inXorIncrement, outXorIncrement, outIncrement;
			size_t regSpill, lengthAndCounterFlag, keysBegin;
		};

		size_t increment = BLOCKSIZE;
		// Te[256..257] are zeroed sentinels (see FillEncTable); used as an
		// all-zero stand-in when no xor block applies.
		const byte* zeros = (byte *)(Te+256);
		byte *space;

		// Allocate a 256-byte-aligned workspace whose 4 KB page offset does
		// not overlap Te's, so workspace accesses cannot evict the table's
		// cache lines (cache-timing countermeasure).  Retry with a fresh
		// alloca until AliasedWithTable is satisfied.
		do {
			space = (byte *)alloca(255+sizeof(Locals));
			space += (256-(size_t)space%256)%256;
		}
		while (AliasedWithTable(space, space+sizeof(Locals)));

		if (flags & BT_ReverseDirection)
		{
			// Walk the buffers from the last block toward the first.
			assert(length % BLOCKSIZE == 0);
			inBlocks += length - BLOCKSIZE;
			xorBlocks += length - BLOCKSIZE;
			outBlocks += length - BLOCKSIZE;
			increment = 0-increment;
		}

		Locals &locals = *(Locals *)space;

		// BT_XorInput selects xor-before-encrypt (inXor) versus
		// xor-after-encrypt (outXor); the unused side gets the zero block.
		locals.inBlocks = inBlocks;
		locals.inXorBlocks = (flags & BT_XorInput) && xorBlocks ? xorBlocks : zeros;
		locals.outXorBlocks = (flags & BT_XorInput) || !xorBlocks ? zeros : xorBlocks;
		locals.outBlocks = outBlocks;

		locals.inIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
		locals.inXorIncrement = (flags & BT_XorInput) && xorBlocks ? increment : 0;
		locals.outXorIncrement = (flags & BT_XorInput) || !xorBlocks ? 0 : increment;
		locals.outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;

		// Length rounded down to whole blocks; subtracting the counter
		// flag makes the value odd in counter mode -- the assembly tests
		// bit 0 of this field to select the counter-mode path.
		locals.lengthAndCounterFlag = length - (length%16) - bool(flags & BT_InBlockIsCounter);
		// Number of round keys the assembly copies into its workspace;
		// keysBegin is the byte offset of the first copied key.  The
		// remaining early-round keys are handled separately by the
		// assembly (NOTE(review): offsets derived from the asm's key
		// handling -- confirm against L_KEYS_BEGIN usage if modified).
		int keysToCopy = m_rounds - (flags & BT_InBlockIsCounter ? 3 : 2);
		locals.keysBegin = (12-keysToCopy)*16;

		Rijndael_Enc_AdvancedProcessBlocks(&locals, m_key);
		return length%16;
	}
	else
#endif
		return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}
00983
00984 #endif
00985
00986 NAMESPACE_END
00987
00988 #endif
00989 #endif