Skip to content

Commit 767c693

Browse files
committed
gosthash2012: Import SSE4.1 implementation
Link: https://github.com/adegtyarev/streebog Signed-off-by: Vitaly Chikunov <vt@altlinux.org>
1 parent 2e291a9 commit 767c693

File tree

4 files changed

+191
-3
lines changed

4 files changed

+191
-3
lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@ set(GOST_HASH_2012_SOURCE_FILES
128128
gosthash2012_precalc.h
129129
gosthash2012_ref.c
130130
gosthash2012_sse2.c
131+
gosthash2012_sse41.c
131132
)
132133

133134
set(GOST_GRASSHOPPER_SOURCE_FILES

gosthash2012.c

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -115,13 +115,21 @@ void g(union uint512_u *h, const union uint512_u * RESTRICT N,
115115
const union uint512_u * RESTRICT m)
116116
{
117117
#ifdef __GOST3411_DISPATCH__
118+
# if defined __GOST3411_HAS_SSE41__
119+
if (__builtin_cpu_supports("sse4.1"))
120+
return g_sse41(h, N, m);
121+
# endif
118122
# if defined __GOST3411_HAS_SSE2__
119123
if (__builtin_cpu_supports("sse2"))
120124
return g_sse2(h, N, m);
121-
# elif defined __GOST3411_HAS_REF__
125+
# endif
126+
# if defined __GOST3411_HAS_REF__
122127
g_ref(h, N, m);
123-
# else
124-
# error "No implementation of g() is selected."
128+
# endif
129+
# if !defined __GOST3411_HAS_SSE41__ && \
130+
!defined __GOST3411_HAS_SSE2__ && \
131+
!defined __GOST3411_HAS_REF__
132+
# error "No dynamic implementation of g() is selected."
125133
# endif
126134
#else /* !__GOST3411_DISPATCH__ */
127135
# if defined __GOST3411_HAS_SSE2__ && defined __SSE2__

gosthash2012.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
#if defined __x86_64__ || defined __i386__
1717
# define __GOST3411_HAS_SSE2__
18+
# define __GOST3411_HAS_SSE41__
1819
#elif defined __SSE2__
1920
# define __GOST3411_HAS_SSE2__
2021
# if !defined __e2k__
@@ -112,3 +113,8 @@ _internal _target("sse2")
112113
void g_sse2(union uint512_u *h, const union uint512_u * RESTRICT N,
113114
const union uint512_u * RESTRICT m);
114115
#endif
116+
#ifdef __GOST3411_HAS_SSE41__
117+
_internal _target("sse4.1")
118+
void g_sse41(union uint512_u *h, const union uint512_u * RESTRICT N,
119+
const union uint512_u * RESTRICT m);
120+
#endif

gosthash2012_sse41.c

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
/*
2+
* Copyright (c) 2013, Alexey Degtyarev <alexey@renatasystems.org>.
3+
* All rights reserved.
4+
*
5+
* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0+
6+
*/
7+
8+
#include "gosthash2012.h"
9+
#ifdef __GOST3411_HAS_SSE41__
10+
11+
#include <mmintrin.h>
12+
#include <emmintrin.h>
13+
#include <smmintrin.h>
14+
15+
/*
 * Choose the row extractor used by the XLPS macros: the MMX-based
 * EXTRACT32 on i386 builds, the 64-bit integer EXTRACT64 everywhere else.
 */
#if defined(__i386__)
# define EXTRACT EXTRACT32
#else
# define EXTRACT EXTRACT64
#endif
20+
21+
#ifndef __ICC
/*
 * When not building with ICC, express the 64-bit integer <-> __m64
 * conversions as plain casts.
 *
 * Both the argument and the whole expansion are parenthesized so that an
 * expression argument (e.g. `a & b`) is converted as a whole instead of
 * having the cast bind to its first operand only.
 */
#define _mm_cvtsi64_m64(v) ((__m64) (v))
#define _mm_cvtm64_si64(v) ((long long) (v))
#endif
25+
26+
/*
 * LOAD: read the four consecutive 128-bit words that start at P into
 * xmm0..xmm3, using unaligned loads.
 *
 * P may be any pointer expression; it is parenthesized before indexing.
 * Wrapped in do { } while (0) so the macro behaves as a single statement.
 */
#define LOAD(P, xmm0, xmm1, xmm2, xmm3) do { \
    const __m128i *load_src_ = (const __m128i *) &(P)[0]; \
    (xmm0) = _mm_loadu_si128(&load_src_[0]); \
    (xmm1) = _mm_loadu_si128(&load_src_[1]); \
    (xmm2) = _mm_loadu_si128(&load_src_[2]); \
    (xmm3) = _mm_loadu_si128(&load_src_[3]); \
} while (0)
33+
34+
/*
 * UNLOAD: write xmm0..xmm3 to the four consecutive 128-bit words that
 * start at P.
 *
 * Unaligned stores are used so that P is treated the same way as in LOAD
 * and X128M, which read through their pointer with unaligned loads; an
 * aligned store would fault whenever P is not 16-byte aligned.
 *
 * P may be any pointer expression; it is parenthesized before indexing.
 * Wrapped in do { } while (0) so the macro behaves as a single statement.
 */
#define UNLOAD(P, xmm0, xmm1, xmm2, xmm3) do { \
    __m128i *unload_dst_ = (__m128i *) &(P)[0]; \
    _mm_storeu_si128(&unload_dst_[0], (xmm0)); \
    _mm_storeu_si128(&unload_dst_[1], (xmm1)); \
    _mm_storeu_si128(&unload_dst_[2], (xmm2)); \
    _mm_storeu_si128(&unload_dst_[3], (xmm3)); \
} while (0)
41+
42+
/*
 * X128R: XOR the register quadruple xmm4..xmm7 into xmm0..xmm3
 * (xmm0 ^= xmm4, xmm1 ^= xmm5, xmm2 ^= xmm6, xmm3 ^= xmm7).
 * The second quadruple is left unchanged.
 *
 * Wrapped in do { } while (0) so the macro behaves as a single statement,
 * e.g. as the body of an if/else.
 */
#define X128R(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7) do { \
    (xmm0) = _mm_xor_si128((xmm0), (xmm4)); \
    (xmm1) = _mm_xor_si128((xmm1), (xmm5)); \
    (xmm2) = _mm_xor_si128((xmm2), (xmm6)); \
    (xmm3) = _mm_xor_si128((xmm3), (xmm7)); \
} while (0)
48+
49+
/*
 * X128M: XOR the four consecutive 128-bit words that start at P into
 * xmm0..xmm3, reading memory with unaligned loads.
 *
 * P may be any pointer expression; it is parenthesized before indexing.
 * Wrapped in do { } while (0) so the macro behaves as a single statement.
 */
#define X128M(P, xmm0, xmm1, xmm2, xmm3) do { \
    const __m128i *xor_src_ = (const __m128i *) &(P)[0]; \
    (xmm0) = _mm_xor_si128((xmm0), _mm_loadu_si128(&xor_src_[0])); \
    (xmm1) = _mm_xor_si128((xmm1), _mm_loadu_si128(&xor_src_[1])); \
    (xmm2) = _mm_xor_si128((xmm2), _mm_loadu_si128(&xor_src_[2])); \
    (xmm3) = _mm_xor_si128((xmm3), _mm_loadu_si128(&xor_src_[3])); \
} while (0)
56+
57+
/* XOR the __m64 value mm0 with mm1, where mm1 is a 64-bit integer that is
 * first converted to __m64 via _mm_cvtsi64_m64(). */
#define _mm_xor_64(mm0, mm1) _mm_xor_si64(mm0, _mm_cvtsi64_m64(mm1))
58+
59+
/* Extract byte number ndx of the 128-bit value src and cast it to
 * unsigned char; the EXTRACT macros use the result as a table index. */
#define _mm_extract_char(src, ndx) (unsigned char) _mm_extract_epi8(src, ndx)
60+
61+
/*
 * EXTRACT32: build one 128-bit output word from two byte rows of the
 * state held in xmm0..xmm3, accumulating in MMX (__m64) values.
 *
 * For k = 0..7, table Ax[k] is indexed with a byte of register xmm(k/2):
 * byte `row` (even k) or byte `row + 8` (odd k) for the low half, and
 * bytes `row + 1` / `row + 9` for the high half.  The eight looked-up
 * values of each half are XORed together; xmm4 receives the `row` result
 * in its low 64 bits and the `row + 1` result in its high 64 bits.
 *
 * Ax is a lookup table declared outside this file section (presumably
 * eight tables of 256 64-bit entries -- confirm against its definition).
 * row has to be a compile-time constant because it ends up as the
 * immediate operand of _mm_extract_epi8().
 *
 * Because MMX values are used here, g_sse41() calls _mm_empty() on i386
 * before returning.  Wrapped in do { } while (0) so the macro behaves as
 * a single statement.
 */
#define EXTRACT32(row, xmm0, xmm1, xmm2, xmm3, xmm4) do { \
    __m64 lo_, hi_; \
    \
    lo_ = _mm_cvtsi64_m64(Ax[0][_mm_extract_char(xmm0, (row) + 0)]); \
    lo_ = _mm_xor_64(lo_, Ax[1][_mm_extract_char(xmm0, (row) + 8)]); \
    lo_ = _mm_xor_64(lo_, Ax[2][_mm_extract_char(xmm1, (row) + 0)]); \
    lo_ = _mm_xor_64(lo_, Ax[3][_mm_extract_char(xmm1, (row) + 8)]); \
    lo_ = _mm_xor_64(lo_, Ax[4][_mm_extract_char(xmm2, (row) + 0)]); \
    lo_ = _mm_xor_64(lo_, Ax[5][_mm_extract_char(xmm2, (row) + 8)]); \
    lo_ = _mm_xor_64(lo_, Ax[6][_mm_extract_char(xmm3, (row) + 0)]); \
    lo_ = _mm_xor_64(lo_, Ax[7][_mm_extract_char(xmm3, (row) + 8)]); \
    \
    hi_ = _mm_cvtsi64_m64(Ax[0][_mm_extract_char(xmm0, (row) + 1)]); \
    hi_ = _mm_xor_64(hi_, Ax[1][_mm_extract_char(xmm0, (row) + 9)]); \
    hi_ = _mm_xor_64(hi_, Ax[2][_mm_extract_char(xmm1, (row) + 1)]); \
    hi_ = _mm_xor_64(hi_, Ax[3][_mm_extract_char(xmm1, (row) + 9)]); \
    hi_ = _mm_xor_64(hi_, Ax[4][_mm_extract_char(xmm2, (row) + 1)]); \
    hi_ = _mm_xor_64(hi_, Ax[5][_mm_extract_char(xmm2, (row) + 9)]); \
    hi_ = _mm_xor_64(hi_, Ax[6][_mm_extract_char(xmm3, (row) + 1)]); \
    hi_ = _mm_xor_64(hi_, Ax[7][_mm_extract_char(xmm3, (row) + 9)]); \
    \
    /* _mm_set_epi64 takes the high half first. */ \
    (xmm4) = _mm_set_epi64(hi_, lo_); \
} while (0)
84+
85+
/*
 * EXTRACT64: build one 128-bit output word from two byte rows of the
 * state held in xmm0..xmm3, accumulating in 64-bit integers.
 *
 * For k = 0..7, table Ax[k] is indexed with a byte of register xmm(k/2):
 * byte `row` (even k) or byte `row + 8` (odd k) for the low half, and
 * bytes `row + 1` / `row + 9` for the high half.  The eight looked-up
 * values of each half are XORed together; xmm4 receives the `row` result
 * in its low 64 bits and the `row + 1` result in its high 64 bits.
 *
 * Ax is a lookup table declared outside this file section (presumably
 * eight tables of 256 64-bit entries -- confirm against its definition).
 * row has to be a compile-time constant because it ends up as the
 * immediate operand of _mm_extract_epi8().
 *
 * Wrapped in do { } while (0) so the macro behaves as a single statement.
 */
#define EXTRACT64(row, xmm0, xmm1, xmm2, xmm3, xmm4) do { \
    unsigned long long lo_, hi_; \
    \
    lo_  = Ax[0][_mm_extract_char(xmm0, (row) + 0)]; \
    lo_ ^= Ax[1][_mm_extract_char(xmm0, (row) + 8)]; \
    lo_ ^= Ax[2][_mm_extract_char(xmm1, (row) + 0)]; \
    lo_ ^= Ax[3][_mm_extract_char(xmm1, (row) + 8)]; \
    lo_ ^= Ax[4][_mm_extract_char(xmm2, (row) + 0)]; \
    lo_ ^= Ax[5][_mm_extract_char(xmm2, (row) + 8)]; \
    lo_ ^= Ax[6][_mm_extract_char(xmm3, (row) + 0)]; \
    lo_ ^= Ax[7][_mm_extract_char(xmm3, (row) + 8)]; \
    \
    hi_  = Ax[0][_mm_extract_char(xmm0, (row) + 1)]; \
    hi_ ^= Ax[1][_mm_extract_char(xmm0, (row) + 9)]; \
    hi_ ^= Ax[2][_mm_extract_char(xmm1, (row) + 1)]; \
    hi_ ^= Ax[3][_mm_extract_char(xmm1, (row) + 9)]; \
    hi_ ^= Ax[4][_mm_extract_char(xmm2, (row) + 1)]; \
    hi_ ^= Ax[5][_mm_extract_char(xmm2, (row) + 9)]; \
    hi_ ^= Ax[6][_mm_extract_char(xmm3, (row) + 1)]; \
    hi_ ^= Ax[7][_mm_extract_char(xmm3, (row) + 9)]; \
    \
    /* Low lane first, then insert the high lane at index 1. */ \
    (xmm4) = _mm_cvtsi64_si128((long long) lo_); \
    (xmm4) = _mm_insert_epi64((xmm4), (long long) hi_, 1); \
} while (0)
108+
109+
/*
 * XLPS128M: XOR the 512-bit value at P into xmm0..xmm3 (X128M), then
 * replace xmm0..xmm3 with the table-lookup transform of that result
 * (EXTRACT on byte rows 0, 2, 4 and 6).
 *
 * All four outputs are computed into temporaries first because every
 * EXTRACT reads all four input registers.
 *
 * P is forwarded parenthesized so that any pointer expression is safe.
 * Wrapped in do { } while (0) so the macro behaves as a single statement.
 */
#define XLPS128M(P, xmm0, xmm1, xmm2, xmm3) do { \
    __m128i out0_, out1_, out2_, out3_; \
    \
    X128M((P), xmm0, xmm1, xmm2, xmm3); \
    \
    EXTRACT(0, xmm0, xmm1, xmm2, xmm3, out0_); \
    EXTRACT(2, xmm0, xmm1, xmm2, xmm3, out1_); \
    EXTRACT(4, xmm0, xmm1, xmm2, xmm3, out2_); \
    EXTRACT(6, xmm0, xmm1, xmm2, xmm3, out3_); \
    \
    (xmm0) = out0_; \
    (xmm1) = out1_; \
    (xmm2) = out2_; \
    (xmm3) = out3_; \
} while (0)
123+
124+
/*
 * XLPS128R: XOR the register quadruple xmm0..xmm3 into xmm4..xmm7
 * (X128R), then replace xmm4..xmm7 with the table-lookup transform of
 * that result (EXTRACT on byte rows 0, 2, 4 and 6).  xmm0..xmm3 are only
 * read.
 *
 * All four outputs are computed into temporaries first because every
 * EXTRACT reads all four input registers.
 *
 * Wrapped in do { } while (0) so the macro behaves as a single statement.
 */
#define XLPS128R(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7) do { \
    __m128i out0_, out1_, out2_, out3_; \
    \
    X128R(xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3); \
    \
    EXTRACT(0, xmm4, xmm5, xmm6, xmm7, out0_); \
    EXTRACT(2, xmm4, xmm5, xmm6, xmm7, out1_); \
    EXTRACT(4, xmm4, xmm5, xmm6, xmm7, out2_); \
    EXTRACT(6, xmm4, xmm5, xmm6, xmm7, out3_); \
    \
    (xmm4) = out0_; \
    (xmm5) = out1_; \
    (xmm6) = out2_; \
    (xmm7) = out3_; \
} while (0)
138+
139+
/*
 * ROUND128: one round over two register quadruples.
 *
 * First the quadruple xmm0/xmm2/xmm4/xmm6 is XORed with the constant
 * C[i] and transformed (XLPS128M); then the updated quadruple is XORed
 * into xmm1/xmm3/xmm5/xmm7, which is transformed in turn (XLPS128R).
 * C is an array of constants declared outside this file section.
 *
 * The index is parenthesized so any integer expression may be passed.
 * Wrapped in do { } while (0) so the macro behaves as a single statement,
 * e.g. as an unbraced loop body.
 */
#define ROUND128(i, xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7) do { \
    XLPS128M((&C[(i)]), xmm0, xmm2, xmm4, xmm6); \
    XLPS128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7); \
} while (0)
143+
144+
/*
 * SSE4.1 variant of g(): updates the 512-bit value *h in place from
 * *N and *m.
 *
 * Two quadruples of 128-bit registers are used.  The first one is seeded
 * from N, XORed with h and transformed; the second one is seeded from m.
 * After the initial XLPS128R step, eleven ROUND128 iterations consume
 * C[0]..C[10]; C[11] is applied to the first quadruple only, which is
 * then XORed with the second quadruple, with h and with m, and stored
 * back to h.
 */
void g_sse41(union uint512_u *h, const union uint512_u * RESTRICT N,
             const union uint512_u * RESTRICT m)
{
    __m128i xmm0, xmm2, xmm4, xmm6; /* first quadruple, seeded from N */
    __m128i xmm1, xmm3, xmm5, xmm7; /* second quadruple, seeded from m */
    unsigned int round;

    /* The two loads touch disjoint registers, so they are grouped. */
    LOAD(N, xmm0, xmm2, xmm4, xmm6);
    LOAD(m, xmm1, xmm3, xmm5, xmm7);

    XLPS128M(h, xmm0, xmm2, xmm4, xmm6);
    XLPS128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7);

    for (round = 0; round < 11; ++round) {
        ROUND128(round, xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7);
    }

    /* Last constant: only the first quadruple is transformed. */
    XLPS128M((&C[11]), xmm0, xmm2, xmm4, xmm6);
    X128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7);

    /* Fold in the previous h and the message block, then write h back. */
    X128M(h, xmm0, xmm2, xmm4, xmm6);
    X128M(m, xmm0, xmm2, xmm4, xmm6);

    UNLOAD(h, xmm0, xmm2, xmm4, xmm6);
#ifdef __i386__
    /* EXTRACT32 uses MMX values, so the floating-point state has to be
     * restored before returning. */
    _mm_empty();
#endif
}
173+
#endif /* __GOST3411_HAS_SSE41__ */

0 commit comments

Comments
 (0)