lib/connections: Use our own fork of kcp (fixes #4063)

This updates kcp and uses our own fork, which:

1. Keys sessions not just by remote address, but by remote address plus
   conversation ID.
2. Allows connections that were passed directly to the library to be
   left open.
3. Resets the cache key if the session gets terminated.

GitHub-Pull-Request: https://github.com/syncthing/syncthing/pull/4339
LGTM: calmh

committed by Jakob Borg
parent ab132ff6fe
commit cbcc3ea132
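
To make points 1 and 3 concrete, below is a minimal sketch of a session cache keyed by remote address plus conversation ID. Every identifier in it (sessionKey, sessionCache, session) is invented for illustration; this is not the fork's actual code or API.

package main

import (
    "fmt"
    "net"
    "sync"
)

// sessionKey is a hypothetical composite cache key: one remote address can
// carry several KCP conversations, so the conversation ID has to be part of
// the key to keep the sessions apart (point 1 above).
type sessionKey struct {
    addr string // remote address, e.g. "192.0.2.1:22020"
    conv uint32 // KCP conversation ID
}

type session struct{ conv uint32 }

// sessionCache is a hypothetical cache keyed by sessionKey.
type sessionCache struct {
    mut      sync.Mutex
    sessions map[sessionKey]*session
}

func (c *sessionCache) get(raddr net.Addr, conv uint32) *session {
    c.mut.Lock()
    defer c.mut.Unlock()
    key := sessionKey{addr: raddr.String(), conv: conv}
    if s, ok := c.sessions[key]; ok {
        return s
    }
    s := &session{conv: conv}
    c.sessions[key] = s
    return s
}

// remove drops the cache entry when a session terminates, so a later
// conversation cannot pick up a stale session (point 3 above).
func (c *sessionCache) remove(raddr net.Addr, conv uint32) {
    c.mut.Lock()
    defer c.mut.Unlock()
    delete(c.sessions, sessionKey{addr: raddr.String(), conv: conv})
}

func main() {
    addr := &net.UDPAddr{IP: net.ParseIP("192.0.2.1"), Port: 22020}
    c := &sessionCache{sessions: make(map[sessionKey]*session)}
    s1 := c.get(addr, 1)
    s2 := c.get(addr, 2)
    fmt.Println(s1 != s2) // true: same address, distinct conversations
    c.remove(addr, 1)
}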
21 vendor/github.com/templexxx/xor/LICENSE (generated, vendored, new file)
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2017 Temple3x

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
442 vendor/github.com/templexxx/xor/avx2_amd64.s (generated, vendored, new file)
@@ -0,0 +1,442 @@
#include "textflag.h"

// addr of mem
#define DST BX
#define SRC SI
#define SRC0 TMP4
#define SRC1 TMP5

// loop args
// num of vect
#define VECT CX
#define LEN DX
// pos of matrix
#define POS R8

// tmp store
// num of vect or ...
#define TMP1 R9
// pos of matrix or ...
#define TMP2 R10
// store addr of data/parity or ...
#define TMP3 R11
#define TMP4 R12
#define TMP5 R13
#define TMP6 R14

// func bytesAVX2mini(dst, src0, src1 []byte, size int)
TEXT ·bytesAVX2mini(SB), NOSPLIT, $0
    MOVQ len+72(FP), LEN
    CMPQ LEN, $0
    JE ret
    MOVQ dst+0(FP), DST
    MOVQ src0+24(FP), SRC0
    MOVQ src1+48(FP), SRC1
    TESTQ $31, LEN
    JNZ not_aligned

aligned:
    MOVQ $0, POS

loop32b:
    VMOVDQU (SRC0)(POS*1), Y0
    VPXOR (SRC1)(POS*1), Y0, Y0
    VMOVDQU Y0, (DST)(POS*1)
    ADDQ $32, POS
    CMPQ LEN, POS
    JNE loop32b
    RET

loop_1b:
    MOVB -1(SRC0)(LEN*1), TMP1
    MOVB -1(SRC1)(LEN*1), TMP2
    XORB TMP1, TMP2
    MOVB TMP2, -1(DST)(LEN*1)
    SUBQ $1, LEN
    TESTQ $7, LEN
    JNZ loop_1b
    CMPQ LEN, $0
    JE ret
    TESTQ $31, LEN
    JZ aligned

not_aligned:
    TESTQ $7, LEN
    JNE loop_1b
    MOVQ LEN, TMP1
    ANDQ $31, TMP1

loop_8b:
    MOVQ -8(SRC0)(LEN*1), TMP2
    MOVQ -8(SRC1)(LEN*1), TMP3
    XORQ TMP2, TMP3
    MOVQ TMP3, -8(DST)(LEN*1)
    SUBQ $8, LEN
    SUBQ $8, TMP1
    JG loop_8b

    CMPQ LEN, $32
    JGE aligned
    RET

ret:
    RET

// func bytesAVX2small(dst, src0, src1 []byte, size int)
TEXT ·bytesAVX2small(SB), NOSPLIT, $0
    MOVQ len+72(FP), LEN
    CMPQ LEN, $0
    JE ret
    MOVQ dst+0(FP), DST
    MOVQ src0+24(FP), SRC0
    MOVQ src1+48(FP), SRC1
    TESTQ $127, LEN
    JNZ not_aligned

aligned:
    MOVQ $0, POS

loop128b:
    VMOVDQU (SRC0)(POS*1), Y0
    VMOVDQU 32(SRC0)(POS*1), Y1
    VMOVDQU 64(SRC0)(POS*1), Y2
    VMOVDQU 96(SRC0)(POS*1), Y3
    VPXOR (SRC1)(POS*1), Y0, Y0
    VPXOR 32(SRC1)(POS*1), Y1, Y1
    VPXOR 64(SRC1)(POS*1), Y2, Y2
    VPXOR 96(SRC1)(POS*1), Y3, Y3
    VMOVDQU Y0, (DST)(POS*1)
    VMOVDQU Y1, 32(DST)(POS*1)
    VMOVDQU Y2, 64(DST)(POS*1)
    VMOVDQU Y3, 96(DST)(POS*1)

    ADDQ $128, POS
    CMPQ LEN, POS
    JNE loop128b
    RET

loop_1b:
    MOVB -1(SRC0)(LEN*1), TMP1
    MOVB -1(SRC1)(LEN*1), TMP2
    XORB TMP1, TMP2
    MOVB TMP2, -1(DST)(LEN*1)
    SUBQ $1, LEN
    TESTQ $7, LEN
    JNZ loop_1b
    CMPQ LEN, $0
    JE ret
    TESTQ $127, LEN
    JZ aligned

not_aligned:
    TESTQ $7, LEN
    JNE loop_1b
    MOVQ LEN, TMP1
    ANDQ $127, TMP1

loop_8b:
    MOVQ -8(SRC0)(LEN*1), TMP2
    MOVQ -8(SRC1)(LEN*1), TMP3
    XORQ TMP2, TMP3
    MOVQ TMP3, -8(DST)(LEN*1)
    SUBQ $8, LEN
    SUBQ $8, TMP1
    JG loop_8b

    CMPQ LEN, $128
    JGE aligned
    RET

ret:
    RET

// func bytesAVX2big(dst, src0, src1 []byte, size int)
TEXT ·bytesAVX2big(SB), NOSPLIT, $0
    MOVQ len+72(FP), LEN
    CMPQ LEN, $0
    JE ret
    MOVQ dst+0(FP), DST
    MOVQ src0+24(FP), SRC0
    MOVQ src1+48(FP), SRC1
    TESTQ $127, LEN
    JNZ not_aligned

aligned:
    MOVQ $0, POS

loop128b:
    VMOVDQU (SRC0)(POS*1), Y0
    VMOVDQU 32(SRC0)(POS*1), Y1
    VMOVDQU 64(SRC0)(POS*1), Y2
    VMOVDQU 96(SRC0)(POS*1), Y3
    VPXOR (SRC1)(POS*1), Y0, Y0
    VPXOR 32(SRC1)(POS*1), Y1, Y1
    VPXOR 64(SRC1)(POS*1), Y2, Y2
    VPXOR 96(SRC1)(POS*1), Y3, Y3
    LONG $0xe77da1c4; WORD $0x0304
    LONG $0xe77da1c4; WORD $0x034c; BYTE $0x20
    LONG $0xe77da1c4; WORD $0x0354; BYTE $0x40
    LONG $0xe77da1c4; WORD $0x035c; BYTE $0x60

    ADDQ $128, POS
    CMPQ LEN, POS
    JNE loop128b
    SFENCE
    RET

loop_1b:
    MOVB -1(SRC0)(LEN*1), TMP1
    MOVB -1(SRC1)(LEN*1), TMP2
    XORB TMP1, TMP2
    MOVB TMP2, -1(DST)(LEN*1)
    SUBQ $1, LEN
    TESTQ $7, LEN
    JNZ loop_1b
    CMPQ LEN, $0
    JE ret
    TESTQ $127, LEN
    JZ aligned

not_aligned:
    TESTQ $7, LEN
    JNE loop_1b
    MOVQ LEN, TMP1
    ANDQ $127, TMP1

loop_8b:
    MOVQ -8(SRC0)(LEN*1), TMP2
    MOVQ -8(SRC1)(LEN*1), TMP3
    XORQ TMP2, TMP3
    MOVQ TMP3, -8(DST)(LEN*1)
    SUBQ $8, LEN
    SUBQ $8, TMP1
    JG loop_8b

    CMPQ LEN, $128
    JGE aligned
    RET

ret:
    RET

// func matrixAVX2small(dst []byte, src [][]byte)
TEXT ·matrixAVX2small(SB), NOSPLIT, $0
    MOVQ dst+0(FP), DST
    MOVQ src+24(FP), SRC
    MOVQ vec+32(FP), VECT
    MOVQ len+8(FP), LEN
    TESTQ $127, LEN
    JNZ not_aligned

aligned:
    MOVQ $0, POS

loop128b:
    MOVQ VECT, TMP1
    SUBQ $2, TMP1
    MOVQ $0, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    MOVQ TMP3, TMP4
    VMOVDQU (TMP3)(POS*1), Y0
    VMOVDQU 32(TMP4)(POS*1), Y1
    VMOVDQU 64(TMP3)(POS*1), Y2
    VMOVDQU 96(TMP4)(POS*1), Y3

next_vect:
    ADDQ $24, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    MOVQ TMP3, TMP4
    VMOVDQU (TMP3)(POS*1), Y4
    VMOVDQU 32(TMP4)(POS*1), Y5
    VMOVDQU 64(TMP3)(POS*1), Y6
    VMOVDQU 96(TMP4)(POS*1), Y7
    VPXOR Y4, Y0, Y0
    VPXOR Y5, Y1, Y1
    VPXOR Y6, Y2, Y2
    VPXOR Y7, Y3, Y3
    SUBQ $1, TMP1
    JGE next_vect

    VMOVDQU Y0, (DST)(POS*1)
    VMOVDQU Y1, 32(DST)(POS*1)
    VMOVDQU Y2, 64(DST)(POS*1)
    VMOVDQU Y3, 96(DST)(POS*1)

    ADDQ $128, POS
    CMPQ LEN, POS
    JNE loop128b
    RET

loop_1b:
    MOVQ VECT, TMP1
    MOVQ $0, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    SUBQ $2, TMP1
    MOVB -1(TMP3)(LEN*1), TMP5

next_vect_1b:
    ADDQ $24, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    MOVB -1(TMP3)(LEN*1), TMP6
    XORB TMP6, TMP5
    SUBQ $1, TMP1
    JGE next_vect_1b

    MOVB TMP5, -1(DST)(LEN*1)
    SUBQ $1, LEN
    TESTQ $7, LEN
    JNZ loop_1b

    CMPQ LEN, $0
    JE ret
    TESTQ $127, LEN
    JZ aligned

not_aligned:
    TESTQ $7, LEN
    JNE loop_1b
    MOVQ LEN, TMP4
    ANDQ $127, TMP4

loop_8b:
    MOVQ VECT, TMP1
    MOVQ $0, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    SUBQ $2, TMP1
    MOVQ -8(TMP3)(LEN*1), TMP5

next_vect_8b:
    ADDQ $24, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    MOVQ -8(TMP3)(LEN*1), TMP6
    XORQ TMP6, TMP5
    SUBQ $1, TMP1
    JGE next_vect_8b

    MOVQ TMP5, -8(DST)(LEN*1)
    SUBQ $8, LEN
    SUBQ $8, TMP4
    JG loop_8b

    CMPQ LEN, $128
    JGE aligned
    RET

ret:
    RET

// func matrixAVX2big(dst []byte, src [][]byte)
TEXT ·matrixAVX2big(SB), NOSPLIT, $0
    MOVQ dst+0(FP), DST
    MOVQ src+24(FP), SRC
    MOVQ vec+32(FP), VECT
    MOVQ len+8(FP), LEN
    TESTQ $127, LEN
    JNZ not_aligned

aligned:
    MOVQ $0, POS

loop128b:
    MOVQ VECT, TMP1
    SUBQ $2, TMP1
    MOVQ $0, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    MOVQ TMP3, TMP4
    VMOVDQU (TMP3)(POS*1), Y0
    VMOVDQU 32(TMP4)(POS*1), Y1
    VMOVDQU 64(TMP3)(POS*1), Y2
    VMOVDQU 96(TMP4)(POS*1), Y3

next_vect:
    ADDQ $24, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    MOVQ TMP3, TMP4
    VMOVDQU (TMP3)(POS*1), Y4
    VMOVDQU 32(TMP4)(POS*1), Y5
    VMOVDQU 64(TMP3)(POS*1), Y6
    VMOVDQU 96(TMP4)(POS*1), Y7
    VPXOR Y4, Y0, Y0
    VPXOR Y5, Y1, Y1
    VPXOR Y6, Y2, Y2
    VPXOR Y7, Y3, Y3
    SUBQ $1, TMP1
    JGE next_vect

    LONG $0xe77da1c4; WORD $0x0304 // VMOVNTDQ (hand-encoded; go1.8 has the mnemonic)
    LONG $0xe77da1c4; WORD $0x034c; BYTE $0x20
    LONG $0xe77da1c4; WORD $0x0354; BYTE $0x40
    LONG $0xe77da1c4; WORD $0x035c; BYTE $0x60

    ADDQ $128, POS
    CMPQ LEN, POS
    JNE loop128b
    RET

loop_1b:
    MOVQ VECT, TMP1
    MOVQ $0, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    SUBQ $2, TMP1
    MOVB -1(TMP3)(LEN*1), TMP5

next_vect_1b:
    ADDQ $24, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    MOVB -1(TMP3)(LEN*1), TMP6
    XORB TMP6, TMP5
    SUBQ $1, TMP1
    JGE next_vect_1b

    MOVB TMP5, -1(DST)(LEN*1)
    SUBQ $1, LEN
    TESTQ $7, LEN
    JNZ loop_1b

    CMPQ LEN, $0
    JE ret
    TESTQ $127, LEN
    JZ aligned

not_aligned:
    TESTQ $7, LEN
    JNE loop_1b
    MOVQ LEN, TMP4
    ANDQ $127, TMP4

loop_8b:
    MOVQ VECT, TMP1
    MOVQ $0, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    SUBQ $2, TMP1
    MOVQ -8(TMP3)(LEN*1), TMP5

next_vect_8b:
    ADDQ $24, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    MOVQ -8(TMP3)(LEN*1), TMP6
    XORQ TMP6, TMP5
    SUBQ $1, TMP1
    JGE next_vect_8b

    MOVQ TMP5, -8(DST)(LEN*1)
    SUBQ $8, LEN
    SUBQ $8, TMP4
    JG loop_8b

    CMPQ LEN, $128
    JGE aligned
    RET

ret:
    RET

TEXT ·hasAVX2(SB), NOSPLIT, $0
    XORQ AX, AX
    XORQ CX, CX
    ADDL $7, AX
    CPUID
    SHRQ $5, BX
    ANDQ $1, BX
    MOVB BX, ret+0(FP)
    RET
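
A note on the operand offsets in the stubs above: on amd64 a Go slice header is three 8-byte words (pointer, length, capacity), so for bytesAVX2mini(dst, src0, src1 []byte, size int) the arguments sit at +0, +24, +48 and +72 from FP, which is where references like MOVQ len+72(FP), LEN come from. The same layout explains the ADDQ $24, TMP2 stride when walking a [][]byte. A standalone sketch confirming the arithmetic (not part of the vendored code):

package main

import (
    "fmt"
    "unsafe"
)

func main() {
    // A slice header is pointer + len + cap: three 8-byte words on amd64.
    var s []byte
    fmt.Println(unsafe.Sizeof(s)) // 24 on amd64

    // Argument frame offsets for
    //   func bytesAVX2mini(dst, src0, src1 []byte, size int)
    // as read by the stubs: dst+0(FP), src0+24(FP), src1+48(FP), len+72(FP).
    for i, name := range []string{"dst", "src0", "src1", "size"} {
        fmt.Printf("%s at +%d(FP)\n", name, i*24)
    }
}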
116 vendor/github.com/templexxx/xor/nosimd.go (generated, vendored, new file)
@@ -0,0 +1,116 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package xor

import (
    "runtime"
    "unsafe"
)

const wordSize = int(unsafe.Sizeof(uintptr(0)))
const supportsUnaligned = runtime.GOARCH == "386" || runtime.GOARCH == "amd64" || runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" || runtime.GOARCH == "s390x"

// bytesNoSIMD XORs the bytes in a and b. The destination is assumed to have
// enough space.
func bytesNoSIMD(dst, a, b []byte, size int) {
    if supportsUnaligned {
        fastXORBytes(dst, a, b, size)
    } else {
        // TODO(hanwen): if (dst, a, b) have common alignment
        // we could still try fastXORBytes. It is not clear
        // how often this happens, and it's only worth it if
        // the block encryption itself is hardware
        // accelerated.
        safeXORBytes(dst, a, b, size)
    }
}

// unitSize is the unit the slices are split into, to stay cache-friendly.
const unitSize = 16 * 1024

func matrixNoSIMD(dst []byte, src [][]byte) {
    size := len(src[0])
    start := 0
    do := unitSize
    for start < size {
        end := start + do
        if end <= size {
            partNoSIMD(start, end, dst, src)
            start = start + do
        } else {
            partNoSIMD(start, size, dst, src)
            start = size
        }
    }
}

// partNoSIMD works on a split of the vectors; splitting improves performance
// on big data by reducing cache pollution.
func partNoSIMD(start, end int, dst []byte, src [][]byte) {
    bytesNoSIMD(dst[start:end], src[0][start:end], src[1][start:end], end-start)
    for i := 2; i < len(src); i++ {
        bytesNoSIMD(dst[start:end], dst[start:end], src[i][start:end], end-start)
    }
}

// fastXORBytes XORs in bulk. It only works on architectures that
// support unaligned read/writes.
func fastXORBytes(dst, a, b []byte, n int) {
    w := n / wordSize
    if w > 0 {
        wordBytes := w * wordSize
        fastXORWords(dst[:wordBytes], a[:wordBytes], b[:wordBytes])
    }
    for i := n - n%wordSize; i < n; i++ {
        dst[i] = a[i] ^ b[i]
    }
}

func safeXORBytes(dst, a, b []byte, n int) {
    ex := n % 8
    for i := 0; i < ex; i++ {
        dst[i] = a[i] ^ b[i]
    }

    for i := ex; i < n; i += 8 {
        _dst := dst[i : i+8]
        _a := a[i : i+8]
        _b := b[i : i+8]
        _dst[0] = _a[0] ^ _b[0]
        _dst[1] = _a[1] ^ _b[1]
        _dst[2] = _a[2] ^ _b[2]
        _dst[3] = _a[3] ^ _b[3]

        _dst[4] = _a[4] ^ _b[4]
        _dst[5] = _a[5] ^ _b[5]
        _dst[6] = _a[6] ^ _b[6]
        _dst[7] = _a[7] ^ _b[7]
    }
}

// fastXORWords XORs multiples of 4 or 8 bytes (depending on architecture).
// The arguments are assumed to be of equal length.
func fastXORWords(dst, a, b []byte) {
    dw := *(*[]uintptr)(unsafe.Pointer(&dst))
    aw := *(*[]uintptr)(unsafe.Pointer(&a))
    bw := *(*[]uintptr)(unsafe.Pointer(&b))
    n := len(b) / wordSize
    ex := n % 8
    for i := 0; i < ex; i++ {
        dw[i] = aw[i] ^ bw[i]
    }

    for i := ex; i < n; i += 8 {
        _dw := dw[i : i+8]
        _aw := aw[i : i+8]
        _bw := bw[i : i+8]
        _dw[0] = _aw[0] ^ _bw[0]
        _dw[1] = _aw[1] ^ _bw[1]
        _dw[2] = _aw[2] ^ _bw[2]
        _dw[3] = _aw[3] ^ _bw[3]
        _dw[4] = _aw[4] ^ _bw[4]
        _dw[5] = _aw[5] ^ _bw[5]
        _dw[6] = _aw[6] ^ _bw[6]
        _dw[7] = _aw[7] ^ _bw[7]
    }
}
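
fastXORWords above reinterprets the byte slices as []uintptr and XORs whole machine words at a time. The standalone sketch below demonstrates the same trick against the byte-at-a-time reference; like the vendored code it relies on unsafe, and it is only valid where unaligned word access is allowed:

package main

import (
    "bytes"
    "fmt"
    "unsafe"
)

// wordSize mirrors the constant in nosimd.go: one machine word.
const wordSize = int(unsafe.Sizeof(uintptr(0)))

// byteXOR is the obvious byte-at-a-time reference.
func byteXOR(dst, a, b []byte) {
    for i := range dst {
        dst[i] = a[i] ^ b[i]
    }
}

func main() {
    const n = 64 // a multiple of the word size, to keep the sketch short
    a := make([]byte, n)
    b := make([]byte, n)
    for i := 0; i < n; i++ {
        a[i], b[i] = byte(i), byte(3*i+1)
    }

    want := make([]byte, n)
    byteXOR(want, a, b)

    // Word-at-a-time, the same trick fastXORWords uses: reinterpret the
    // byte slices as []uintptr and XOR whole words. The reinterpreted
    // headers keep their byte lengths, so only the first n/wordSize
    // elements may be touched.
    got := make([]byte, n)
    dw := *(*[]uintptr)(unsafe.Pointer(&got))
    aw := *(*[]uintptr)(unsafe.Pointer(&a))
    bw := *(*[]uintptr)(unsafe.Pointer(&b))
    for i := 0; i < n/wordSize; i++ {
        dw[i] = aw[i] ^ bw[i]
    }

    fmt.Println(bytes.Equal(got, want)) // true
}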
574 vendor/github.com/templexxx/xor/sse2_amd64.s (generated, vendored, new file)
@@ -0,0 +1,574 @@
#include "textflag.h"

// addr of mem
#define DST BX
#define SRC SI
#define SRC0 TMP4
#define SRC1 TMP5

// loop args
// num of vect
#define VECT CX
#define LEN DX
// pos of matrix
#define POS R8

// tmp store
// num of vect or ...
#define TMP1 R9
// pos of matrix or ...
#define TMP2 R10
// store addr of data/parity or ...
#define TMP3 R11
#define TMP4 R12
#define TMP5 R13
#define TMP6 R14

// func bytesSrc0(dst, src0, src1 []byte)
TEXT ·xorSrc0(SB), NOSPLIT, $0
    MOVQ len+32(FP), LEN
    CMPQ LEN, $0
    JE ret
    MOVQ dst+0(FP), DST
    MOVQ src0+24(FP), SRC0
    MOVQ src1+48(FP), SRC1
    TESTQ $15, LEN
    JNZ not_aligned

aligned:
    MOVQ $0, POS

loop16b:
    MOVOU (SRC0)(POS*1), X0
    XORPD (SRC1)(POS*1), X0
    MOVOU X0, (DST)(POS*1)
    ADDQ $16, POS
    CMPQ LEN, POS
    JNE loop16b
    RET

loop_1b:
    MOVB -1(SRC0)(LEN*1), TMP1
    MOVB -1(SRC1)(LEN*1), TMP2
    XORB TMP1, TMP2
    MOVB TMP2, -1(DST)(LEN*1)
    SUBQ $1, LEN
    TESTQ $7, LEN
    JNZ loop_1b
    CMPQ LEN, $0
    JE ret
    TESTQ $15, LEN
    JZ aligned

not_aligned:
    TESTQ $7, LEN
    JNE loop_1b
    MOVQ LEN, TMP1
    ANDQ $15, TMP1

loop_8b:
    MOVQ -8(SRC0)(LEN*1), TMP2
    MOVQ -8(SRC1)(LEN*1), TMP3
    XORQ TMP2, TMP3
    MOVQ TMP3, -8(DST)(LEN*1)
    SUBQ $8, LEN
    SUBQ $8, TMP1
    JG loop_8b

    CMPQ LEN, $16
    JGE aligned
    RET

ret:
    RET

// func bytesSrc1(dst, src0, src1 []byte)
TEXT ·xorSrc1(SB), NOSPLIT, $0
    MOVQ len+56(FP), LEN
    CMPQ LEN, $0
    JE ret
    MOVQ dst+0(FP), DST
    MOVQ src0+24(FP), SRC0
    MOVQ src1+48(FP), SRC1
    TESTQ $15, LEN
    JNZ not_aligned

aligned:
    MOVQ $0, POS

loop16b:
    MOVOU (SRC0)(POS*1), X0
    XORPD (SRC1)(POS*1), X0
    MOVOU X0, (DST)(POS*1)
    ADDQ $16, POS
    CMPQ LEN, POS
    JNE loop16b
    RET

loop_1b:
    MOVB -1(SRC0)(LEN*1), TMP1
    MOVB -1(SRC1)(LEN*1), TMP2
    XORB TMP1, TMP2
    MOVB TMP2, -1(DST)(LEN*1)
    SUBQ $1, LEN
    TESTQ $7, LEN
    JNZ loop_1b
    CMPQ LEN, $0
    JE ret
    TESTQ $15, LEN
    JZ aligned

not_aligned:
    TESTQ $7, LEN
    JNE loop_1b
    MOVQ LEN, TMP1
    ANDQ $15, TMP1

loop_8b:
    MOVQ -8(SRC0)(LEN*1), TMP2
    MOVQ -8(SRC1)(LEN*1), TMP3
    XORQ TMP2, TMP3
    MOVQ TMP3, -8(DST)(LEN*1)
    SUBQ $8, LEN
    SUBQ $8, TMP1
    JG loop_8b

    CMPQ LEN, $16
    JGE aligned
    RET

ret:
    RET

// func bytesSSE2mini(dst, src0, src1 []byte, size int)
TEXT ·bytesSSE2mini(SB), NOSPLIT, $0
    MOVQ len+72(FP), LEN
    CMPQ LEN, $0
    JE ret
    MOVQ dst+0(FP), DST
    MOVQ src0+24(FP), SRC0
    MOVQ src1+48(FP), SRC1
    TESTQ $15, LEN
    JNZ not_aligned

aligned:
    MOVQ $0, POS

loop16b:
    MOVOU (SRC0)(POS*1), X0
    XORPD (SRC1)(POS*1), X0

    // MOVOU (SRC1)(POS*1), X4
    // PXOR X4, X0
    MOVOU X0, (DST)(POS*1)
    ADDQ $16, POS
    CMPQ LEN, POS
    JNE loop16b
    RET

loop_1b:
    MOVB -1(SRC0)(LEN*1), TMP1
    MOVB -1(SRC1)(LEN*1), TMP2
    XORB TMP1, TMP2
    MOVB TMP2, -1(DST)(LEN*1)
    SUBQ $1, LEN
    TESTQ $7, LEN
    JNZ loop_1b
    CMPQ LEN, $0
    JE ret
    TESTQ $15, LEN
    JZ aligned

not_aligned:
    TESTQ $7, LEN
    JNE loop_1b
    MOVQ LEN, TMP1
    ANDQ $15, TMP1

loop_8b:
    MOVQ -8(SRC0)(LEN*1), TMP2
    MOVQ -8(SRC1)(LEN*1), TMP3
    XORQ TMP2, TMP3
    MOVQ TMP3, -8(DST)(LEN*1)
    SUBQ $8, LEN
    SUBQ $8, TMP1
    JG loop_8b

    CMPQ LEN, $16
    JGE aligned
    RET

ret:
    RET

// func bytesSSE2small(dst, src0, src1 []byte, size int)
TEXT ·bytesSSE2small(SB), NOSPLIT, $0
    MOVQ len+72(FP), LEN
    CMPQ LEN, $0
    JE ret
    MOVQ dst+0(FP), DST
    MOVQ src0+24(FP), SRC0
    MOVQ src1+48(FP), SRC1
    TESTQ $63, LEN
    JNZ not_aligned

aligned:
    MOVQ $0, POS

loop64b:
    MOVOU (SRC0)(POS*1), X0
    MOVOU 16(SRC0)(POS*1), X1
    MOVOU 32(SRC0)(POS*1), X2
    MOVOU 48(SRC0)(POS*1), X3

    MOVOU (SRC1)(POS*1), X4
    MOVOU 16(SRC1)(POS*1), X5
    MOVOU 32(SRC1)(POS*1), X6
    MOVOU 48(SRC1)(POS*1), X7

    PXOR X4, X0
    PXOR X5, X1
    PXOR X6, X2
    PXOR X7, X3

    MOVOU X0, (DST)(POS*1)
    MOVOU X1, 16(DST)(POS*1)
    MOVOU X2, 32(DST)(POS*1)
    MOVOU X3, 48(DST)(POS*1)

    ADDQ $64, POS
    CMPQ LEN, POS
    JNE loop64b
    RET

loop_1b:
    MOVB -1(SRC0)(LEN*1), TMP1
    MOVB -1(SRC1)(LEN*1), TMP2
    XORB TMP1, TMP2
    MOVB TMP2, -1(DST)(LEN*1)
    SUBQ $1, LEN
    TESTQ $7, LEN
    JNZ loop_1b
    CMPQ LEN, $0
    JE ret
    TESTQ $63, LEN
    JZ aligned

not_aligned:
    TESTQ $7, LEN
    JNE loop_1b
    MOVQ LEN, TMP1
    ANDQ $63, TMP1

loop_8b:
    MOVQ -8(SRC0)(LEN*1), TMP2
    MOVQ -8(SRC1)(LEN*1), TMP3
    XORQ TMP2, TMP3
    MOVQ TMP3, -8(DST)(LEN*1)
    SUBQ $8, LEN
    SUBQ $8, TMP1
    JG loop_8b

    CMPQ LEN, $64
    JGE aligned
    RET

ret:
    RET

// func bytesSSE2big(dst, src0, src1 []byte, size int)
TEXT ·bytesSSE2big(SB), NOSPLIT, $0
    MOVQ len+72(FP), LEN
    CMPQ LEN, $0
    JE ret
    MOVQ dst+0(FP), DST
    MOVQ src0+24(FP), SRC0
    MOVQ src1+48(FP), SRC1
    TESTQ $63, LEN
    JNZ not_aligned

aligned:
    MOVQ $0, POS

loop64b:
    MOVOU (SRC0)(POS*1), X0
    MOVOU 16(SRC0)(POS*1), X1
    MOVOU 32(SRC0)(POS*1), X2
    MOVOU 48(SRC0)(POS*1), X3

    MOVOU (SRC1)(POS*1), X4
    MOVOU 16(SRC1)(POS*1), X5
    MOVOU 32(SRC1)(POS*1), X6
    MOVOU 48(SRC1)(POS*1), X7

    PXOR X4, X0
    PXOR X5, X1
    PXOR X6, X2
    PXOR X7, X3

    LONG $0xe70f4266; WORD $0x0304 // MOVNTDQ
    LONG $0xe70f4266; WORD $0x034c; BYTE $0x10
    LONG $0xe70f4266; WORD $0x0354; BYTE $0x20
    LONG $0xe70f4266; WORD $0x035c; BYTE $0x30

    ADDQ $64, POS
    CMPQ LEN, POS
    JNE loop64b
    RET

loop_1b:
    MOVB -1(SRC0)(LEN*1), TMP1
    MOVB -1(SRC1)(LEN*1), TMP2
    XORB TMP1, TMP2
    MOVB TMP2, -1(DST)(LEN*1)
    SUBQ $1, LEN
    TESTQ $7, LEN
    JNZ loop_1b
    CMPQ LEN, $0
    JE ret
    TESTQ $63, LEN
    JZ aligned

not_aligned:
    TESTQ $7, LEN
    JNE loop_1b
    MOVQ LEN, TMP1
    ANDQ $63, TMP1

loop_8b:
    MOVQ -8(SRC0)(LEN*1), TMP2
    MOVQ -8(SRC1)(LEN*1), TMP3
    XORQ TMP2, TMP3
    MOVQ TMP3, -8(DST)(LEN*1)
    SUBQ $8, LEN
    SUBQ $8, TMP1
    JG loop_8b

    CMPQ LEN, $64
    JGE aligned
    RET

ret:
    RET

// func matrixSSE2small(dst []byte, src [][]byte)
TEXT ·matrixSSE2small(SB), NOSPLIT, $0
    MOVQ dst+0(FP), DST
    MOVQ src+24(FP), SRC
    MOVQ vec+32(FP), VECT
    MOVQ len+8(FP), LEN
    TESTQ $63, LEN
    JNZ not_aligned

aligned:
    MOVQ $0, POS

loop64b:
    MOVQ VECT, TMP1
    SUBQ $2, TMP1
    MOVQ $0, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    MOVQ TMP3, TMP4
    MOVOU (TMP3)(POS*1), X0
    MOVOU 16(TMP4)(POS*1), X1
    MOVOU 32(TMP3)(POS*1), X2
    MOVOU 48(TMP4)(POS*1), X3

next_vect:
    ADDQ $24, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    MOVQ TMP3, TMP4
    MOVOU (TMP3)(POS*1), X4
    MOVOU 16(TMP4)(POS*1), X5
    MOVOU 32(TMP3)(POS*1), X6
    MOVOU 48(TMP4)(POS*1), X7
    PXOR X4, X0
    PXOR X5, X1
    PXOR X6, X2
    PXOR X7, X3
    SUBQ $1, TMP1
    JGE next_vect

    MOVOU X0, (DST)(POS*1)
    MOVOU X1, 16(DST)(POS*1)
    MOVOU X2, 32(DST)(POS*1)
    MOVOU X3, 48(DST)(POS*1)

    ADDQ $64, POS
    CMPQ LEN, POS
    JNE loop64b
    RET

loop_1b:
    MOVQ VECT, TMP1
    MOVQ $0, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    SUBQ $2, TMP1
    MOVB -1(TMP3)(LEN*1), TMP5

next_vect_1b:
    ADDQ $24, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    MOVB -1(TMP3)(LEN*1), TMP6
    XORB TMP6, TMP5
    SUBQ $1, TMP1
    JGE next_vect_1b

    MOVB TMP5, -1(DST)(LEN*1)
    SUBQ $1, LEN
    TESTQ $7, LEN
    JNZ loop_1b

    CMPQ LEN, $0
    JE ret
    TESTQ $63, LEN
    JZ aligned

not_aligned:
    TESTQ $7, LEN
    JNE loop_1b
    MOVQ LEN, TMP4
    ANDQ $63, TMP4

loop_8b:
    MOVQ VECT, TMP1
    MOVQ $0, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    SUBQ $2, TMP1
    MOVQ -8(TMP3)(LEN*1), TMP5

next_vect_8b:
    ADDQ $24, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    MOVQ -8(TMP3)(LEN*1), TMP6
    XORQ TMP6, TMP5
    SUBQ $1, TMP1
    JGE next_vect_8b

    MOVQ TMP5, -8(DST)(LEN*1)
    SUBQ $8, LEN
    SUBQ $8, TMP4
    JG loop_8b

    CMPQ LEN, $64
    JGE aligned
    RET

ret:
    RET

// func matrixSSE2big(dst []byte, src [][]byte)
TEXT ·matrixSSE2big(SB), NOSPLIT, $0
    MOVQ dst+0(FP), DST
    MOVQ src+24(FP), SRC
    MOVQ vec+32(FP), VECT
    MOVQ len+8(FP), LEN
    TESTQ $63, LEN
    JNZ not_aligned

aligned:
    MOVQ $0, POS

loop64b:
    MOVQ VECT, TMP1
    SUBQ $2, TMP1
    MOVQ $0, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    MOVQ TMP3, TMP4
    MOVOU (TMP3)(POS*1), X0
    MOVOU 16(TMP4)(POS*1), X1
    MOVOU 32(TMP3)(POS*1), X2
    MOVOU 48(TMP4)(POS*1), X3

next_vect:
    ADDQ $24, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    MOVQ TMP3, TMP4
    MOVOU (TMP3)(POS*1), X4
    MOVOU 16(TMP4)(POS*1), X5
    MOVOU 32(TMP3)(POS*1), X6
    MOVOU 48(TMP4)(POS*1), X7
    PXOR X4, X0
    PXOR X5, X1
    PXOR X6, X2
    PXOR X7, X3
    SUBQ $1, TMP1
    JGE next_vect

    LONG $0xe70f4266; WORD $0x0304
    LONG $0xe70f4266; WORD $0x034c; BYTE $0x10
    LONG $0xe70f4266; WORD $0x0354; BYTE $0x20
    LONG $0xe70f4266; WORD $0x035c; BYTE $0x30

    ADDQ $64, POS
    CMPQ LEN, POS
    JNE loop64b
    RET

loop_1b:
    MOVQ VECT, TMP1
    MOVQ $0, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    SUBQ $2, TMP1
    MOVB -1(TMP3)(LEN*1), TMP5

next_vect_1b:
    ADDQ $24, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    MOVB -1(TMP3)(LEN*1), TMP6
    XORB TMP6, TMP5
    SUBQ $1, TMP1
    JGE next_vect_1b

    MOVB TMP5, -1(DST)(LEN*1)
    SUBQ $1, LEN
    TESTQ $7, LEN
    JNZ loop_1b

    CMPQ LEN, $0
    JE ret
    TESTQ $63, LEN
    JZ aligned

not_aligned:
    TESTQ $7, LEN
    JNE loop_1b
    MOVQ LEN, TMP4
    ANDQ $63, TMP4

loop_8b:
    MOVQ VECT, TMP1
    MOVQ $0, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    SUBQ $2, TMP1
    MOVQ -8(TMP3)(LEN*1), TMP5

next_vect_8b:
    ADDQ $24, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    MOVQ -8(TMP3)(LEN*1), TMP6
    XORQ TMP6, TMP5
    SUBQ $1, TMP1
    JGE next_vect_8b

    MOVQ TMP5, -8(DST)(LEN*1)
    SUBQ $8, LEN
    SUBQ $8, TMP4
    JG loop_8b

    CMPQ LEN, $64
    JGE aligned
    RET

ret:
    RET

TEXT ·hasSSE2(SB), NOSPLIT, $0
    XORQ AX, AX
    INCL AX
    CPUID
    SHRQ $26, DX
    ANDQ $1, DX
    MOVB DX, ret+0(FP)
    RET
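
The hand-rolled hasSSE2/hasAVX2 probes (here and at the end of avx2_amd64.s) query CPUID leaf 1, EDX bit 26 for SSE2 and leaf 7, EBX bit 5 for AVX2. Outside vendored code the same checks are usually done with golang.org/x/sys/cpu; a minimal sketch, assuming that package is available:

package main

import (
    "fmt"

    "golang.org/x/sys/cpu"
)

func main() {
    // Equivalent to the assembly probes: CPUID leaf 1 EDX bit 26 (SSE2)
    // and CPUID leaf 7 EBX bit 5 (AVX2).
    fmt.Println("SSE2:", cpu.X86.HasSSE2)
    fmt.Println("AVX2:", cpu.X86.HasAVX2)
}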
49 vendor/github.com/templexxx/xor/xor.go (generated, vendored, new file)
@@ -0,0 +1,49 @@
package xor

// SIMD extensions
const (
    none = iota
    avx2
    // SSE2 was first introduced by Intel with the initial version of the
    // Pentium 4 in 2001, so we can assume all amd64 CPUs have it.
    sse2
)

var extension = none

// Bytes chooses the shortest slice length as the xor size.
// It is better to use it for big data (> 64 bytes).
func Bytes(dst, src0, src1 []byte) {
    size := len(dst)
    if size > len(src0) {
        size = len(src0)
    }
    if size > len(src1) {
        size = len(src1)
    }
    xorBytes(dst, src0, src1, size)
}

// BytesSameLen requires all slices' lengths to be equal.
// It cuts the size branch, saving time for small data.
func BytesSameLen(dst, src0, src1 []byte) {
    xorSrc1(dst, src0, src1)
}

// BytesSrc0 requires len(src1) >= len(src0) and len(dst) >= len(src0);
// it XORs len(src0) bytes.
func BytesSrc0(dst, src0, src1 []byte) {
    xorSrc0(dst, src0, src1)
}

// BytesSrc1 requires len(src0) >= len(src1) and len(dst) >= len(src1);
// it XORs len(src1) bytes.
func BytesSrc1(dst, src0, src1 []byte) {
    xorSrc1(dst, src0, src1)
}

// Matrix requires all slices' lengths to be equal and non-zero,
// and len(src) must be >= 2.
func Matrix(dst []byte, src [][]byte) {
    xorMatrix(dst, src)
}
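
For reference, the public API in use: Bytes XORs two sources into dst up to the shortest length, and Matrix XORs any number of equally sized sources into dst, which is the parity computation kcp's FEC layer relies on. A minimal example, using the import path as vendored here:

package main

import (
    "fmt"

    "github.com/templexxx/xor"
)

func main() {
    a := []byte{0x0f, 0xf0, 0xaa}
    b := []byte{0xff, 0x0f, 0xaa}

    // Bytes XORs up to the shortest of the three lengths.
    dst := make([]byte, 3)
    xor.Bytes(dst, a, b)
    fmt.Printf("%x\n", dst) // f0ff00

    // Matrix XORs all sources into dst: dst = src[0] ^ src[1] ^ src[2].
    parity := make([]byte, 3)
    xor.Matrix(parity, [][]byte{a, b, {0x01, 0x02, 0x03}})
    fmt.Printf("%x\n", parity) // f1fd03
}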
118 vendor/github.com/templexxx/xor/xor_amd64.go (generated, vendored, new file)
@@ -0,0 +1,118 @@
package xor

func init() {
    getEXT()
}

func getEXT() {
    if hasAVX2() {
        extension = avx2
    } else {
        extension = sse2
    }
}

func xorBytes(dst, src0, src1 []byte, size int) {
    switch extension {
    case avx2:
        bytesAVX2(dst, src0, src1, size)
    default:
        bytesSSE2(dst, src0, src1, size)
    }
}

// nontmp is the size above which the variants using non-temporal
// (streaming) store hints are picked.
const nontmp = 8 * 1024
const avx2loopsize = 128

func bytesAVX2(dst, src0, src1 []byte, size int) {
    if size < avx2loopsize {
        bytesAVX2mini(dst, src0, src1, size)
    } else if size >= avx2loopsize && size <= nontmp {
        bytesAVX2small(dst, src0, src1, size)
    } else {
        bytesAVX2big(dst, src0, src1, size)
    }
}

const sse2loopsize = 64

func bytesSSE2(dst, src0, src1 []byte, size int) {
    if size < sse2loopsize {
        bytesSSE2mini(dst, src0, src1, size)
    } else if size >= sse2loopsize && size <= nontmp {
        bytesSSE2small(dst, src0, src1, size)
    } else {
        bytesSSE2big(dst, src0, src1, size)
    }
}

func xorMatrix(dst []byte, src [][]byte) {
    switch extension {
    case avx2:
        matrixAVX2(dst, src)
    default:
        matrixSSE2(dst, src)
    }
}

func matrixAVX2(dst []byte, src [][]byte) {
    size := len(dst)
    if size > nontmp {
        matrixAVX2big(dst, src)
    } else {
        matrixAVX2small(dst, src)
    }
}

func matrixSSE2(dst []byte, src [][]byte) {
    size := len(dst)
    if size > nontmp {
        matrixSSE2big(dst, src)
    } else {
        matrixSSE2small(dst, src)
    }
}

//go:noescape
func xorSrc0(dst, src0, src1 []byte)

//go:noescape
func xorSrc1(dst, src0, src1 []byte)

//go:noescape
func bytesAVX2mini(dst, src0, src1 []byte, size int)

//go:noescape
func bytesAVX2big(dst, src0, src1 []byte, size int)

//go:noescape
func bytesAVX2small(dst, src0, src1 []byte, size int)

//go:noescape
func bytesSSE2mini(dst, src0, src1 []byte, size int)

//go:noescape
func bytesSSE2small(dst, src0, src1 []byte, size int)

//go:noescape
func bytesSSE2big(dst, src0, src1 []byte, size int)

//go:noescape
func matrixAVX2small(dst []byte, src [][]byte)

//go:noescape
func matrixAVX2big(dst []byte, src [][]byte)

//go:noescape
func matrixSSE2small(dst []byte, src [][]byte)

//go:noescape
func matrixSSE2big(dst []byte, src [][]byte)

//go:noescape
func hasAVX2() bool

//go:noescape
func hasSSE2() bool
19 vendor/github.com/templexxx/xor/xor_other.go (generated, vendored, new file)
@@ -0,0 +1,19 @@
// +build !amd64 noasm

package xor

func xorBytes(dst, src0, src1 []byte, size int) {
    bytesNoSIMD(dst, src0, src1, size)
}

func xorMatrix(dst []byte, src [][]byte) {
    matrixNoSIMD(dst, src)
}

func xorSrc0(dst, src0, src1 []byte) {
    bytesNoSIMD(dst, src0, src1, len(src0))
}

func xorSrc1(dst, src0, src1 []byte) {
    bytesNoSIMD(dst, src0, src1, len(src1))
}