diff --git a/_gen/gen.go b/_gen/gen.go index 4755c72a..36709e8e 100644 --- a/_gen/gen.go +++ b/_gen/gen.go @@ -2,7 +2,8 @@ // +build generate //go:generate go run gen.go -out ../galois_gen_amd64.s -stubs ../galois_gen_amd64.go -pkg=reedsolomon -//go:generate gofmt -w ../galois_gen_switch_amd64.go +//go:generate go fmt ../galois_gen_switch_amd64.go +//go:generate go fmt ../galois_gen_amd64.go package main @@ -36,14 +37,15 @@ func main() { Constraint(buildtags.Not("nogen").ToConstraint()) Constraint(buildtags.Term("gc").ToConstraint()) - const perLoopBits = 5 + const perLoopBits = 6 const perLoop = 1 << perLoopBits for i := 1; i <= inputMax; i++ { for j := 1; j <= outputMax; j++ { - //genMulAvx2(fmt.Sprintf("mulAvxTwoXor_%dx%d", i, j), i, j, true) genMulAvx2(fmt.Sprintf("mulAvxTwo_%dx%d", i, j), i, j, false) genMulAvx2Sixty64(fmt.Sprintf("mulAvxTwo_%dx%d_64", i, j), i, j, false) + genMulAvx2(fmt.Sprintf("mulAvxTwo_%dx%dXor", i, j), i, j, true) + genMulAvx2Sixty64(fmt.Sprintf("mulAvxTwo_%dx%d_64Xor", i, j), i, j, true) } } f, err := os.Create("../galois_gen_switch_amd64.go") @@ -62,19 +64,26 @@ func main() { package reedsolomon -import "fmt" +import ( + "fmt" +) `) - w.WriteString("const avx2CodeGen = true\n") - w.WriteString(fmt.Sprintf("const maxAvx2Inputs = %d\nconst maxAvx2Outputs = %d\n", inputMax, outputMax)) + w.WriteString(fmt.Sprintf(`const ( +avx2CodeGen = true +maxAvx2Inputs = %d +maxAvx2Outputs = %d +minAvx2Size = %d +avxSizeMask = maxInt - (minAvx2Size-1) +)`, inputMax, outputMax, perLoop)) w.WriteString(` func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { - n := stop-start + n := (stop-start) & avxSizeMask + `) - w.WriteString(fmt.Sprintf("n = (n>>%d)<<%d\n\n", perLoopBits, perLoopBits)) w.WriteString(`switch len(in) { `) for in, defs := range switchDefs[:] { @@ -88,6 +97,25 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { w.WriteString(`} panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) } + +func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int { + n := (stop-start) & avxSizeMask + +`) + + w.WriteString(`switch len(in) { +`) + for in, defs := range switchDefsX[:] { + w.WriteString(fmt.Sprintf(" case %d:\n switch len(out) {\n", in+1)) + for out, def := range defs[:] { + w.WriteString(fmt.Sprintf(" case %d:\n", out+1)) + w.WriteString(def) + } + w.WriteString("}\n") + } + w.WriteString(`} + panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) +} `) Generate() } @@ -129,12 +157,21 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) { } } + x := "" + if xor { + x = "Xor" + } + TEXT(name, attr.NOSPLIT, fmt.Sprintf("func(matrix []byte, in [][]byte, out [][]byte, start, n int)")) // SWITCH DEFINITION: - s := fmt.Sprintf(" mulAvxTwo_%dx%d(matrix, in, out, start, n)\n", inputs, outputs) + s := fmt.Sprintf(" mulAvxTwo_%dx%d%s(matrix, in, out, start, n)\n", inputs, outputs, x) s += fmt.Sprintf("\t\t\t\treturn n\n") - switchDefs[inputs-1][outputs-1] = s + if xor { + switchDefsX[inputs-1][outputs-1] = s + } else { + switchDefs[inputs-1][outputs-1] = s + } if loadNone { Comment("Loading no tables to registers") @@ -197,7 +234,6 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) { if err != nil { panic(err) } - outBase := addr.Addr outSlicePtr := GP64() MOVQ(addr.Addr, outSlicePtr) for i := range dst { @@ -241,13 +277,13 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) { SHRQ(U8(perLoopBits), length) } Label(name + "_loop") - if xor { + + // Load data before loop or during first iteration? + // No clear winner. + preloadInput := xor && false + if preloadInput { Commentf("Load %d outputs", outputs) - } else { - Commentf("Clear %d outputs", outputs) - } - for i := range dst { - if xor { + for i := range dst { if regDst { VMOVDQU(Mem{Base: dstPtr[i]}, dst[i]) if prefetchDst > 0 { @@ -256,13 +292,11 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) { continue } ptr := GP64() - MOVQ(outBase, ptr) + MOVQ(Mem{Base: outSlicePtr, Disp: i * 24}, ptr) VMOVDQU(Mem{Base: ptr, Index: offset, Scale: 1}, dst[i]) if prefetchDst > 0 { PREFETCHT0(Mem{Base: ptr, Disp: prefetchDst, Index: offset, Scale: 1}) } - } else { - VPXOR(dst[i], dst[i], dst[i]) } } @@ -279,6 +313,22 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) { VPAND(lowMask, inLow, inLow) VPAND(lowMask, inHigh, inHigh) for j := range dst { + //Commentf(" xor:%v i: %v", xor, i) + if !preloadInput && xor && i == 0 { + if regDst { + VMOVDQU(Mem{Base: dstPtr[j]}, dst[j]) + if prefetchDst > 0 { + PREFETCHT0(Mem{Base: dstPtr[j], Disp: prefetchDst}) + } + } else { + ptr := GP64() + MOVQ(Mem{Base: outSlicePtr, Disp: j * 24}, ptr) + VMOVDQU(Mem{Base: ptr, Index: offset, Scale: 1}, dst[j]) + if prefetchDst > 0 { + PREFETCHT0(Mem{Base: ptr, Disp: prefetchDst, Index: offset, Scale: 1}) + } + } + } if loadNone { VMOVDQU(Mem{Base: matrixBase, Disp: 64 * (i*outputs + j)}, lookLow) VMOVDQU(Mem{Base: matrixBase, Disp: 32 + 64*(i*outputs+j)}, lookHigh) @@ -288,8 +338,13 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) { VPSHUFB(inLow, inLo[i*outputs+j], lookLow) VPSHUFB(inHigh, inHi[i*outputs+j], lookHigh) } - VPXOR(lookLow, lookHigh, lookLow) - VPXOR(lookLow, dst[j], dst[j]) + if i == 0 && !xor { + // We don't have any existing data, write directly. + VPXOR(lookLow, lookHigh, dst[j]) + } else { + VPXOR(lookLow, lookHigh, lookLow) + VPXOR(lookLow, dst[j], dst[j]) + } } } Commentf("Store %d outputs", outputs) @@ -340,35 +395,42 @@ func genMulAvx2Sixty64(name string, inputs int, outputs int, xor bool) { // Load shuffle masks on every use. var loadNone bool // Use registers for destination registers. - var regDst = false + var regDst = true var reloadLength = false // lo, hi, 1 in, 1 out, 2 tmp, 1 mask - est := total*2 + outputs + 5 + est := total*4 + outputs + 7 if outputs == 1 { // We don't need to keep a copy of the input if only 1 output. est -= 2 } - if true || est > 16 { + if est > 16 { loadNone = true // We run out of GP registers first, now. if inputs+outputs > 13 { regDst = false } // Save one register by reloading length. - if true || inputs+outputs > 12 && regDst { + if inputs+outputs > 12 && regDst { reloadLength = true } } TEXT(name, 0, fmt.Sprintf("func(matrix []byte, in [][]byte, out [][]byte, start, n int)")) - + x := "" + if xor { + x = "Xor" + } // SWITCH DEFINITION: - s := fmt.Sprintf("n = (n>>%d)<<%d\n", perLoopBits, perLoopBits) - s += fmt.Sprintf(" mulAvxTwo_%dx%d_64(matrix, in, out, start, n)\n", inputs, outputs) + //s := fmt.Sprintf("n = (n>>%d)<<%d\n", perLoopBits, perLoopBits) + s := fmt.Sprintf(" mulAvxTwo_%dx%d_64%s(matrix, in, out, start, n)\n", inputs, outputs, x) s += fmt.Sprintf("\t\t\t\treturn n\n") - switchDefs[inputs-1][outputs-1] = s + if xor { + switchDefsX[inputs-1][outputs-1] = s + } else { + switchDefs[inputs-1][outputs-1] = s + } if loadNone { Comment("Loading no tables to registers") @@ -474,33 +536,31 @@ func genMulAvx2Sixty64(name string, inputs int, outputs int, xor bool) { VPBROADCASTB(lowMask.AsX(), lowMask) if reloadLength { + Commentf("Reload length to save a register") length = Load(Param("n"), GP64()) SHRQ(U8(perLoopBits), length) } Label(name + "_loop") + if xor { Commentf("Load %d outputs", outputs) - } else { - Commentf("Clear %d outputs", outputs) - } - for i := range dst { - if xor { + for i := range dst { if regDst { VMOVDQU(Mem{Base: dstPtr[i]}, dst[i]) + VMOVDQU(Mem{Base: dstPtr[i], Disp: 32}, dst2[i]) if prefetchDst > 0 { PREFETCHT0(Mem{Base: dstPtr[i], Disp: prefetchDst}) } continue } ptr := GP64() - MOVQ(outBase, ptr) + MOVQ(Mem{Base: outSlicePtr, Disp: i * 24}, ptr) VMOVDQU(Mem{Base: ptr, Index: offset, Scale: 1}, dst[i]) + VMOVDQU(Mem{Base: ptr, Index: offset, Scale: 1, Disp: 32}, dst2[i]) + if prefetchDst > 0 { PREFETCHT0(Mem{Base: ptr, Disp: prefetchDst, Index: offset, Scale: 1}) } - } else { - VPXOR(dst[i], dst[i], dst[i]) - VPXOR(dst2[i], dst2[i], dst2[i]) } } @@ -536,10 +596,16 @@ func genMulAvx2Sixty64(name string, inputs int, outputs int, xor bool) { VPSHUFB(inHigh, inHi[i*outputs+j], lookHigh) VPSHUFB(in2High, inHi[i*outputs+j], lookHigh2) } - VPXOR(lookLow, lookHigh, lookLow) - VPXOR(lookLow2, lookHigh2, lookLow2) - VPXOR(lookLow, dst[j], dst[j]) - VPXOR(lookLow2, dst2[j], dst2[j]) + if i == 0 && !xor { + // We don't have any existing data, write directly. + VPXOR(lookLow, lookHigh, dst[j]) + VPXOR(lookLow2, lookHigh2, dst2[j]) + } else { + VPXOR(lookLow, lookHigh, lookLow) + VPXOR(lookLow2, lookHigh2, lookLow2) + VPXOR(lookLow, dst[j], dst[j]) + VPXOR(lookLow2, dst2[j], dst2[j]) + } } } Commentf("Store %d outputs", outputs) diff --git a/galois.go b/galois.go index bc4de4fe..30e9e039 100644 --- a/galois.go +++ b/galois.go @@ -901,7 +901,7 @@ func galExp(a byte, n int) byte { return expTable[logResult] } -func genAvx2Matrix(matrixRows [][]byte, inputs, outputs int, dst []byte) []byte { +func genAvx2Matrix(matrixRows [][]byte, inputs, inIdx, outputs int, dst []byte) []byte { if !avx2CodeGen { panic("codegen not enabled") } @@ -915,7 +915,7 @@ func genAvx2Matrix(matrixRows [][]byte, inputs, outputs int, dst []byte) []byte dst = dst[:wantBytes] } for i, row := range matrixRows[:outputs] { - for j, idx := range row[:inputs] { + for j, idx := range row[inIdx : inIdx+inputs] { dstIdx := (j*outputs + i) * 64 dstPart := dst[dstIdx:] dstPart = dstPart[:64] diff --git a/galoisAvx512_amd64.go b/galoisAvx512_amd64.go index 0f240b7d..79207e60 100644 --- a/galoisAvx512_amd64.go +++ b/galoisAvx512_amd64.go @@ -225,8 +225,9 @@ func galMulAVX512LastInput(inputOffset int, inputEnd int, outputOffset int, outp // Perform the same as codeSomeShards, but taking advantage of // AVX512 parallelism for up to 4x faster execution as compared to AVX2 -func (r *reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) { +func (r *reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, byteCount int) { // Process using no goroutines + outputCount := len(outputs) start, end := 0, r.o.perRound if end > byteCount { end = byteCount @@ -272,7 +273,8 @@ func (r *reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, // Perform the same as codeSomeShards, but taking advantage of // AVX512 parallelism for up to 4x faster execution as compared to AVX2 -func (r *reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) { +func (r *reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte, byteCount int) { + outputCount := len(outputs) var wg sync.WaitGroup do := byteCount / r.o.maxGoroutines if do < r.o.minSplitSize { diff --git a/galoisAvx512_amd64_test.go b/galoisAvx512_amd64_test.go index 18acec85..6792e984 100644 --- a/galoisAvx512_amd64_test.go +++ b/galoisAvx512_amd64_test.go @@ -331,9 +331,9 @@ func testCodeSomeShardsAvx512WithLength(t *testing.T, ds, ps, l int, parallel bo } if parallel { - r.codeSomeShardsAvx512P(r.parity, shards[:r.DataShards], shards[r.DataShards:], r.ParityShards, len(shards[0])) + r.codeSomeShardsAvx512P(r.parity, shards[:r.DataShards], shards[r.DataShards:], len(shards[0])) } else { - r.codeSomeShardsAvx512(r.parity, shards[:r.DataShards], shards[r.DataShards:], r.ParityShards, len(shards[0])) + r.codeSomeShardsAvx512(r.parity, shards[:r.DataShards], shards[r.DataShards:r.DataShards+r.ParityShards], len(shards[0])) } correct, _ := r.Verify(shards) diff --git a/galois_amd64.go b/galois_amd64.go index 03754d2a..d722e314 100644 --- a/galois_amd64.go +++ b/galois_amd64.go @@ -107,6 +107,9 @@ func galMulSliceXor(c byte, in, out []byte, o *options) { in = in[done:] out = out[done:] } + if len(in) == 0 { + return + } out = out[:len(in)] mt := mulTable[c][:256] for i := range in { diff --git a/galois_gen_amd64.go b/galois_gen_amd64.go index 15d45229..817c7ea4 100644 --- a/galois_gen_amd64.go +++ b/galois_gen_amd64.go @@ -15,6 +15,14 @@ func mulAvxTwo_1x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_1x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_1x1Xor takes 1 inputs and produces 1 outputs. +//go:noescape +func mulAvxTwo_1x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_1x1_64Xor takes 1 inputs and produces 1 outputs. +//go:noescape +func mulAvxTwo_1x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x2 takes 1 inputs and produces 2 outputs. // The output is initialized to 0. //go:noescape @@ -25,6 +33,14 @@ func mulAvxTwo_1x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_1x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_1x2Xor takes 1 inputs and produces 2 outputs. +//go:noescape +func mulAvxTwo_1x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_1x2_64Xor takes 1 inputs and produces 2 outputs. +//go:noescape +func mulAvxTwo_1x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x3 takes 1 inputs and produces 3 outputs. // The output is initialized to 0. //go:noescape @@ -35,41 +51,77 @@ func mulAvxTwo_1x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_1x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_1x3Xor takes 1 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_1x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_1x3_64Xor takes 1 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_1x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x4 takes 1 inputs and produces 4 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_1x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_1x4Xor takes 1 inputs and produces 4 outputs. +//go:noescape +func mulAvxTwo_1x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x5 takes 1 inputs and produces 5 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_1x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_1x5Xor takes 1 inputs and produces 5 outputs. +//go:noescape +func mulAvxTwo_1x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x6 takes 1 inputs and produces 6 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_1x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_1x6Xor takes 1 inputs and produces 6 outputs. +//go:noescape +func mulAvxTwo_1x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x7 takes 1 inputs and produces 7 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_1x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_1x7Xor takes 1 inputs and produces 7 outputs. +//go:noescape +func mulAvxTwo_1x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x8 takes 1 inputs and produces 8 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_1x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_1x8Xor takes 1 inputs and produces 8 outputs. +//go:noescape +func mulAvxTwo_1x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x9 takes 1 inputs and produces 9 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_1x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_1x9Xor takes 1 inputs and produces 9 outputs. +//go:noescape +func mulAvxTwo_1x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x10 takes 1 inputs and produces 10 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_1x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_1x10Xor takes 1 inputs and produces 10 outputs. +//go:noescape +func mulAvxTwo_1x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x1 takes 2 inputs and produces 1 outputs. // The output is initialized to 0. //go:noescape @@ -80,6 +132,14 @@ func mulAvxTwo_2x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_2x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_2x1Xor takes 2 inputs and produces 1 outputs. +//go:noescape +func mulAvxTwo_2x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_2x1_64Xor takes 2 inputs and produces 1 outputs. +//go:noescape +func mulAvxTwo_2x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x2 takes 2 inputs and produces 2 outputs. // The output is initialized to 0. //go:noescape @@ -90,6 +150,14 @@ func mulAvxTwo_2x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_2x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_2x2Xor takes 2 inputs and produces 2 outputs. +//go:noescape +func mulAvxTwo_2x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_2x2_64Xor takes 2 inputs and produces 2 outputs. +//go:noescape +func mulAvxTwo_2x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x3 takes 2 inputs and produces 3 outputs. // The output is initialized to 0. //go:noescape @@ -100,41 +168,77 @@ func mulAvxTwo_2x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_2x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_2x3Xor takes 2 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_2x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_2x3_64Xor takes 2 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_2x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x4 takes 2 inputs and produces 4 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_2x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_2x4Xor takes 2 inputs and produces 4 outputs. +//go:noescape +func mulAvxTwo_2x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x5 takes 2 inputs and produces 5 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_2x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_2x5Xor takes 2 inputs and produces 5 outputs. +//go:noescape +func mulAvxTwo_2x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x6 takes 2 inputs and produces 6 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_2x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_2x6Xor takes 2 inputs and produces 6 outputs. +//go:noescape +func mulAvxTwo_2x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x7 takes 2 inputs and produces 7 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_2x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_2x7Xor takes 2 inputs and produces 7 outputs. +//go:noescape +func mulAvxTwo_2x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x8 takes 2 inputs and produces 8 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_2x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_2x8Xor takes 2 inputs and produces 8 outputs. +//go:noescape +func mulAvxTwo_2x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x9 takes 2 inputs and produces 9 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_2x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_2x9Xor takes 2 inputs and produces 9 outputs. +//go:noescape +func mulAvxTwo_2x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x10 takes 2 inputs and produces 10 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_2x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_2x10Xor takes 2 inputs and produces 10 outputs. +//go:noescape +func mulAvxTwo_2x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x1 takes 3 inputs and produces 1 outputs. // The output is initialized to 0. //go:noescape @@ -145,6 +249,14 @@ func mulAvxTwo_3x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_3x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_3x1Xor takes 3 inputs and produces 1 outputs. +//go:noescape +func mulAvxTwo_3x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_3x1_64Xor takes 3 inputs and produces 1 outputs. +//go:noescape +func mulAvxTwo_3x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x2 takes 3 inputs and produces 2 outputs. // The output is initialized to 0. //go:noescape @@ -155,6 +267,14 @@ func mulAvxTwo_3x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_3x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_3x2Xor takes 3 inputs and produces 2 outputs. +//go:noescape +func mulAvxTwo_3x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_3x2_64Xor takes 3 inputs and produces 2 outputs. +//go:noescape +func mulAvxTwo_3x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x3 takes 3 inputs and produces 3 outputs. // The output is initialized to 0. //go:noescape @@ -165,41 +285,77 @@ func mulAvxTwo_3x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_3x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_3x3Xor takes 3 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_3x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_3x3_64Xor takes 3 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_3x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x4 takes 3 inputs and produces 4 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_3x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_3x4Xor takes 3 inputs and produces 4 outputs. +//go:noescape +func mulAvxTwo_3x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x5 takes 3 inputs and produces 5 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_3x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_3x5Xor takes 3 inputs and produces 5 outputs. +//go:noescape +func mulAvxTwo_3x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x6 takes 3 inputs and produces 6 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_3x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_3x6Xor takes 3 inputs and produces 6 outputs. +//go:noescape +func mulAvxTwo_3x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x7 takes 3 inputs and produces 7 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_3x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_3x7Xor takes 3 inputs and produces 7 outputs. +//go:noescape +func mulAvxTwo_3x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x8 takes 3 inputs and produces 8 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_3x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_3x8Xor takes 3 inputs and produces 8 outputs. +//go:noescape +func mulAvxTwo_3x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x9 takes 3 inputs and produces 9 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_3x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_3x9Xor takes 3 inputs and produces 9 outputs. +//go:noescape +func mulAvxTwo_3x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x10 takes 3 inputs and produces 10 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_3x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_3x10Xor takes 3 inputs and produces 10 outputs. +//go:noescape +func mulAvxTwo_3x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x1 takes 4 inputs and produces 1 outputs. // The output is initialized to 0. //go:noescape @@ -210,6 +366,14 @@ func mulAvxTwo_4x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_4x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_4x1Xor takes 4 inputs and produces 1 outputs. +//go:noescape +func mulAvxTwo_4x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_4x1_64Xor takes 4 inputs and produces 1 outputs. +//go:noescape +func mulAvxTwo_4x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x2 takes 4 inputs and produces 2 outputs. // The output is initialized to 0. //go:noescape @@ -220,6 +384,14 @@ func mulAvxTwo_4x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_4x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_4x2Xor takes 4 inputs and produces 2 outputs. +//go:noescape +func mulAvxTwo_4x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_4x2_64Xor takes 4 inputs and produces 2 outputs. +//go:noescape +func mulAvxTwo_4x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x3 takes 4 inputs and produces 3 outputs. // The output is initialized to 0. //go:noescape @@ -230,41 +402,77 @@ func mulAvxTwo_4x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_4x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_4x3Xor takes 4 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_4x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_4x3_64Xor takes 4 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_4x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x4 takes 4 inputs and produces 4 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_4x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_4x4Xor takes 4 inputs and produces 4 outputs. +//go:noescape +func mulAvxTwo_4x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x5 takes 4 inputs and produces 5 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_4x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_4x5Xor takes 4 inputs and produces 5 outputs. +//go:noescape +func mulAvxTwo_4x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x6 takes 4 inputs and produces 6 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_4x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_4x6Xor takes 4 inputs and produces 6 outputs. +//go:noescape +func mulAvxTwo_4x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x7 takes 4 inputs and produces 7 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_4x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_4x7Xor takes 4 inputs and produces 7 outputs. +//go:noescape +func mulAvxTwo_4x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x8 takes 4 inputs and produces 8 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_4x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_4x8Xor takes 4 inputs and produces 8 outputs. +//go:noescape +func mulAvxTwo_4x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x9 takes 4 inputs and produces 9 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_4x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_4x9Xor takes 4 inputs and produces 9 outputs. +//go:noescape +func mulAvxTwo_4x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x10 takes 4 inputs and produces 10 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_4x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_4x10Xor takes 4 inputs and produces 10 outputs. +//go:noescape +func mulAvxTwo_4x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x1 takes 5 inputs and produces 1 outputs. // The output is initialized to 0. //go:noescape @@ -275,6 +483,14 @@ func mulAvxTwo_5x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_5x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_5x1Xor takes 5 inputs and produces 1 outputs. +//go:noescape +func mulAvxTwo_5x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_5x1_64Xor takes 5 inputs and produces 1 outputs. +//go:noescape +func mulAvxTwo_5x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x2 takes 5 inputs and produces 2 outputs. // The output is initialized to 0. //go:noescape @@ -285,6 +501,14 @@ func mulAvxTwo_5x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_5x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_5x2Xor takes 5 inputs and produces 2 outputs. +//go:noescape +func mulAvxTwo_5x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_5x2_64Xor takes 5 inputs and produces 2 outputs. +//go:noescape +func mulAvxTwo_5x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x3 takes 5 inputs and produces 3 outputs. // The output is initialized to 0. //go:noescape @@ -295,41 +519,77 @@ func mulAvxTwo_5x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_5x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_5x3Xor takes 5 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_5x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_5x3_64Xor takes 5 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_5x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x4 takes 5 inputs and produces 4 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_5x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_5x4Xor takes 5 inputs and produces 4 outputs. +//go:noescape +func mulAvxTwo_5x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x5 takes 5 inputs and produces 5 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_5x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_5x5Xor takes 5 inputs and produces 5 outputs. +//go:noescape +func mulAvxTwo_5x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x6 takes 5 inputs and produces 6 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_5x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_5x6Xor takes 5 inputs and produces 6 outputs. +//go:noescape +func mulAvxTwo_5x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x7 takes 5 inputs and produces 7 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_5x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_5x7Xor takes 5 inputs and produces 7 outputs. +//go:noescape +func mulAvxTwo_5x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x8 takes 5 inputs and produces 8 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_5x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_5x8Xor takes 5 inputs and produces 8 outputs. +//go:noescape +func mulAvxTwo_5x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x9 takes 5 inputs and produces 9 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_5x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_5x9Xor takes 5 inputs and produces 9 outputs. +//go:noescape +func mulAvxTwo_5x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x10 takes 5 inputs and produces 10 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_5x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_5x10Xor takes 5 inputs and produces 10 outputs. +//go:noescape +func mulAvxTwo_5x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x1 takes 6 inputs and produces 1 outputs. // The output is initialized to 0. //go:noescape @@ -340,6 +600,14 @@ func mulAvxTwo_6x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_6x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_6x1Xor takes 6 inputs and produces 1 outputs. +//go:noescape +func mulAvxTwo_6x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_6x1_64Xor takes 6 inputs and produces 1 outputs. +//go:noescape +func mulAvxTwo_6x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x2 takes 6 inputs and produces 2 outputs. // The output is initialized to 0. //go:noescape @@ -350,6 +618,14 @@ func mulAvxTwo_6x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_6x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_6x2Xor takes 6 inputs and produces 2 outputs. +//go:noescape +func mulAvxTwo_6x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_6x2_64Xor takes 6 inputs and produces 2 outputs. +//go:noescape +func mulAvxTwo_6x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x3 takes 6 inputs and produces 3 outputs. // The output is initialized to 0. //go:noescape @@ -360,41 +636,77 @@ func mulAvxTwo_6x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_6x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_6x3Xor takes 6 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_6x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_6x3_64Xor takes 6 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_6x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x4 takes 6 inputs and produces 4 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_6x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_6x4Xor takes 6 inputs and produces 4 outputs. +//go:noescape +func mulAvxTwo_6x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x5 takes 6 inputs and produces 5 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_6x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_6x5Xor takes 6 inputs and produces 5 outputs. +//go:noescape +func mulAvxTwo_6x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x6 takes 6 inputs and produces 6 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_6x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_6x6Xor takes 6 inputs and produces 6 outputs. +//go:noescape +func mulAvxTwo_6x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x7 takes 6 inputs and produces 7 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_6x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_6x7Xor takes 6 inputs and produces 7 outputs. +//go:noescape +func mulAvxTwo_6x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x8 takes 6 inputs and produces 8 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_6x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_6x8Xor takes 6 inputs and produces 8 outputs. +//go:noescape +func mulAvxTwo_6x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x9 takes 6 inputs and produces 9 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_6x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_6x9Xor takes 6 inputs and produces 9 outputs. +//go:noescape +func mulAvxTwo_6x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x10 takes 6 inputs and produces 10 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_6x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_6x10Xor takes 6 inputs and produces 10 outputs. +//go:noescape +func mulAvxTwo_6x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x1 takes 7 inputs and produces 1 outputs. // The output is initialized to 0. //go:noescape @@ -405,6 +717,14 @@ func mulAvxTwo_7x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_7x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_7x1Xor takes 7 inputs and produces 1 outputs. +//go:noescape +func mulAvxTwo_7x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_7x1_64Xor takes 7 inputs and produces 1 outputs. +//go:noescape +func mulAvxTwo_7x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x2 takes 7 inputs and produces 2 outputs. // The output is initialized to 0. //go:noescape @@ -415,6 +735,14 @@ func mulAvxTwo_7x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_7x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_7x2Xor takes 7 inputs and produces 2 outputs. +//go:noescape +func mulAvxTwo_7x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_7x2_64Xor takes 7 inputs and produces 2 outputs. +//go:noescape +func mulAvxTwo_7x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x3 takes 7 inputs and produces 3 outputs. // The output is initialized to 0. //go:noescape @@ -425,41 +753,77 @@ func mulAvxTwo_7x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_7x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_7x3Xor takes 7 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_7x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_7x3_64Xor takes 7 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_7x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x4 takes 7 inputs and produces 4 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_7x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_7x4Xor takes 7 inputs and produces 4 outputs. +//go:noescape +func mulAvxTwo_7x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x5 takes 7 inputs and produces 5 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_7x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_7x5Xor takes 7 inputs and produces 5 outputs. +//go:noescape +func mulAvxTwo_7x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x6 takes 7 inputs and produces 6 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_7x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_7x6Xor takes 7 inputs and produces 6 outputs. +//go:noescape +func mulAvxTwo_7x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x7 takes 7 inputs and produces 7 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_7x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_7x7Xor takes 7 inputs and produces 7 outputs. +//go:noescape +func mulAvxTwo_7x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x8 takes 7 inputs and produces 8 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_7x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_7x8Xor takes 7 inputs and produces 8 outputs. +//go:noescape +func mulAvxTwo_7x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x9 takes 7 inputs and produces 9 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_7x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_7x9Xor takes 7 inputs and produces 9 outputs. +//go:noescape +func mulAvxTwo_7x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x10 takes 7 inputs and produces 10 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_7x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_7x10Xor takes 7 inputs and produces 10 outputs. +//go:noescape +func mulAvxTwo_7x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x1 takes 8 inputs and produces 1 outputs. // The output is initialized to 0. //go:noescape @@ -470,6 +834,14 @@ func mulAvxTwo_8x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_8x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_8x1Xor takes 8 inputs and produces 1 outputs. +//go:noescape +func mulAvxTwo_8x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_8x1_64Xor takes 8 inputs and produces 1 outputs. +//go:noescape +func mulAvxTwo_8x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x2 takes 8 inputs and produces 2 outputs. // The output is initialized to 0. //go:noescape @@ -480,6 +852,14 @@ func mulAvxTwo_8x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_8x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_8x2Xor takes 8 inputs and produces 2 outputs. +//go:noescape +func mulAvxTwo_8x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_8x2_64Xor takes 8 inputs and produces 2 outputs. +//go:noescape +func mulAvxTwo_8x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x3 takes 8 inputs and produces 3 outputs. // The output is initialized to 0. //go:noescape @@ -490,41 +870,77 @@ func mulAvxTwo_8x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_8x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_8x3Xor takes 8 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_8x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_8x3_64Xor takes 8 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_8x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x4 takes 8 inputs and produces 4 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_8x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_8x4Xor takes 8 inputs and produces 4 outputs. +//go:noescape +func mulAvxTwo_8x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x5 takes 8 inputs and produces 5 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_8x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_8x5Xor takes 8 inputs and produces 5 outputs. +//go:noescape +func mulAvxTwo_8x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x6 takes 8 inputs and produces 6 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_8x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_8x6Xor takes 8 inputs and produces 6 outputs. +//go:noescape +func mulAvxTwo_8x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x7 takes 8 inputs and produces 7 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_8x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_8x7Xor takes 8 inputs and produces 7 outputs. +//go:noescape +func mulAvxTwo_8x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x8 takes 8 inputs and produces 8 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_8x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_8x8Xor takes 8 inputs and produces 8 outputs. +//go:noescape +func mulAvxTwo_8x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x9 takes 8 inputs and produces 9 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_8x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_8x9Xor takes 8 inputs and produces 9 outputs. +//go:noescape +func mulAvxTwo_8x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x10 takes 8 inputs and produces 10 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_8x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_8x10Xor takes 8 inputs and produces 10 outputs. +//go:noescape +func mulAvxTwo_8x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x1 takes 9 inputs and produces 1 outputs. // The output is initialized to 0. //go:noescape @@ -535,6 +951,14 @@ func mulAvxTwo_9x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_9x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_9x1Xor takes 9 inputs and produces 1 outputs. +//go:noescape +func mulAvxTwo_9x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_9x1_64Xor takes 9 inputs and produces 1 outputs. +//go:noescape +func mulAvxTwo_9x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x2 takes 9 inputs and produces 2 outputs. // The output is initialized to 0. //go:noescape @@ -545,6 +969,14 @@ func mulAvxTwo_9x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_9x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_9x2Xor takes 9 inputs and produces 2 outputs. +//go:noescape +func mulAvxTwo_9x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_9x2_64Xor takes 9 inputs and produces 2 outputs. +//go:noescape +func mulAvxTwo_9x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x3 takes 9 inputs and produces 3 outputs. // The output is initialized to 0. //go:noescape @@ -555,41 +987,77 @@ func mulAvxTwo_9x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_9x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_9x3Xor takes 9 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_9x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_9x3_64Xor takes 9 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_9x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x4 takes 9 inputs and produces 4 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_9x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_9x4Xor takes 9 inputs and produces 4 outputs. +//go:noescape +func mulAvxTwo_9x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x5 takes 9 inputs and produces 5 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_9x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_9x5Xor takes 9 inputs and produces 5 outputs. +//go:noescape +func mulAvxTwo_9x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x6 takes 9 inputs and produces 6 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_9x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_9x6Xor takes 9 inputs and produces 6 outputs. +//go:noescape +func mulAvxTwo_9x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x7 takes 9 inputs and produces 7 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_9x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_9x7Xor takes 9 inputs and produces 7 outputs. +//go:noescape +func mulAvxTwo_9x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x8 takes 9 inputs and produces 8 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_9x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_9x8Xor takes 9 inputs and produces 8 outputs. +//go:noescape +func mulAvxTwo_9x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x9 takes 9 inputs and produces 9 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_9x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_9x9Xor takes 9 inputs and produces 9 outputs. +//go:noescape +func mulAvxTwo_9x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x10 takes 9 inputs and produces 10 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_9x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_9x10Xor takes 9 inputs and produces 10 outputs. +//go:noescape +func mulAvxTwo_9x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x1 takes 10 inputs and produces 1 outputs. // The output is initialized to 0. //go:noescape @@ -600,6 +1068,14 @@ func mulAvxTwo_10x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_10x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_10x1Xor takes 10 inputs and produces 1 outputs. +//go:noescape +func mulAvxTwo_10x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_10x1_64Xor takes 10 inputs and produces 1 outputs. +//go:noescape +func mulAvxTwo_10x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x2 takes 10 inputs and produces 2 outputs. // The output is initialized to 0. //go:noescape @@ -610,6 +1086,14 @@ func mulAvxTwo_10x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_10x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_10x2Xor takes 10 inputs and produces 2 outputs. +//go:noescape +func mulAvxTwo_10x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_10x2_64Xor takes 10 inputs and produces 2 outputs. +//go:noescape +func mulAvxTwo_10x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x3 takes 10 inputs and produces 3 outputs. // The output is initialized to 0. //go:noescape @@ -620,37 +1104,73 @@ func mulAvxTwo_10x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_10x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_10x3Xor takes 10 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_10x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_10x3_64Xor takes 10 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_10x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x4 takes 10 inputs and produces 4 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_10x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_10x4Xor takes 10 inputs and produces 4 outputs. +//go:noescape +func mulAvxTwo_10x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x5 takes 10 inputs and produces 5 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_10x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_10x5Xor takes 10 inputs and produces 5 outputs. +//go:noescape +func mulAvxTwo_10x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x6 takes 10 inputs and produces 6 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_10x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_10x6Xor takes 10 inputs and produces 6 outputs. +//go:noescape +func mulAvxTwo_10x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x7 takes 10 inputs and produces 7 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_10x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_10x7Xor takes 10 inputs and produces 7 outputs. +//go:noescape +func mulAvxTwo_10x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x8 takes 10 inputs and produces 8 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_10x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_10x8Xor takes 10 inputs and produces 8 outputs. +//go:noescape +func mulAvxTwo_10x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x9 takes 10 inputs and produces 9 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_10x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_10x9Xor takes 10 inputs and produces 9 outputs. +//go:noescape +func mulAvxTwo_10x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x10 takes 10 inputs and produces 10 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_10x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_10x10Xor takes 10 inputs and produces 10 outputs. +//go:noescape +func mulAvxTwo_10x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) diff --git a/galois_gen_amd64.s b/galois_gen_amd64.s index ab699ac6..36e885f9 100644 --- a/galois_gen_amd64.s +++ b/galois_gen_amd64.s @@ -36,15 +36,124 @@ TEXT ·mulAvxTwo_1x1(SB), NOSPLIT, $0-88 VPBROADCASTB X3, Y3 mulAvxTwo_1x1_loop: - // Clear 1 outputs - VPXOR Y2, Y2, Y2 + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (CX), Y2 + ADDQ $0x20, CX + VPSRLQ $0x04, Y2, Y4 + VPAND Y3, Y2, Y2 + VPAND Y3, Y4, Y4 + VPSHUFB Y2, Y0, Y2 + VPSHUFB Y4, Y1, Y4 + VPXOR Y2, Y4, Y2 + + // Store 1 outputs + VMOVDQU Y2, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x1_loop + VZEROUPPER + +mulAvxTwo_1x1_end: + RET + +// func mulAvxTwo_1x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_1x1_64_end + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y1 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + MOVQ start+72(FP), BX + + // Add start offset to output + ADDQ BX, DX + + // Add start offset to input + ADDQ BX, CX + MOVQ $0x0000000f, BX + MOVQ BX, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_1x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (CX), Y2 + VMOVDQU 32(CX), Y3 + ADDQ $0x40, CX + VPSRLQ $0x04, Y2, Y6 + VPSRLQ $0x04, Y3, Y5 + VPAND Y4, Y2, Y2 + VPAND Y4, Y3, Y3 + VPAND Y4, Y6, Y6 + VPAND Y4, Y5, Y5 + VPSHUFB Y2, Y0, Y2 + VPSHUFB Y3, Y0, Y3 + VPSHUFB Y6, Y1, Y6 + VPSHUFB Y5, Y1, Y5 + VPXOR Y2, Y6, Y2 + VPXOR Y3, Y5, Y3 + + // Store 1 outputs + VMOVDQU Y2, (DX) + VMOVDQU Y3, 32(DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x1_64_loop + VZEROUPPER + +mulAvxTwo_1x1_64_end: + RET + +// func mulAvxTwo_1x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x1Xor(SB), NOSPLIT, $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x1Xor_end + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y1 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + MOVQ start+72(FP), BX + + // Add start offset to output + ADDQ BX, DX + + // Add start offset to input + ADDQ BX, CX + MOVQ $0x0000000f, BX + MOVQ BX, X3 + VPBROADCASTB X3, Y3 +mulAvxTwo_1x1Xor_loop: // Load and process 32 bytes from input 0 to 1 outputs VMOVDQU (CX), Y4 ADDQ $0x20, CX VPSRLQ $0x04, Y4, Y5 VPAND Y3, Y4, Y4 VPAND Y3, Y5, Y5 + VMOVDQU (DX), Y2 VPSHUFB Y4, Y0, Y4 VPSHUFB Y5, Y1, Y5 VPXOR Y4, Y5, Y4 @@ -56,75 +165,76 @@ mulAvxTwo_1x1_loop: // Prepare for next loop DECQ AX - JNZ mulAvxTwo_1x1_loop + JNZ mulAvxTwo_1x1Xor_loop VZEROUPPER -mulAvxTwo_1x1_end: +mulAvxTwo_1x1Xor_end: RET -// func mulAvxTwo_1x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_1x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_1x1_64(SB), $0-88 - // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 6 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x06, AX - TESTQ AX, AX - JZ mulAvxTwo_1x1_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), AX - MOVQ out_base+48(FP), DX - MOVQ out_base+48(FP), DX - MOVQ start+72(FP), BX +TEXT ·mulAvxTwo_1x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_1x1_64Xor_end + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y1 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + MOVQ start+72(FP), BX + + // Add start offset to output + ADDQ BX, DX // Add start offset to input - ADDQ BX, AX - MOVQ $0x0000000f, SI - MOVQ SI, X2 - VPBROADCASTB X2, Y2 - MOVQ n+80(FP), SI - SHRQ $0x06, SI + ADDQ BX, CX + MOVQ $0x0000000f, BX + MOVQ BX, X4 + VPBROADCASTB X4, Y4 -mulAvxTwo_1x1_64_loop: - // Clear 1 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 +mulAvxTwo_1x1_64Xor_loop: + // Load 1 outputs + VMOVDQU (DX), Y2 + VMOVDQU 32(DX), Y3 // Load and process 64 bytes from input 0 to 1 outputs - VMOVDQU (AX), Y6 - VMOVDQU 32(AX), Y5 - ADDQ $0x40, AX - VPSRLQ $0x04, Y6, Y7 - VPSRLQ $0x04, Y5, Y8 - VPAND Y2, Y6, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y7, Y7 - VPAND Y2, Y8, Y8 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y5 - VPSHUFB Y6, Y3, Y3 - VPSHUFB Y8, Y4, Y6 - VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y7 + ADDQ $0x40, CX + VPSRLQ $0x04, Y5, Y6 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y5, Y5 + VPAND Y4, Y7, Y7 + VPAND Y4, Y6, Y6 + VPAND Y4, Y8, Y8 + VPSHUFB Y5, Y0, Y5 + VPSHUFB Y7, Y0, Y7 + VPSHUFB Y6, Y1, Y6 + VPSHUFB Y8, Y1, Y8 VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Store 1 outputs - MOVQ (DX), DI - VMOVDQU Y0, (DI)(BX*1) - VMOVDQU Y1, 32(DI)(BX*1) + VMOVDQU Y2, (DX) + VMOVDQU Y3, 32(DX) + ADDQ $0x40, DX // Prepare for next loop - ADDQ $0x40, BX - DECQ SI - JNZ mulAvxTwo_1x1_64_loop + DECQ AX + JNZ mulAvxTwo_1x1_64Xor_loop VZEROUPPER -mulAvxTwo_1x1_64_end: +mulAvxTwo_1x1_64Xor_end: RET // func mulAvxTwo_1x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -160,20 +270,151 @@ TEXT ·mulAvxTwo_1x2(SB), NOSPLIT, $0-88 VPBROADCASTB X6, Y6 mulAvxTwo_1x2_loop: - // Clear 2 outputs - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VPSRLQ $0x04, Y8, Y9 + VPAND Y6, Y8, Y8 + VPAND Y6, Y9, Y9 + VPSHUFB Y8, Y0, Y5 + VPSHUFB Y9, Y1, Y7 + VPXOR Y5, Y7, Y4 + VPSHUFB Y8, Y2, Y5 + VPSHUFB Y9, Y3, Y7 + VPXOR Y5, Y7, Y5 + + // Store 2 outputs + VMOVDQU Y4, (BX) + ADDQ $0x20, BX + VMOVDQU Y5, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x2_loop + VZEROUPPER + +mulAvxTwo_1x2_end: + RET + +// func mulAvxTwo_1x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x2_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_1x2_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), BX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + ADDQ DI, BX + + // Add start offset to input + ADDQ DI, DX + MOVQ $0x0000000f, DI + MOVQ DI, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_1x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y7 + VMOVDQU 32(DX), Y9 + ADDQ $0x40, DX + VPSRLQ $0x04, Y7, Y8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y4, Y7, Y7 + VPAND Y4, Y9, Y9 + VPAND Y4, Y8, Y8 + VPAND Y4, Y10, Y10 + VMOVDQU (CX), Y2 + VMOVDQU 32(CX), Y6 + VPSHUFB Y9, Y2, Y3 + VPSHUFB Y7, Y2, Y2 + VPSHUFB Y10, Y6, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y2, Y6, Y0 + VPXOR Y3, Y5, Y1 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y6 + VPSHUFB Y9, Y2, Y3 + VPSHUFB Y7, Y2, Y2 + VPSHUFB Y10, Y6, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y2, Y6, Y2 + VPXOR Y3, Y5, Y3 + + // Store 2 outputs + VMOVDQU Y0, (SI) + VMOVDQU Y1, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y2, (BX) + VMOVDQU Y3, 32(BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x2_64_loop + VZEROUPPER + +mulAvxTwo_1x2_64_end: + RET + +// func mulAvxTwo_1x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x2Xor(SB), NOSPLIT, $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x2Xor_end + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y1 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + ADDQ SI, DX + + // Add start offset to input + ADDQ SI, CX + MOVQ $0x0000000f, SI + MOVQ SI, X6 + VPBROADCASTB X6, Y6 +mulAvxTwo_1x2Xor_loop: // Load and process 32 bytes from input 0 to 2 outputs VMOVDQU (CX), Y9 ADDQ $0x20, CX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 + VMOVDQU (BX), Y4 VPSHUFB Y9, Y0, Y7 VPSHUFB Y10, Y1, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 + VMOVDQU (DX), Y5 VPSHUFB Y9, Y2, Y7 VPSHUFB Y10, Y3, Y8 VPXOR Y7, Y8, Y7 @@ -187,48 +428,52 @@ mulAvxTwo_1x2_loop: // Prepare for next loop DECQ AX - JNZ mulAvxTwo_1x2_loop + JNZ mulAvxTwo_1x2Xor_loop VZEROUPPER -mulAvxTwo_1x2_end: +mulAvxTwo_1x2Xor_end: RET -// func mulAvxTwo_1x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_1x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_1x2_64(SB), $0-88 +TEXT ·mulAvxTwo_1x2_64Xor(SB), $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 11 YMM used + // Destination kept in GP registers + // Full registers estimated 17 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX - JZ mulAvxTwo_1x2_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), AX - MOVQ out_base+48(FP), DX - MOVQ out_base+48(FP), DX - MOVQ start+72(FP), BX + JZ mulAvxTwo_1x2_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), BX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + ADDQ DI, BX // Add start offset to input - ADDQ BX, AX - MOVQ $0x0000000f, SI - MOVQ SI, X4 + ADDQ DI, DX + MOVQ $0x0000000f, DI + MOVQ DI, X4 VPBROADCASTB X4, Y4 - MOVQ n+80(FP), SI - SHRQ $0x06, SI -mulAvxTwo_1x2_64_loop: - // Clear 2 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 +mulAvxTwo_1x2_64Xor_loop: + // Load 2 outputs + VMOVDQU (SI), Y0 + VMOVDQU 32(SI), Y1 + VMOVDQU (BX), Y2 + VMOVDQU 32(BX), Y3 // Load and process 64 bytes from input 0 to 2 outputs - VMOVDQU (AX), Y9 - VMOVDQU 32(AX), Y11 - ADDQ $0x40, AX + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -257,20 +502,19 @@ mulAvxTwo_1x2_64_loop: VPXOR Y7, Y3, Y3 // Store 2 outputs - MOVQ (DX), DI - VMOVDQU Y0, (DI)(BX*1) - VMOVDQU Y1, 32(DI)(BX*1) - MOVQ 24(DX), DI - VMOVDQU Y2, (DI)(BX*1) - VMOVDQU Y3, 32(DI)(BX*1) + VMOVDQU Y0, (SI) + VMOVDQU Y1, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y2, (BX) + VMOVDQU Y3, 32(BX) + ADDQ $0x40, BX // Prepare for next loop - ADDQ $0x40, BX - DECQ SI - JNZ mulAvxTwo_1x2_64_loop + DECQ AX + JNZ mulAvxTwo_1x2_64Xor_loop VZEROUPPER -mulAvxTwo_1x2_64_end: +mulAvxTwo_1x2_64Xor_end: RET // func mulAvxTwo_1x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -310,25 +554,178 @@ TEXT ·mulAvxTwo_1x3(SB), NOSPLIT, $0-88 VPBROADCASTB X9, Y9 mulAvxTwo_1x3_loop: - // Clear 3 outputs - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (CX), Y11 + ADDQ $0x20, CX + VPSRLQ $0x04, Y11, Y12 + VPAND Y9, Y11, Y11 + VPAND Y9, Y12, Y12 + VPSHUFB Y11, Y0, Y8 + VPSHUFB Y12, Y1, Y10 + VPXOR Y8, Y10, Y6 + VPSHUFB Y11, Y2, Y8 + VPSHUFB Y12, Y3, Y10 + VPXOR Y8, Y10, Y7 + VPSHUFB Y11, Y4, Y8 + VPSHUFB Y12, Y5, Y10 + VPXOR Y8, Y10, Y8 + + // Store 3 outputs + VMOVDQU Y6, (BX) + ADDQ $0x20, BX + VMOVDQU Y7, (SI) + ADDQ $0x20, SI + VMOVDQU Y8, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x3_loop + VZEROUPPER + +mulAvxTwo_1x3_end: + RET + +// func mulAvxTwo_1x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x3_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_1x3_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), BX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, BX + + // Add start offset to input + ADDQ R8, DX + MOVQ $0x0000000f, R8 + MOVQ R8, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_1x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y6, Y9, Y9 + VPAND Y6, Y11, Y11 + VPAND Y6, Y10, Y10 + VPAND Y6, Y12, Y12 + VMOVDQU (CX), Y4 + VMOVDQU 32(CX), Y8 + VPSHUFB Y11, Y4, Y5 + VPSHUFB Y9, Y4, Y4 + VPSHUFB Y12, Y8, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y4, Y8, Y0 + VPXOR Y5, Y7, Y1 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y8 + VPSHUFB Y11, Y4, Y5 + VPSHUFB Y9, Y4, Y4 + VPSHUFB Y12, Y8, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y4, Y8, Y2 + VPXOR Y5, Y7, Y3 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y8 + VPSHUFB Y11, Y4, Y5 + VPSHUFB Y9, Y4, Y4 + VPSHUFB Y12, Y8, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y4, Y8, Y4 + VPXOR Y5, Y7, Y5 + + // Store 3 outputs + VMOVDQU Y0, (SI) + VMOVDQU Y1, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y2, (DI) + VMOVDQU Y3, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y4, (BX) + VMOVDQU Y5, 32(BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x3_64_loop + VZEROUPPER + +mulAvxTwo_1x3_64_end: + RET + +// func mulAvxTwo_1x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x3Xor(SB), NOSPLIT, $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x3Xor_end + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y1 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, BX + ADDQ DI, SI + ADDQ DI, DX + + // Add start offset to input + ADDQ DI, CX + MOVQ $0x0000000f, DI + MOVQ DI, X9 + VPBROADCASTB X9, Y9 +mulAvxTwo_1x3Xor_loop: // Load and process 32 bytes from input 0 to 3 outputs VMOVDQU (CX), Y12 ADDQ $0x20, CX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 + VMOVDQU (BX), Y6 VPSHUFB Y12, Y0, Y10 VPSHUFB Y13, Y1, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 + VMOVDQU (SI), Y7 VPSHUFB Y12, Y2, Y10 VPSHUFB Y13, Y3, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 + VMOVDQU (DX), Y8 VPSHUFB Y12, Y4, Y10 VPSHUFB Y13, Y5, Y11 VPXOR Y10, Y11, Y10 @@ -344,50 +741,56 @@ mulAvxTwo_1x3_loop: // Prepare for next loop DECQ AX - JNZ mulAvxTwo_1x3_loop + JNZ mulAvxTwo_1x3Xor_loop VZEROUPPER -mulAvxTwo_1x3_end: +mulAvxTwo_1x3Xor_end: RET -// func mulAvxTwo_1x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_1x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_1x3_64(SB), $0-88 +TEXT ·mulAvxTwo_1x3_64Xor(SB), $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 14 YMM used + // Destination kept in GP registers + // Full registers estimated 22 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX - JZ mulAvxTwo_1x3_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), AX - MOVQ out_base+48(FP), DX - MOVQ out_base+48(FP), DX - MOVQ start+72(FP), BX + JZ mulAvxTwo_1x3_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), BX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, BX // Add start offset to input - ADDQ BX, AX - MOVQ $0x0000000f, SI - MOVQ SI, X6 + ADDQ R8, DX + MOVQ $0x0000000f, R8 + MOVQ R8, X6 VPBROADCASTB X6, Y6 - MOVQ n+80(FP), SI - SHRQ $0x06, SI -mulAvxTwo_1x3_64_loop: - // Clear 3 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 +mulAvxTwo_1x3_64Xor_loop: + // Load 3 outputs + VMOVDQU (SI), Y0 + VMOVDQU 32(SI), Y1 + VMOVDQU (DI), Y2 + VMOVDQU 32(DI), Y3 + VMOVDQU (BX), Y4 + VMOVDQU 32(BX), Y5 // Load and process 64 bytes from input 0 to 3 outputs - VMOVDQU (AX), Y11 - VMOVDQU 32(AX), Y13 - ADDQ $0x40, AX + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -426,23 +829,22 @@ mulAvxTwo_1x3_64_loop: VPXOR Y9, Y5, Y5 // Store 3 outputs - MOVQ (DX), DI - VMOVDQU Y0, (DI)(BX*1) - VMOVDQU Y1, 32(DI)(BX*1) - MOVQ 24(DX), DI - VMOVDQU Y2, (DI)(BX*1) - VMOVDQU Y3, 32(DI)(BX*1) - MOVQ 48(DX), DI - VMOVDQU Y4, (DI)(BX*1) - VMOVDQU Y5, 32(DI)(BX*1) - - // Prepare for next loop - ADDQ $0x40, BX - DECQ SI - JNZ mulAvxTwo_1x3_64_loop + VMOVDQU Y0, (SI) + VMOVDQU Y1, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y2, (DI) + VMOVDQU Y3, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y4, (BX) + VMOVDQU Y5, 32(BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x3_64Xor_loop VZEROUPPER -mulAvxTwo_1x3_64_end: +mulAvxTwo_1x3_64Xor_end: RET // func mulAvxTwo_1x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -478,42 +880,32 @@ TEXT ·mulAvxTwo_1x4(SB), NOSPLIT, $0-88 VPBROADCASTB X4, Y4 mulAvxTwo_1x4_loop: - // Clear 4 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - // Load and process 32 bytes from input 0 to 4 outputs - VMOVDQU (DX), Y7 + VMOVDQU (DX), Y6 ADDQ $0x20, DX - VPSRLQ $0x04, Y7, Y8 + VPSRLQ $0x04, Y6, Y7 + VPAND Y4, Y6, Y6 VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU (CX), Y5 - VMOVDQU 32(CX), Y6 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y5 + VPSHUFB Y6, Y3, Y3 VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 - VMOVDQU 64(CX), Y5 - VMOVDQU 96(CX), Y6 + VPXOR Y3, Y5, Y0 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y5 + VPSHUFB Y6, Y3, Y3 VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 128(CX), Y5 - VMOVDQU 160(CX), Y6 + VPXOR Y3, Y5, Y1 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y5 + VPSHUFB Y6, Y3, Y3 VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 - VMOVDQU 192(CX), Y5 - VMOVDQU 224(CX), Y6 + VPXOR Y3, Y5, Y2 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y5 + VPSHUFB Y6, Y3, Y3 VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPXOR Y3, Y5, Y3 // Store 4 outputs VMOVDQU Y0, (SI) @@ -533,29 +925,115 @@ mulAvxTwo_1x4_loop: mulAvxTwo_1x4_end: RET -// func mulAvxTwo_1x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_1x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_1x5(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_1x4Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers - // Full registers estimated 20 YMM used + // Full registers estimated 17 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_1x5_end + JZ mulAvxTwo_1x4Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), DX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), R8 - MOVQ 72(BX), R9 - MOVQ 96(BX), BX - MOVQ start+72(FP), R10 + MOVQ 72(BX), BX + MOVQ start+72(FP), R9 // Add start offset to output - ADDQ R10, SI + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, BX + + // Add start offset to input + ADDQ R9, DX + MOVQ $0x0000000f, R9 + MOVQ R9, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_1x4Xor_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (SI), Y0 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU (DI), Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU (R8), Y2 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU (BX), Y3 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Store 4 outputs + VMOVDQU Y0, (SI) + ADDQ $0x20, SI + VMOVDQU Y1, (DI) + ADDQ $0x20, DI + VMOVDQU Y2, (R8) + ADDQ $0x20, R8 + VMOVDQU Y3, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x4Xor_loop + VZEROUPPER + +mulAvxTwo_1x4Xor_end: + RET + +// func mulAvxTwo_1x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x5(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x5_end + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), BX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, SI ADDQ R10, DI ADDQ R10, R8 ADDQ R10, R9 @@ -568,43 +1046,128 @@ TEXT ·mulAvxTwo_1x5(SB), NOSPLIT, $0-88 VPBROADCASTB X5, Y5 mulAvxTwo_1x5_loop: - // Clear 5 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y5, Y7, Y7 + VPAND Y5, Y8, Y8 + VMOVDQU (CX), Y4 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y4, Y4 + VPSHUFB Y8, Y6, Y6 + VPXOR Y4, Y6, Y0 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y4, Y4 + VPSHUFB Y8, Y6, Y6 + VPXOR Y4, Y6, Y1 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y4, Y4 + VPSHUFB Y8, Y6, Y6 + VPXOR Y4, Y6, Y2 + VMOVDQU 192(CX), Y4 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y4, Y4 + VPSHUFB Y8, Y6, Y6 + VPXOR Y4, Y6, Y3 + VMOVDQU 256(CX), Y4 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y4, Y4 + VPSHUFB Y8, Y6, Y6 + VPXOR Y4, Y6, Y4 + + // Store 5 outputs + VMOVDQU Y0, (SI) + ADDQ $0x20, SI + VMOVDQU Y1, (DI) + ADDQ $0x20, DI + VMOVDQU Y2, (R8) + ADDQ $0x20, R8 + VMOVDQU Y3, (R9) + ADDQ $0x20, R9 + VMOVDQU Y4, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x5_loop + VZEROUPPER + +mulAvxTwo_1x5_end: + RET + +// func mulAvxTwo_1x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x5Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x5Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), BX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, BX + // Add start offset to input + ADDQ R10, DX + MOVQ $0x0000000f, R10 + MOVQ R10, X5 + VPBROADCASTB X5, Y5 + +mulAvxTwo_1x5Xor_loop: // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 + VMOVDQU (SI), Y0 VMOVDQU (CX), Y6 VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 + VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 + VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 + VMOVDQU (R9), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 + VMOVDQU (BX), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 @@ -626,10 +1189,10 @@ mulAvxTwo_1x5_loop: // Prepare for next loop DECQ AX - JNZ mulAvxTwo_1x5_loop + JNZ mulAvxTwo_1x5Xor_loop VZEROUPPER -mulAvxTwo_1x5_end: +mulAvxTwo_1x5Xor_end: RET // func mulAvxTwo_1x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -669,50 +1232,144 @@ TEXT ·mulAvxTwo_1x6(SB), NOSPLIT, $0-88 VPBROADCASTB X6, Y6 mulAvxTwo_1x6_loop: - // Clear 6 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y6, Y8, Y8 + VPAND Y6, Y9, Y9 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y5, Y5 + VPSHUFB Y9, Y7, Y7 + VPXOR Y5, Y7, Y0 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y5, Y5 + VPSHUFB Y9, Y7, Y7 + VPXOR Y5, Y7, Y1 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y5, Y5 + VPSHUFB Y9, Y7, Y7 + VPXOR Y5, Y7, Y2 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y5, Y5 + VPSHUFB Y9, Y7, Y7 + VPXOR Y5, Y7, Y3 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y5, Y5 + VPSHUFB Y9, Y7, Y7 + VPXOR Y5, Y7, Y4 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y5, Y5 + VPSHUFB Y9, Y7, Y7 + VPXOR Y5, Y7, Y5 + + // Store 6 outputs + VMOVDQU Y0, (SI) + ADDQ $0x20, SI + VMOVDQU Y1, (DI) + ADDQ $0x20, DI + VMOVDQU Y2, (R8) + ADDQ $0x20, R8 + VMOVDQU Y3, (R9) + ADDQ $0x20, R9 + VMOVDQU Y4, (R10) + ADDQ $0x20, R10 + VMOVDQU Y5, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x6_loop + VZEROUPPER + +mulAvxTwo_1x6_end: + RET + +// func mulAvxTwo_1x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x6Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x6Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), BX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, BX + + // Add start offset to input + ADDQ R11, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X6 + VPBROADCASTB X6, Y6 +mulAvxTwo_1x6Xor_loop: // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 + VMOVDQU (SI), Y0 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 + VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 + VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 + VMOVDQU (R9), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 + VMOVDQU (R10), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 + VMOVDQU (BX), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 @@ -736,10 +1393,10 @@ mulAvxTwo_1x6_loop: // Prepare for next loop DECQ AX - JNZ mulAvxTwo_1x6_loop + JNZ mulAvxTwo_1x6Xor_loop VZEROUPPER -mulAvxTwo_1x6_end: +mulAvxTwo_1x6Xor_end: RET // func mulAvxTwo_1x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -781,57 +1438,160 @@ TEXT ·mulAvxTwo_1x7(SB), NOSPLIT, $0-88 VPBROADCASTB X7, Y7 mulAvxTwo_1x7_loop: - // Clear 7 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y7, Y9, Y9 + VPAND Y7, Y10, Y10 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y6, Y6 + VPSHUFB Y10, Y8, Y8 + VPXOR Y6, Y8, Y0 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y6, Y6 + VPSHUFB Y10, Y8, Y8 + VPXOR Y6, Y8, Y1 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y6, Y6 + VPSHUFB Y10, Y8, Y8 + VPXOR Y6, Y8, Y2 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y6, Y6 + VPSHUFB Y10, Y8, Y8 + VPXOR Y6, Y8, Y3 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y6, Y6 + VPSHUFB Y10, Y8, Y8 + VPXOR Y6, Y8, Y4 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y6, Y6 + VPSHUFB Y10, Y8, Y8 + VPXOR Y6, Y8, Y5 + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y6, Y6 + VPSHUFB Y10, Y8, Y8 + VPXOR Y6, Y8, Y6 + + // Store 7 outputs + VMOVDQU Y0, (SI) + ADDQ $0x20, SI + VMOVDQU Y1, (DI) + ADDQ $0x20, DI + VMOVDQU Y2, (R8) + ADDQ $0x20, R8 + VMOVDQU Y3, (R9) + ADDQ $0x20, R9 + VMOVDQU Y4, (R10) + ADDQ $0x20, R10 + VMOVDQU Y5, (R11) + ADDQ $0x20, R11 + VMOVDQU Y6, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x7_loop + VZEROUPPER + +mulAvxTwo_1x7_end: + RET + +// func mulAvxTwo_1x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x7Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x7Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), BX + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, BX + + // Add start offset to input + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X7 + VPBROADCASTB X7, Y7 +mulAvxTwo_1x7Xor_loop: // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 + VMOVDQU (SI), Y0 VMOVDQU (CX), Y8 VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 + VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 + VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 + VMOVDQU (R9), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 + VMOVDQU (R10), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 + VMOVDQU (R11), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 + VMOVDQU (BX), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 @@ -857,10 +1617,10 @@ mulAvxTwo_1x7_loop: // Prepare for next loop DECQ AX - JNZ mulAvxTwo_1x7_loop + JNZ mulAvxTwo_1x7Xor_loop VZEROUPPER -mulAvxTwo_1x7_end: +mulAvxTwo_1x7Xor_end: RET // func mulAvxTwo_1x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -904,64 +1664,176 @@ TEXT ·mulAvxTwo_1x8(SB), NOSPLIT, $0-88 VPBROADCASTB X8, Y8 mulAvxTwo_1x8_loop: - // Clear 8 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y8, Y10, Y10 + VPAND Y8, Y11, Y11 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y7, Y9, Y0 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y7, Y9, Y1 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y7, Y9, Y2 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y7, Y9, Y3 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y7, Y9, Y4 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y7, Y9, Y5 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y7, Y9, Y6 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y7, Y9, Y7 + + // Store 8 outputs + VMOVDQU Y0, (SI) + ADDQ $0x20, SI + VMOVDQU Y1, (DI) + ADDQ $0x20, DI + VMOVDQU Y2, (R8) + ADDQ $0x20, R8 + VMOVDQU Y3, (R9) + ADDQ $0x20, R9 + VMOVDQU Y4, (R10) + ADDQ $0x20, R10 + VMOVDQU Y5, (R11) + ADDQ $0x20, R11 + VMOVDQU Y6, (R12) + ADDQ $0x20, R12 + VMOVDQU Y7, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x8_loop + VZEROUPPER + +mulAvxTwo_1x8_end: + RET + +// func mulAvxTwo_1x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x8Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x8Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), BX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, BX + + // Add start offset to input + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X8 + VPBROADCASTB X8, Y8 +mulAvxTwo_1x8Xor_loop: // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 + VMOVDQU (SI), Y0 VMOVDQU (CX), Y9 VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 + VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 + VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 + VMOVDQU (R9), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 + VMOVDQU (R10), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 + VMOVDQU (R11), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 + VMOVDQU (R12), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 + VMOVDQU (BX), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 @@ -989,10 +1861,10 @@ mulAvxTwo_1x8_loop: // Prepare for next loop DECQ AX - JNZ mulAvxTwo_1x8_loop + JNZ mulAvxTwo_1x8Xor_loop VZEROUPPER -mulAvxTwo_1x8_end: +mulAvxTwo_1x8Xor_end: RET // func mulAvxTwo_1x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -1038,77 +1910,57 @@ TEXT ·mulAvxTwo_1x9(SB), NOSPLIT, $0-88 VPBROADCASTB X9, Y9 mulAvxTwo_1x9_loop: - // Clear 9 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 - // Load and process 32 bytes from input 0 to 9 outputs - VMOVDQU (DX), Y12 + VMOVDQU (DX), Y11 ADDQ $0x20, DX - VPSRLQ $0x04, Y12, Y13 + VPSRLQ $0x04, Y11, Y12 + VPAND Y9, Y11, Y11 VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VMOVDQU (CX), Y10 - VMOVDQU 32(CX), Y11 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y8, Y8 VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 - VMOVDQU 64(CX), Y10 - VMOVDQU 96(CX), Y11 + VPXOR Y8, Y10, Y0 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y8, Y8 VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 - VMOVDQU 128(CX), Y10 - VMOVDQU 160(CX), Y11 + VPXOR Y8, Y10, Y1 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y8, Y8 VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 - VMOVDQU 192(CX), Y10 - VMOVDQU 224(CX), Y11 + VPXOR Y8, Y10, Y2 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y8, Y8 VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 - VMOVDQU 256(CX), Y10 - VMOVDQU 288(CX), Y11 + VPXOR Y8, Y10, Y3 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y8, Y8 VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 - VMOVDQU 320(CX), Y10 - VMOVDQU 352(CX), Y11 + VPXOR Y8, Y10, Y4 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y8, Y8 VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 - VMOVDQU 384(CX), Y10 - VMOVDQU 416(CX), Y11 + VPXOR Y8, Y10, Y5 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y8, Y8 VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 - VMOVDQU 448(CX), Y10 - VMOVDQU 480(CX), Y11 + VPXOR Y8, Y10, Y6 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y8, Y8 VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 - VMOVDQU 512(CX), Y10 - VMOVDQU 544(CX), Y11 + VPXOR Y8, Y10, Y7 + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y8, Y8 VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPXOR Y8, Y10, Y8 // Store 9 outputs VMOVDQU Y0, (SI) @@ -1138,17 +1990,17 @@ mulAvxTwo_1x9_loop: mulAvxTwo_1x9_end: RET -// func mulAvxTwo_1x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_1x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_1x10(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_1x9Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers - // Full registers estimated 35 YMM used + // Full registers estimated 32 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_1x10_end + JZ mulAvxTwo_1x9Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), DX MOVQ out_base+48(FP), BX @@ -1160,9 +2012,150 @@ TEXT ·mulAvxTwo_1x10(SB), NOSPLIT, $0-88 MOVQ 120(BX), R11 MOVQ 144(BX), R12 MOVQ 168(BX), R13 - MOVQ 192(BX), R14 - MOVQ 216(BX), BX - MOVQ start+72(FP), R15 + MOVQ 192(BX), BX + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, BX + + // Add start offset to input + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X9 + VPBROADCASTB X9, Y9 + +mulAvxTwo_1x9Xor_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU (SI), Y0 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU (DI), Y1 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU (R8), Y2 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU (R9), Y3 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU (R10), Y4 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU (R11), Y5 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU (R12), Y6 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU (R13), Y7 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU (BX), Y8 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Store 9 outputs + VMOVDQU Y0, (SI) + ADDQ $0x20, SI + VMOVDQU Y1, (DI) + ADDQ $0x20, DI + VMOVDQU Y2, (R8) + ADDQ $0x20, R8 + VMOVDQU Y3, (R9) + ADDQ $0x20, R9 + VMOVDQU Y4, (R10) + ADDQ $0x20, R10 + VMOVDQU Y5, (R11) + ADDQ $0x20, R11 + VMOVDQU Y6, (R12) + ADDQ $0x20, R12 + VMOVDQU Y7, (R13) + ADDQ $0x20, R13 + VMOVDQU Y8, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x9Xor_loop + VZEROUPPER + +mulAvxTwo_1x9Xor_end: + RET + +// func mulAvxTwo_1x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x10(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 35 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x10_end + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), R14 + MOVQ 216(BX), BX + MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, SI @@ -1183,78 +2176,208 @@ TEXT ·mulAvxTwo_1x10(SB), NOSPLIT, $0-88 VPBROADCASTB X10, Y10 mulAvxTwo_1x10_loop: - // Clear 10 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 - VPXOR Y9, Y9, Y9 + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y10, Y12, Y12 + VPAND Y10, Y13, Y13 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y9, Y11, Y0 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y9, Y11, Y1 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y9, Y11, Y2 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y9, Y11, Y3 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y9, Y11, Y4 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y9, Y11, Y5 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y9, Y11, Y6 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y9, Y11, Y7 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y9, Y11, Y8 + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y9, Y11, Y9 + + // Store 10 outputs + VMOVDQU Y0, (SI) + ADDQ $0x20, SI + VMOVDQU Y1, (DI) + ADDQ $0x20, DI + VMOVDQU Y2, (R8) + ADDQ $0x20, R8 + VMOVDQU Y3, (R9) + ADDQ $0x20, R9 + VMOVDQU Y4, (R10) + ADDQ $0x20, R10 + VMOVDQU Y5, (R11) + ADDQ $0x20, R11 + VMOVDQU Y6, (R12) + ADDQ $0x20, R12 + VMOVDQU Y7, (R13) + ADDQ $0x20, R13 + VMOVDQU Y8, (R14) + ADDQ $0x20, R14 + VMOVDQU Y9, (BX) + ADDQ $0x20, BX + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x10_loop + VZEROUPPER + +mulAvxTwo_1x10_end: + RET + +// func mulAvxTwo_1x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x10Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 35 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x10Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), R14 + MOVQ 216(BX), BX + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, BX + + // Add start offset to input + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X10 + VPBROADCASTB X10, Y10 + +mulAvxTwo_1x10Xor_loop: // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 + VMOVDQU (SI), Y0 VMOVDQU (CX), Y11 VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 + VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 + VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 + VMOVDQU (R9), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 + VMOVDQU (R10), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 + VMOVDQU (R11), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 + VMOVDQU (R12), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 + VMOVDQU (R13), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 + VMOVDQU (R14), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 + VMOVDQU (BX), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 @@ -1286,10 +2409,10 @@ mulAvxTwo_1x10_loop: // Prepare for next loop DECQ AX - JNZ mulAvxTwo_1x10_loop + JNZ mulAvxTwo_1x10Xor_loop VZEROUPPER -mulAvxTwo_1x10_end: +mulAvxTwo_1x10Xor_end: RET // func mulAvxTwo_2x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -1325,9 +2448,6 @@ TEXT ·mulAvxTwo_2x1(SB), NOSPLIT, $0-88 VPBROADCASTB X5, Y5 mulAvxTwo_2x1_loop: - // Clear 1 outputs - VPXOR Y4, Y4, Y4 - // Load and process 32 bytes from input 0 to 1 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX @@ -1336,8 +2456,7 @@ mulAvxTwo_2x1_loop: VPAND Y5, Y7, Y7 VPSHUFB Y6, Y0, Y6 VPSHUFB Y7, Y1, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPXOR Y6, Y7, Y4 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (CX), Y6 @@ -1365,91 +2484,243 @@ mulAvxTwo_2x1_end: // func mulAvxTwo_2x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x1_64(SB), $0-88 - // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 8 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x06, AX - TESTQ AX, AX - JZ mulAvxTwo_2x1_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), AX - MOVQ out_base+48(FP), BX - MOVQ out_base+48(FP), BX - MOVQ start+72(FP), SI + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_2x1_64_end + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y1 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), BX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX // Add start offset to input ADDQ SI, DX - ADDQ SI, AX - MOVQ $0x0000000f, DI - MOVQ DI, X2 - VPBROADCASTB X2, Y2 - MOVQ n+80(FP), DI - SHRQ $0x06, DI + ADDQ SI, CX + MOVQ $0x0000000f, SI + MOVQ SI, X6 + VPBROADCASTB X6, Y6 mulAvxTwo_2x1_64_loop: - // Clear 1 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - // Load and process 64 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y6 - VMOVDQU 32(DX), Y5 + VMOVDQU (DX), Y7 + VMOVDQU 32(DX), Y9 ADDQ $0x40, DX - VPSRLQ $0x04, Y6, Y7 - VPSRLQ $0x04, Y5, Y8 - VPAND Y2, Y6, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y7, Y7 - VPAND Y2, Y8, Y8 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y5 - VPSHUFB Y6, Y3, Y3 - VPSHUFB Y8, Y4, Y6 - VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 + VPSRLQ $0x04, Y7, Y8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y7, Y7 + VPAND Y6, Y9, Y9 + VPAND Y6, Y8, Y8 + VPAND Y6, Y10, Y10 + VPSHUFB Y7, Y0, Y7 + VPSHUFB Y9, Y0, Y9 + VPSHUFB Y8, Y1, Y8 + VPSHUFB Y10, Y1, Y10 + VPXOR Y7, Y8, Y4 + VPXOR Y9, Y10, Y5 // Load and process 64 bytes from input 1 to 1 outputs - VMOVDQU (AX), Y6 - VMOVDQU 32(AX), Y5 - ADDQ $0x40, AX - VPSRLQ $0x04, Y6, Y7 - VPSRLQ $0x04, Y5, Y8 - VPAND Y2, Y6, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y7, Y7 - VPAND Y2, Y8, Y8 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y5 - VPSHUFB Y6, Y3, Y3 - VPSHUFB Y8, Y4, Y6 - VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y9 + ADDQ $0x40, CX + VPSRLQ $0x04, Y7, Y8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y7, Y7 + VPAND Y6, Y9, Y9 + VPAND Y6, Y8, Y8 + VPAND Y6, Y10, Y10 + VPSHUFB Y7, Y2, Y7 + VPSHUFB Y9, Y2, Y9 + VPSHUFB Y8, Y3, Y8 + VPSHUFB Y10, Y3, Y10 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Store 1 outputs - MOVQ (BX), R8 - VMOVDQU Y0, (R8)(SI*1) - VMOVDQU Y1, 32(R8)(SI*1) + VMOVDQU Y4, (BX) + VMOVDQU Y5, 32(BX) + ADDQ $0x40, BX // Prepare for next loop - ADDQ $0x40, SI - DECQ DI + DECQ AX JNZ mulAvxTwo_2x1_64_loop VZEROUPPER mulAvxTwo_2x1_64_end: RET +// func mulAvxTwo_2x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_2x1Xor(SB), NOSPLIT, $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x1Xor_end + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y1 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ (BX), BX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + + // Add start offset to input + ADDQ SI, DX + ADDQ SI, CX + MOVQ $0x0000000f, SI + MOVQ SI, X5 + VPBROADCASTB X5, Y5 + +mulAvxTwo_2x1Xor_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VPSRLQ $0x04, Y6, Y7 + VPAND Y5, Y6, Y6 + VPAND Y5, Y7, Y7 + VMOVDQU (BX), Y4 + VPSHUFB Y6, Y0, Y6 + VPSHUFB Y7, Y1, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VPSRLQ $0x04, Y6, Y7 + VPAND Y5, Y6, Y6 + VPAND Y5, Y7, Y7 + VPSHUFB Y6, Y2, Y6 + VPSHUFB Y7, Y3, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Store 1 outputs + VMOVDQU Y4, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x1Xor_loop + VZEROUPPER + +mulAvxTwo_2x1Xor_end: + RET + +// func mulAvxTwo_2x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_2x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_2x1_64Xor_end + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y1 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), BX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + + // Add start offset to input + ADDQ SI, DX + ADDQ SI, CX + MOVQ $0x0000000f, SI + MOVQ SI, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_2x1_64Xor_loop: + // Load 1 outputs + VMOVDQU (BX), Y4 + VMOVDQU 32(BX), Y5 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y7 + VMOVDQU 32(DX), Y9 + ADDQ $0x40, DX + VPSRLQ $0x04, Y7, Y8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y7, Y7 + VPAND Y6, Y9, Y9 + VPAND Y6, Y8, Y8 + VPAND Y6, Y10, Y10 + VPSHUFB Y7, Y0, Y7 + VPSHUFB Y9, Y0, Y9 + VPSHUFB Y8, Y1, Y8 + VPSHUFB Y10, Y1, Y10 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y9 + ADDQ $0x40, CX + VPSRLQ $0x04, Y7, Y8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y7, Y7 + VPAND Y6, Y9, Y9 + VPAND Y6, Y8, Y8 + VPAND Y6, Y10, Y10 + VPSHUFB Y7, Y2, Y7 + VPSHUFB Y9, Y2, Y9 + VPSHUFB Y8, Y3, Y8 + VPSHUFB Y10, Y3, Y10 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Store 1 outputs + VMOVDQU Y4, (BX) + VMOVDQU Y5, 32(BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x1_64Xor_loop + VZEROUPPER + +mulAvxTwo_2x1_64Xor_end: + RET + // func mulAvxTwo_2x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x2(SB), NOSPLIT, $0-88 @@ -1489,10 +2760,6 @@ TEXT ·mulAvxTwo_2x2(SB), NOSPLIT, $0-88 VPBROADCASTB X10, Y10 mulAvxTwo_2x2_loop: - // Clear 2 outputs - VPXOR Y8, Y8, Y8 - VPXOR Y9, Y9, Y9 - // Load and process 32 bytes from input 0 to 2 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX @@ -1501,12 +2768,10 @@ mulAvxTwo_2x2_loop: VPAND Y10, Y14, Y14 VPSHUFB Y13, Y0, Y11 VPSHUFB Y14, Y1, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + VPXOR Y11, Y12, Y8 VPSHUFB Y13, Y2, Y11 VPSHUFB Y14, Y3, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 + VPXOR Y11, Y12, Y9 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (CX), Y13 @@ -1541,40 +2806,38 @@ mulAvxTwo_2x2_end: // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x2_64(SB), $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 15 YMM used + // Destination kept in GP registers + // Full registers estimated 25 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_2x2_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), AX - MOVQ out_base+48(FP), BX - MOVQ out_base+48(FP), BX - MOVQ start+72(FP), SI + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), SI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + ADDQ R8, SI // Add start offset to input - ADDQ SI, DX - ADDQ SI, AX - MOVQ $0x0000000f, DI - MOVQ DI, X4 + ADDQ R8, BX + ADDQ R8, DX + MOVQ $0x0000000f, R8 + MOVQ R8, X4 VPBROADCASTB X4, Y4 - MOVQ n+80(FP), DI - SHRQ $0x06, DI mulAvxTwo_2x2_64_loop: - // Clear 2 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - // Load and process 64 bytes from input 0 to 2 outputs - VMOVDQU (DX), Y9 - VMOVDQU 32(DX), Y11 - ADDQ $0x40, DX + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -1587,10 +2850,207 @@ mulAvxTwo_2x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + VPXOR Y5, Y6, Y0 + VPXOR Y7, Y8, Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y2 + VPXOR Y7, Y8, Y3 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Store 2 outputs + VMOVDQU Y0, (DI) + VMOVDQU Y1, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y2, (SI) + VMOVDQU Y3, 32(SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x2_64_loop + VZEROUPPER + +mulAvxTwo_2x2_64_end: + RET + +// func mulAvxTwo_2x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_2x2Xor(SB), NOSPLIT, $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 15 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x2Xor_end + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y1 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), BX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + ADDQ DI, BX + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, CX + MOVQ $0x0000000f, DI + MOVQ DI, X10 + VPBROADCASTB X10, Y10 + +mulAvxTwo_2x2Xor_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU (SI), Y8 + VPSHUFB Y13, Y0, Y11 + VPSHUFB Y14, Y1, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU (BX), Y9 + VPSHUFB Y13, Y2, Y11 + VPSHUFB Y14, Y3, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (CX), Y13 + ADDQ $0x20, CX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VPSHUFB Y13, Y4, Y11 + VPSHUFB Y14, Y5, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VPSHUFB Y13, Y6, Y11 + VPSHUFB Y14, Y7, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Store 2 outputs + VMOVDQU Y8, (SI) + ADDQ $0x20, SI + VMOVDQU Y9, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x2Xor_loop + VZEROUPPER + +mulAvxTwo_2x2Xor_end: + RET + +// func mulAvxTwo_2x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_2x2_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 25 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_2x2_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), SI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + ADDQ R8, SI + + // Add start offset to input + ADDQ R8, BX + ADDQ R8, DX + MOVQ $0x0000000f, R8 + MOVQ R8, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_2x2_64Xor_loop: + // Load 2 outputs + VMOVDQU (DI), Y0 + VMOVDQU 32(DI), Y1 + VMOVDQU (SI), Y2 + VMOVDQU 32(SI), Y3 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 @@ -1603,9 +3063,9 @@ mulAvxTwo_2x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 1 to 2 outputs - VMOVDQU (AX), Y9 - VMOVDQU 32(AX), Y11 - ADDQ $0x40, AX + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -1634,20 +3094,19 @@ mulAvxTwo_2x2_64_loop: VPXOR Y7, Y3, Y3 // Store 2 outputs - MOVQ (BX), R8 - VMOVDQU Y0, (R8)(SI*1) - VMOVDQU Y1, 32(R8)(SI*1) - MOVQ 24(BX), R8 - VMOVDQU Y2, (R8)(SI*1) - VMOVDQU Y3, 32(R8)(SI*1) + VMOVDQU Y0, (DI) + VMOVDQU Y1, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y2, (SI) + VMOVDQU Y3, 32(SI) + ADDQ $0x40, SI // Prepare for next loop - ADDQ $0x40, SI - DECQ DI - JNZ mulAvxTwo_2x2_64_loop + DECQ AX + JNZ mulAvxTwo_2x2_64Xor_loop VZEROUPPER -mulAvxTwo_2x2_64_end: +mulAvxTwo_2x2_64Xor_end: RET // func mulAvxTwo_2x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -1683,11 +3142,6 @@ TEXT ·mulAvxTwo_2x3(SB), NOSPLIT, $0-88 VPBROADCASTB X3, Y3 mulAvxTwo_2x3_loop: - // Clear 3 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - // Load and process 32 bytes from input 0 to 3 outputs VMOVDQU (BX), Y6 ADDQ $0x20, BX @@ -1698,20 +3152,17 @@ mulAvxTwo_2x3_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + VPXOR Y4, Y5, Y0 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + VPXOR Y4, Y5, Y1 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 + VPXOR Y4, Y5, Y2 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (DX), Y6 @@ -1758,39 +3209,72 @@ mulAvxTwo_2x3_end: // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x3_64(SB), $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 20 YMM used + // Destination kept in GP registers + // Full registers estimated 34 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_2x3_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), AX - MOVQ out_base+48(FP), BX - MOVQ out_base+48(FP), BX - MOVQ start+72(FP), SI + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), SI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, SI // Add start offset to input - ADDQ SI, DX - ADDQ SI, AX - MOVQ $0x0000000f, DI - MOVQ DI, X6 + ADDQ R9, BX + ADDQ R9, DX + MOVQ $0x0000000f, R9 + MOVQ R9, X6 VPBROADCASTB X6, Y6 - MOVQ n+80(FP), DI - SHRQ $0x06, DI mulAvxTwo_2x3_64_loop: - // Clear 3 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y0 + VPXOR Y9, Y10, Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y2 + VPXOR Y9, Y10, Y3 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y4 + VPXOR Y9, Y10, Y5 + + // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 ADDQ $0x40, DX @@ -1800,6 +3284,210 @@ mulAvxTwo_2x3_64_loop: VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Store 3 outputs + VMOVDQU Y0, (DI) + VMOVDQU Y1, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y2, (R8) + VMOVDQU Y3, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y4, (SI) + VMOVDQU Y5, 32(SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x3_64_loop + VZEROUPPER + +mulAvxTwo_2x3_64_end: + RET + +// func mulAvxTwo_2x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_2x3Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x3Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), SI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, SI + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, DX + MOVQ $0x0000000f, R9 + MOVQ R9, X3 + VPBROADCASTB X3, Y3 + +mulAvxTwo_2x3Xor_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y6 + ADDQ $0x20, BX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU (DI), Y0 + VMOVDQU (CX), Y4 + VMOVDQU 32(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU (R8), Y1 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU (SI), Y2 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 192(CX), Y4 + VMOVDQU 224(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 256(CX), Y4 + VMOVDQU 288(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 320(CX), Y4 + VMOVDQU 352(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Store 3 outputs + VMOVDQU Y0, (DI) + ADDQ $0x20, DI + VMOVDQU Y1, (R8) + ADDQ $0x20, R8 + VMOVDQU Y2, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x3Xor_loop + VZEROUPPER + +mulAvxTwo_2x3Xor_end: + RET + +// func mulAvxTwo_2x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_2x3_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_2x3_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), SI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, SI + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, DX + MOVQ $0x0000000f, R9 + MOVQ R9, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_2x3_64Xor_loop: + // Load 3 outputs + VMOVDQU (DI), Y0 + VMOVDQU 32(DI), Y1 + VMOVDQU (R8), Y2 + VMOVDQU 32(R8), Y3 + VMOVDQU (SI), Y4 + VMOVDQU 32(SI), Y5 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y13, Y7, Y9 @@ -1832,9 +3520,9 @@ mulAvxTwo_2x3_64_loop: VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 1 to 3 outputs - VMOVDQU (AX), Y11 - VMOVDQU 32(AX), Y13 - ADDQ $0x40, AX + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -1873,23 +3561,22 @@ mulAvxTwo_2x3_64_loop: VPXOR Y9, Y5, Y5 // Store 3 outputs - MOVQ (BX), R8 - VMOVDQU Y0, (R8)(SI*1) - VMOVDQU Y1, 32(R8)(SI*1) - MOVQ 24(BX), R8 - VMOVDQU Y2, (R8)(SI*1) - VMOVDQU Y3, 32(R8)(SI*1) - MOVQ 48(BX), R8 - VMOVDQU Y4, (R8)(SI*1) - VMOVDQU Y5, 32(R8)(SI*1) - - // Prepare for next loop - ADDQ $0x40, SI - DECQ DI - JNZ mulAvxTwo_2x3_64_loop + VMOVDQU Y0, (DI) + VMOVDQU Y1, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y2, (R8) + VMOVDQU Y3, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y4, (SI) + VMOVDQU Y5, 32(SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x3_64Xor_loop VZEROUPPER -mulAvxTwo_2x3_64_end: +mulAvxTwo_2x3_64Xor_end: RET // func mulAvxTwo_2x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -1927,12 +3614,6 @@ TEXT ·mulAvxTwo_2x4(SB), NOSPLIT, $0-88 VPBROADCASTB X4, Y4 mulAvxTwo_2x4_loop: - // Clear 4 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y7 ADDQ $0x20, BX @@ -1943,26 +3624,22 @@ mulAvxTwo_2x4_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + VPXOR Y5, Y6, Y0 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + VPXOR Y5, Y6, Y1 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + VPXOR Y5, Y6, Y2 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPXOR Y5, Y6, Y3 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (DX), Y7 @@ -2013,17 +3690,17 @@ mulAvxTwo_2x4_loop: mulAvxTwo_2x4_end: RET -// func mulAvxTwo_2x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_2x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_2x5(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_2x4Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers - // Full registers estimated 30 YMM used + // Full registers estimated 25 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_2x5_end + JZ mulAvxTwo_2x4Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX @@ -2031,62 +3708,305 @@ TEXT ·mulAvxTwo_2x5(SB), NOSPLIT, $0-88 MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 - MOVQ 72(SI), R10 - MOVQ 96(SI), SI - MOVQ start+72(FP), R11 + MOVQ 72(SI), SI + MOVQ start+72(FP), R10 // Add start offset to output - ADDQ R11, DI - ADDQ R11, R8 - ADDQ R11, R9 - ADDQ R11, R10 - ADDQ R11, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, SI // Add start offset to input - ADDQ R11, BX - ADDQ R11, DX - MOVQ $0x0000000f, R11 - MOVQ R11, X5 - VPBROADCASTB X5, Y5 - -mulAvxTwo_2x5_loop: - // Clear 5 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 + ADDQ R10, BX + ADDQ R10, DX + MOVQ $0x0000000f, R10 + MOVQ R10, X4 + VPBROADCASTB X4, Y4 - // Load and process 32 bytes from input 0 to 5 outputs - VMOVDQU (BX), Y8 +mulAvxTwo_2x4Xor_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y7 ADDQ $0x20, BX - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU (CX), Y6 - VMOVDQU 32(CX), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (DI), Y0 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 64(CX), Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU (R8), Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU (R9), Y2 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU (SI), Y3 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Store 4 outputs + VMOVDQU Y0, (DI) + ADDQ $0x20, DI + VMOVDQU Y1, (R8) + ADDQ $0x20, R8 + VMOVDQU Y2, (R9) + ADDQ $0x20, R9 + VMOVDQU Y3, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x4Xor_loop + VZEROUPPER + +mulAvxTwo_2x4Xor_end: + RET + +// func mulAvxTwo_2x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_2x5(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x5_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), SI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, SI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X5 + VPBROADCASTB X5, Y5 + +mulAvxTwo_2x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y0 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y1 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y2 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y3 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y4 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Store 5 outputs + VMOVDQU Y0, (DI) + ADDQ $0x20, DI + VMOVDQU Y1, (R8) + ADDQ $0x20, R8 + VMOVDQU Y2, (R9) + ADDQ $0x20, R9 + VMOVDQU Y3, (R10) + ADDQ $0x20, R10 + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x5_loop + VZEROUPPER + +mulAvxTwo_2x5_end: + RET + +// func mulAvxTwo_2x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_2x5Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x5Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), SI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, SI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X5 + VPBROADCASTB X5, Y5 + +mulAvxTwo_2x5Xor_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (DI), Y0 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU (R8), Y1 + VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 + VMOVDQU (R9), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 + VMOVDQU (R10), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 + VMOVDQU (SI), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 @@ -2145,10 +4065,10 @@ mulAvxTwo_2x5_loop: // Prepare for next loop DECQ AX - JNZ mulAvxTwo_2x5_loop + JNZ mulAvxTwo_2x5Xor_loop VZEROUPPER -mulAvxTwo_2x5_end: +mulAvxTwo_2x5Xor_end: RET // func mulAvxTwo_2x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -2190,50 +4110,189 @@ TEXT ·mulAvxTwo_2x6(SB), NOSPLIT, $0-88 VPBROADCASTB X6, Y6 mulAvxTwo_2x6_loop: - // Clear 6 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y0 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y1 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y2 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y3 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y4 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y5 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Store 6 outputs + VMOVDQU Y0, (DI) + ADDQ $0x20, DI + VMOVDQU Y1, (R8) + ADDQ $0x20, R8 + VMOVDQU Y2, (R9) + ADDQ $0x20, R9 + VMOVDQU Y3, (R10) + ADDQ $0x20, R10 + VMOVDQU Y4, (R11) + ADDQ $0x20, R11 + VMOVDQU Y5, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x6_loop + VZEROUPPER + +mulAvxTwo_2x6_end: + RET + +// func mulAvxTwo_2x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_2x6Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 35 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x6Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), SI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, SI + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X6 + VPBROADCASTB X6, Y6 +mulAvxTwo_2x6Xor_loop: // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y9 ADDQ $0x20, BX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 + VMOVDQU (DI), Y0 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 + VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 + VMOVDQU (R9), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 + VMOVDQU (R10), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 + VMOVDQU (R11), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 + VMOVDQU (SI), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 @@ -2300,10 +4359,10 @@ mulAvxTwo_2x6_loop: // Prepare for next loop DECQ AX - JNZ mulAvxTwo_2x6_loop + JNZ mulAvxTwo_2x6Xor_loop VZEROUPPER -mulAvxTwo_2x6_end: +mulAvxTwo_2x6Xor_end: RET // func mulAvxTwo_2x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -2347,15 +4406,6 @@ TEXT ·mulAvxTwo_2x7(SB), NOSPLIT, $0-88 VPBROADCASTB X7, Y7 mulAvxTwo_2x7_loop: - // Clear 7 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX @@ -2366,44 +4416,37 @@ mulAvxTwo_2x7_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + VPXOR Y8, Y9, Y0 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + VPXOR Y8, Y9, Y1 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + VPXOR Y8, Y9, Y2 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + VPXOR Y8, Y9, Y3 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + VPXOR Y8, Y9, Y4 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + VPXOR Y8, Y9, Y5 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPXOR Y8, Y9, Y6 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (DX), Y10 @@ -2478,17 +4521,17 @@ mulAvxTwo_2x7_loop: mulAvxTwo_2x7_end: RET -// func mulAvxTwo_2x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_2x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_2x8(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_2x7Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers - // Full registers estimated 45 YMM used + // Full registers estimated 40 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_2x8_end + JZ mulAvxTwo_2x7Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX @@ -2499,93 +4542,245 @@ TEXT ·mulAvxTwo_2x8(SB), NOSPLIT, $0-88 MOVQ 72(SI), R10 MOVQ 96(SI), R11 MOVQ 120(SI), R12 - MOVQ 144(SI), R13 - MOVQ 168(SI), SI - MOVQ start+72(FP), R14 + MOVQ 144(SI), SI + MOVQ start+72(FP), R13 // Add start offset to output - ADDQ R14, DI - ADDQ R14, R8 - ADDQ R14, R9 - ADDQ R14, R10 - ADDQ R14, R11 - ADDQ R14, R12 - ADDQ R14, R13 - ADDQ R14, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, SI // Add start offset to input - ADDQ R14, BX - ADDQ R14, DX - MOVQ $0x0000000f, R14 - MOVQ R14, X8 - VPBROADCASTB X8, Y8 - -mulAvxTwo_2x8_loop: - // Clear 8 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 + ADDQ R13, BX + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X7 + VPBROADCASTB X7, Y7 - // Load and process 32 bytes from input 0 to 8 outputs - VMOVDQU (BX), Y11 +mulAvxTwo_2x7Xor_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y10 ADDQ $0x20, BX - VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU (CX), Y9 - VMOVDQU 32(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 64(CX), Y9 - VMOVDQU 96(CX), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (DI), Y0 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 - VMOVDQU 128(CX), Y9 - VMOVDQU 160(CX), Y10 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU (R8), Y1 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 192(CX), Y9 - VMOVDQU 224(CX), Y10 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU (R9), Y2 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 - VMOVDQU 256(CX), Y9 - VMOVDQU 288(CX), Y10 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU (R10), Y3 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 320(CX), Y9 - VMOVDQU 352(CX), Y10 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU (R11), Y4 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 - VMOVDQU 384(CX), Y9 - VMOVDQU 416(CX), Y10 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU (R12), Y5 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 448(CX), Y9 - VMOVDQU 480(CX), Y10 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU (SI), Y6 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 - + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Store 7 outputs + VMOVDQU Y0, (DI) + ADDQ $0x20, DI + VMOVDQU Y1, (R8) + ADDQ $0x20, R8 + VMOVDQU Y2, (R9) + ADDQ $0x20, R9 + VMOVDQU Y3, (R10) + ADDQ $0x20, R10 + VMOVDQU Y4, (R11) + ADDQ $0x20, R11 + VMOVDQU Y5, (R12) + ADDQ $0x20, R12 + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x7Xor_loop + VZEROUPPER + +mulAvxTwo_2x7Xor_end: + RET + +// func mulAvxTwo_2x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_2x8(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 45 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x8_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), SI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, SI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X8 + VPBROADCASTB X8, Y8 + +mulAvxTwo_2x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y0 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y1 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y2 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y3 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y4 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y5 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y6 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y7 + // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX @@ -2667,6 +4862,193 @@ mulAvxTwo_2x8_loop: mulAvxTwo_2x8_end: RET +// func mulAvxTwo_2x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_2x8Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 45 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x8Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), SI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, SI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X8 + VPBROADCASTB X8, Y8 + +mulAvxTwo_2x8Xor_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (DI), Y0 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU (R8), Y1 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU (R9), Y2 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU (R10), Y3 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU (R11), Y4 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU (R12), Y5 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU (R13), Y6 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU (SI), Y7 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Store 8 outputs + VMOVDQU Y0, (DI) + ADDQ $0x20, DI + VMOVDQU Y1, (R8) + ADDQ $0x20, R8 + VMOVDQU Y2, (R9) + ADDQ $0x20, R9 + VMOVDQU Y3, (R10) + ADDQ $0x20, R10 + VMOVDQU Y4, (R11) + ADDQ $0x20, R11 + VMOVDQU Y5, (R12) + ADDQ $0x20, R12 + VMOVDQU Y6, (R13) + ADDQ $0x20, R13 + VMOVDQU Y7, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x8Xor_loop + VZEROUPPER + +mulAvxTwo_2x8Xor_end: + RET + // func mulAvxTwo_2x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x9(SB), NOSPLIT, $0-88 @@ -2712,71 +5094,255 @@ TEXT ·mulAvxTwo_2x9(SB), NOSPLIT, $0-88 VPBROADCASTB X9, Y9 mulAvxTwo_2x9_loop: - // Clear 9 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y0 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y1 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y2 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y3 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y4 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y5 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y6 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y7 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y8 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Store 9 outputs + VMOVDQU Y0, (DI) + ADDQ $0x20, DI + VMOVDQU Y1, (R8) + ADDQ $0x20, R8 + VMOVDQU Y2, (R9) + ADDQ $0x20, R9 + VMOVDQU Y3, (R10) + ADDQ $0x20, R10 + VMOVDQU Y4, (R11) + ADDQ $0x20, R11 + VMOVDQU Y5, (R12) + ADDQ $0x20, R12 + VMOVDQU Y6, (R13) + ADDQ $0x20, R13 + VMOVDQU Y7, (R14) + ADDQ $0x20, R14 + VMOVDQU Y8, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x9_loop + VZEROUPPER + +mulAvxTwo_2x9_end: + RET + +// func mulAvxTwo_2x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_2x9Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x9Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), SI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, SI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X9 + VPBROADCASTB X9, Y9 +mulAvxTwo_2x9Xor_loop: // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y12 ADDQ $0x20, BX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 + VMOVDQU (DI), Y0 VMOVDQU (CX), Y10 VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 + VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 + VMOVDQU (R9), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 + VMOVDQU (R10), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 + VMOVDQU (R11), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 + VMOVDQU (R12), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 + VMOVDQU (R13), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 + VMOVDQU (R14), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 + VMOVDQU (SI), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 @@ -2867,10 +5433,10 @@ mulAvxTwo_2x9_loop: // Prepare for next loop DECQ AX - JNZ mulAvxTwo_2x9_loop + JNZ mulAvxTwo_2x9Xor_loop VZEROUPPER -mulAvxTwo_2x9_end: +mulAvxTwo_2x9Xor_end: RET // func mulAvxTwo_2x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -2920,18 +5486,6 @@ TEXT ·mulAvxTwo_2x10(SB), NOSPLIT, $8-88 VPBROADCASTB X10, Y10 mulAvxTwo_2x10_loop: - // Clear 10 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 - VPXOR Y9, Y9, Y9 - // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y13 ADDQ $0x20, BX @@ -2942,56 +5496,267 @@ mulAvxTwo_2x10_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + VPXOR Y11, Y12, Y0 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y1 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y2 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y3 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y4 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y5 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y6 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y7 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y8 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y9 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1024(CX), Y11 + VMOVDQU 1056(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 1088(CX), Y11 + VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Store 10 outputs + VMOVDQU Y0, (DI) + ADDQ $0x20, DI + VMOVDQU Y1, (R8) + ADDQ $0x20, R8 + VMOVDQU Y2, (R9) + ADDQ $0x20, R9 + VMOVDQU Y3, (R10) + ADDQ $0x20, R10 + VMOVDQU Y4, (R11) + ADDQ $0x20, R11 + VMOVDQU Y5, (R12) + ADDQ $0x20, R12 + VMOVDQU Y6, (R13) + ADDQ $0x20, R13 + VMOVDQU Y7, (R14) + ADDQ $0x20, R14 + VMOVDQU Y8, (R15) + ADDQ $0x20, R15 + VMOVDQU Y9, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x10_loop + VZEROUPPER + +mulAvxTwo_2x10_end: + RET + +// func mulAvxTwo_2x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_2x10Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 55 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x10Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X10 + VPBROADCASTB X10, Y10 + +mulAvxTwo_2x10Xor_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y13 + ADDQ $0x20, BX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU (DI), Y0 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 + VMOVDQU (R9), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 + VMOVDQU (R10), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 + VMOVDQU (R11), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 + VMOVDQU (R12), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 + VMOVDQU (R13), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 + VMOVDQU (R14), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 + VMOVDQU (R15), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 + VMOVDQU (SI), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 @@ -3090,10 +5855,10 @@ mulAvxTwo_2x10_loop: // Prepare for next loop DECQ AX - JNZ mulAvxTwo_2x10_loop + JNZ mulAvxTwo_2x10Xor_loop VZEROUPPER -mulAvxTwo_2x10_end: +mulAvxTwo_2x10Xor_end: RET // func mulAvxTwo_3x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -3133,9 +5898,6 @@ TEXT ·mulAvxTwo_3x1(SB), NOSPLIT, $0-88 VPBROADCASTB X7, Y7 mulAvxTwo_3x1_loop: - // Clear 1 outputs - VPXOR Y6, Y6, Y6 - // Load and process 32 bytes from input 0 to 1 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX @@ -3144,8 +5906,7 @@ mulAvxTwo_3x1_loop: VPAND Y7, Y9, Y9 VPSHUFB Y8, Y0, Y8 VPSHUFB Y9, Y1, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPXOR Y8, Y9, Y6 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (BX), Y8 @@ -3185,37 +5946,75 @@ mulAvxTwo_3x1_end: // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x1_64(SB), $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 10 YMM used + // Destination kept in GP registers + // Full registers estimated 18 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_3x1_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), AX - MOVQ out_base+48(FP), SI - MOVQ out_base+48(FP), SI - MOVQ start+72(FP), DI + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), DI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI // Add start offset to input - ADDQ DI, DX - ADDQ DI, BX - ADDQ DI, AX + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, DX MOVQ $0x0000000f, R8 MOVQ R8, X2 VPBROADCASTB X2, Y2 - MOVQ n+80(FP), R8 - SHRQ $0x06, R8 mulAvxTwo_3x1_64_loop: - // Clear 1 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y0 + VPXOR Y5, Y6, Y1 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 ADDQ $0x40, DX @@ -3225,8 +6024,8 @@ mulAvxTwo_3x1_64_loop: VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 @@ -3236,7 +6035,139 @@ mulAvxTwo_3x1_64_loop: VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 - // Load and process 64 bytes from input 1 to 1 outputs + // Store 1 outputs + VMOVDQU Y0, (DI) + VMOVDQU Y1, 32(DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x1_64_loop + VZEROUPPER + +mulAvxTwo_3x1_64_end: + RET + +// func mulAvxTwo_3x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_3x1Xor(SB), NOSPLIT, $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x1Xor_end + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y1 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ (SI), SI + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, BX + ADDQ DI, CX + MOVQ $0x0000000f, DI + MOVQ DI, X7 + VPBROADCASTB X7, Y7 + +mulAvxTwo_3x1Xor_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y7, Y8, Y8 + VPAND Y7, Y9, Y9 + VMOVDQU (SI), Y6 + VPSHUFB Y8, Y0, Y8 + VPSHUFB Y9, Y1, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND Y7, Y8, Y8 + VPAND Y7, Y9, Y9 + VPSHUFB Y8, Y2, Y8 + VPSHUFB Y9, Y3, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VPSRLQ $0x04, Y8, Y9 + VPAND Y7, Y8, Y8 + VPAND Y7, Y9, Y9 + VPSHUFB Y8, Y4, Y8 + VPSHUFB Y9, Y5, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Store 1 outputs + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x1Xor_loop + VZEROUPPER + +mulAvxTwo_3x1Xor_end: + RET + +// func mulAvxTwo_3x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_3x1_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_3x1_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), DI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + + // Add start offset to input + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, DX + MOVQ $0x0000000f, R8 + MOVQ R8, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_3x1_64Xor_loop: + // Load 1 outputs + VMOVDQU (DI), Y0 + VMOVDQU 32(DI), Y1 + + // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU (BX), Y6 VMOVDQU 32(BX), Y5 ADDQ $0x40, BX @@ -3246,6 +6177,27 @@ mulAvxTwo_3x1_64_loop: VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y5 @@ -3258,9 +6210,9 @@ mulAvxTwo_3x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 2 to 1 outputs - VMOVDQU (AX), Y6 - VMOVDQU 32(AX), Y5 - ADDQ $0x40, AX + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -3279,17 +6231,16 @@ mulAvxTwo_3x1_64_loop: VPXOR Y5, Y1, Y1 // Store 1 outputs - MOVQ (SI), R9 - VMOVDQU Y0, (R9)(DI*1) - VMOVDQU Y1, 32(R9)(DI*1) + VMOVDQU Y0, (DI) + VMOVDQU Y1, 32(DI) + ADDQ $0x40, DI // Prepare for next loop - ADDQ $0x40, DI - DECQ R8 - JNZ mulAvxTwo_3x1_64_loop + DECQ AX + JNZ mulAvxTwo_3x1_64Xor_loop VZEROUPPER -mulAvxTwo_3x1_64_end: +mulAvxTwo_3x1_64Xor_end: RET // func mulAvxTwo_3x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -3325,10 +6276,6 @@ TEXT ·mulAvxTwo_3x2(SB), NOSPLIT, $0-88 VPBROADCASTB X2, Y2 mulAvxTwo_3x2_loop: - // Clear 2 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - // Load and process 32 bytes from input 0 to 2 outputs VMOVDQU (BX), Y5 ADDQ $0x20, BX @@ -3339,14 +6286,12 @@ mulAvxTwo_3x2_loop: VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + VPXOR Y3, Y4, Y0 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 + VPXOR Y3, Y4, Y1 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 @@ -3404,42 +6349,40 @@ mulAvxTwo_3x2_end: // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x2_64(SB), $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 19 YMM used + // Destination kept in GP registers + // Full registers estimated 33 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_3x2_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), AX - MOVQ out_base+48(FP), SI - MOVQ out_base+48(FP), SI - MOVQ start+72(FP), DI + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), DI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + ADDQ R9, DI // Add start offset to input - ADDQ DI, DX - ADDQ DI, BX - ADDQ DI, AX - MOVQ $0x0000000f, R8 - MOVQ R8, X4 + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DX + MOVQ $0x0000000f, R9 + MOVQ R9, X4 VPBROADCASTB X4, Y4 - MOVQ n+80(FP), R8 - SHRQ $0x06, R8 mulAvxTwo_3x2_64_loop: - // Clear 2 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - // Load and process 64 bytes from input 0 to 2 outputs - VMOVDQU (DX), Y9 - VMOVDQU 32(DX), Y11 - ADDQ $0x40, DX + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -3452,25 +6395,21 @@ mulAvxTwo_3x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + VPXOR Y5, Y6, Y0 + VPXOR Y7, Y8, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 + VPXOR Y5, Y6, Y2 + VPXOR Y7, Y8, Y3 // Load and process 64 bytes from input 1 to 2 outputs - VMOVDQU (BX), Y9 - VMOVDQU 32(BX), Y11 - ADDQ $0x40, BX + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -3499,9 +6438,9 @@ mulAvxTwo_3x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 2 to 2 outputs - VMOVDQU (AX), Y9 - VMOVDQU 32(AX), Y11 - ADDQ $0x40, AX + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -3530,62 +6469,311 @@ mulAvxTwo_3x2_64_loop: VPXOR Y7, Y3, Y3 // Store 2 outputs - MOVQ (SI), R9 - VMOVDQU Y0, (R9)(DI*1) - VMOVDQU Y1, 32(R9)(DI*1) - MOVQ 24(SI), R9 - VMOVDQU Y2, (R9)(DI*1) - VMOVDQU Y3, 32(R9)(DI*1) + VMOVDQU Y0, (R8) + VMOVDQU Y1, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y2, (DI) + VMOVDQU Y3, 32(DI) + ADDQ $0x40, DI // Prepare for next loop - ADDQ $0x40, DI - DECQ R8 + DECQ AX JNZ mulAvxTwo_3x2_64_loop VZEROUPPER mulAvxTwo_3x2_64_end: RET -// func mulAvxTwo_3x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_3x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_3x3(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_3x2Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers - // Full registers estimated 26 YMM used + // Full registers estimated 19 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_3x3_end + JZ mulAvxTwo_3x2Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ (DI), R8 - MOVQ 24(DI), R9 - MOVQ 48(DI), DI - MOVQ start+72(FP), R10 + MOVQ 24(DI), DI + MOVQ start+72(FP), R9 // Add start offset to output - ADDQ R10, R8 - ADDQ R10, R9 - ADDQ R10, DI + ADDQ R9, R8 + ADDQ R9, DI // Add start offset to input - ADDQ R10, BX - ADDQ R10, SI - ADDQ R10, DX - MOVQ $0x0000000f, R10 - MOVQ R10, X3 - VPBROADCASTB X3, Y3 + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DX + MOVQ $0x0000000f, R9 + MOVQ R9, X2 + VPBROADCASTB X2, Y2 -mulAvxTwo_3x3_loop: - // Clear 3 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 +mulAvxTwo_3x2Xor_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y5 + ADDQ $0x20, BX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU (R8), Y0 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU (DI), Y1 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y5 + ADDQ $0x20, SI + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DX), Y5 + ADDQ $0x20, DX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Store 2 outputs + VMOVDQU Y0, (R8) + ADDQ $0x20, R8 + VMOVDQU Y1, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x2Xor_loop + VZEROUPPER + +mulAvxTwo_3x2Xor_end: + RET + +// func mulAvxTwo_3x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_3x2_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 33 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_3x2_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), DI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + ADDQ R9, DI + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DX + MOVQ $0x0000000f, R9 + MOVQ R9, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_3x2_64Xor_loop: + // Load 2 outputs + VMOVDQU (R8), Y0 + VMOVDQU 32(R8), Y1 + VMOVDQU (DI), Y2 + VMOVDQU 32(DI), Y3 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Store 2 outputs + VMOVDQU Y0, (R8) + VMOVDQU Y1, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y2, (DI) + VMOVDQU Y3, 32(DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x2_64Xor_loop + VZEROUPPER + +mulAvxTwo_3x2_64Xor_end: + RET + +// func mulAvxTwo_3x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_3x3(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x3_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), DI + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DI + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DX + MOVQ $0x0000000f, R10 + MOVQ R10, X3 + VPBROADCASTB X3, Y3 +mulAvxTwo_3x3_loop: // Load and process 32 bytes from input 0 to 3 outputs VMOVDQU (BX), Y6 ADDQ $0x20, BX @@ -3596,20 +6784,17 @@ mulAvxTwo_3x3_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + VPXOR Y4, Y5, Y0 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + VPXOR Y4, Y5, Y1 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 + VPXOR Y4, Y5, Y2 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y6 @@ -3681,44 +6866,42 @@ mulAvxTwo_3x3_end: // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x3_64(SB), $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 26 YMM used + // Destination kept in GP registers + // Full registers estimated 46 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_3x3_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), AX - MOVQ out_base+48(FP), SI - MOVQ out_base+48(FP), SI - MOVQ start+72(FP), DI + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), DI + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DI // Add start offset to input - ADDQ DI, DX - ADDQ DI, BX - ADDQ DI, AX - MOVQ $0x0000000f, R8 - MOVQ R8, X6 + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DX + MOVQ $0x0000000f, R10 + MOVQ R10, X6 VPBROADCASTB X6, Y6 - MOVQ n+80(FP), R8 - SHRQ $0x06, R8 mulAvxTwo_3x3_64_loop: - // Clear 3 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - // Load and process 64 bytes from input 0 to 3 outputs - VMOVDQU (DX), Y11 - VMOVDQU 32(DX), Y13 - ADDQ $0x40, DX + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -3731,35 +6914,29 @@ mulAvxTwo_3x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + VPXOR Y7, Y8, Y0 + VPXOR Y9, Y10, Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + VPXOR Y7, Y8, Y2 + VPXOR Y9, Y10, Y3 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 + VPXOR Y7, Y8, Y4 + VPXOR Y9, Y10, Y5 // Load and process 64 bytes from input 1 to 3 outputs - VMOVDQU (BX), Y11 - VMOVDQU 32(BX), Y13 - ADDQ $0x40, BX + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -3798,9 +6975,9 @@ mulAvxTwo_3x3_64_loop: VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 2 to 3 outputs - VMOVDQU (AX), Y11 - VMOVDQU 32(AX), Y13 - ADDQ $0x40, AX + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -3839,36 +7016,35 @@ mulAvxTwo_3x3_64_loop: VPXOR Y9, Y5, Y5 // Store 3 outputs - MOVQ (SI), R9 - VMOVDQU Y0, (R9)(DI*1) - VMOVDQU Y1, 32(R9)(DI*1) - MOVQ 24(SI), R9 - VMOVDQU Y2, (R9)(DI*1) - VMOVDQU Y3, 32(R9)(DI*1) - MOVQ 48(SI), R9 - VMOVDQU Y4, (R9)(DI*1) - VMOVDQU Y5, 32(R9)(DI*1) - - // Prepare for next loop - ADDQ $0x40, DI - DECQ R8 + VMOVDQU Y0, (R8) + VMOVDQU Y1, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y2, (R9) + VMOVDQU Y3, 32(R9) + ADDQ $0x40, R9 + VMOVDQU Y4, (DI) + VMOVDQU Y5, 32(DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX JNZ mulAvxTwo_3x3_64_loop VZEROUPPER mulAvxTwo_3x3_64_end: RET -// func mulAvxTwo_3x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_3x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_3x4(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_3x3Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers - // Full registers estimated 33 YMM used + // Full registers estimated 26 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_3x4_end + JZ mulAvxTwo_3x3Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI @@ -3876,168 +7052,625 @@ TEXT ·mulAvxTwo_3x4(SB), NOSPLIT, $0-88 MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 - MOVQ 48(DI), R10 - MOVQ 72(DI), DI - MOVQ start+72(FP), R11 + MOVQ 48(DI), DI + MOVQ start+72(FP), R10 // Add start offset to output - ADDQ R11, R8 - ADDQ R11, R9 - ADDQ R11, R10 - ADDQ R11, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DI // Add start offset to input - ADDQ R11, BX - ADDQ R11, SI - ADDQ R11, DX - MOVQ $0x0000000f, R11 - MOVQ R11, X4 - VPBROADCASTB X4, Y4 - -mulAvxTwo_3x4_loop: - // Clear 4 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DX + MOVQ $0x0000000f, R10 + MOVQ R10, X3 + VPBROADCASTB X3, Y3 - // Load and process 32 bytes from input 0 to 4 outputs - VMOVDQU (BX), Y7 +mulAvxTwo_3x3Xor_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y6 ADDQ $0x20, BX - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU (CX), Y5 - VMOVDQU 32(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 - VMOVDQU 64(CX), Y5 - VMOVDQU 96(CX), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU (R8), Y0 + VMOVDQU (CX), Y4 + VMOVDQU 32(CX), Y5 + VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 128(CX), Y5 - VMOVDQU 160(CX), Y6 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU (R9), Y1 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y5 + VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 - VMOVDQU 192(CX), Y5 - VMOVDQU 224(CX), Y6 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU (DI), Y2 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 - // Load and process 32 bytes from input 1 to 4 outputs - VMOVDQU (SI), Y7 + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y6 ADDQ $0x20, SI - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU 256(CX), Y5 - VMOVDQU 288(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 - VMOVDQU 320(CX), Y5 - VMOVDQU 352(CX), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 192(CX), Y4 + VMOVDQU 224(CX), Y5 + VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 384(CX), Y5 - VMOVDQU 416(CX), Y6 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 256(CX), Y4 + VMOVDQU 288(CX), Y5 + VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 - VMOVDQU 448(CX), Y5 - VMOVDQU 480(CX), Y6 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 320(CX), Y4 + VMOVDQU 352(CX), Y5 + VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 - // Load and process 32 bytes from input 2 to 4 outputs - VMOVDQU (DX), Y7 + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DX), Y6 ADDQ $0x20, DX - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU 512(CX), Y5 - VMOVDQU 544(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 - VMOVDQU 576(CX), Y5 - VMOVDQU 608(CX), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 384(CX), Y4 + VMOVDQU 416(CX), Y5 + VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 640(CX), Y5 - VMOVDQU 672(CX), Y6 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 448(CX), Y4 + VMOVDQU 480(CX), Y5 + VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 - VMOVDQU 704(CX), Y5 - VMOVDQU 736(CX), Y6 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 512(CX), Y4 + VMOVDQU 544(CX), Y5 + VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 - // Store 4 outputs + // Store 3 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 VMOVDQU Y1, (R9) ADDQ $0x20, R9 - VMOVDQU Y2, (R10) - ADDQ $0x20, R10 - VMOVDQU Y3, (DI) + VMOVDQU Y2, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ AX - JNZ mulAvxTwo_3x4_loop + JNZ mulAvxTwo_3x3Xor_loop VZEROUPPER -mulAvxTwo_3x4_end: +mulAvxTwo_3x3Xor_end: RET -// func mulAvxTwo_3x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_3x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_3x5(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_3x3_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers - // Full registers estimated 40 YMM used + // Full registers estimated 46 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX + SHRQ $0x06, AX TESTQ AX, AX - JZ mulAvxTwo_3x5_end + JZ mulAvxTwo_3x3_64Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 - MOVQ 48(DI), R10 - MOVQ 72(DI), R11 - MOVQ 96(DI), DI - MOVQ start+72(FP), R12 + MOVQ 48(DI), DI + MOVQ start+72(FP), R10 // Add start offset to output - ADDQ R12, R8 - ADDQ R12, R9 + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DI + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DX + MOVQ $0x0000000f, R10 + MOVQ R10, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_3x3_64Xor_loop: + // Load 3 outputs + VMOVDQU (R8), Y0 + VMOVDQU 32(R8), Y1 + VMOVDQU (R9), Y2 + VMOVDQU 32(R9), Y3 + VMOVDQU (DI), Y4 + VMOVDQU 32(DI), Y5 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Store 3 outputs + VMOVDQU Y0, (R8) + VMOVDQU Y1, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y2, (R9) + VMOVDQU Y3, 32(R9) + ADDQ $0x40, R9 + VMOVDQU Y4, (DI) + VMOVDQU Y5, 32(DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x3_64Xor_loop + VZEROUPPER + +mulAvxTwo_3x3_64Xor_end: + RET + +// func mulAvxTwo_3x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_3x4(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 33 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x4_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), DI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_3x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y0 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y1 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y2 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y3 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Store 4 outputs + VMOVDQU Y0, (R8) + ADDQ $0x20, R8 + VMOVDQU Y1, (R9) + ADDQ $0x20, R9 + VMOVDQU Y2, (R10) + ADDQ $0x20, R10 + VMOVDQU Y3, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x4_loop + VZEROUPPER + +mulAvxTwo_3x4_end: + RET + +// func mulAvxTwo_3x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_3x4Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 33 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x4Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), DI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_3x4Xor_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (R8), Y0 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU (R9), Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU (R10), Y2 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU (DI), Y3 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Store 4 outputs + VMOVDQU Y0, (R8) + ADDQ $0x20, R8 + VMOVDQU Y1, (R9) + ADDQ $0x20, R9 + VMOVDQU Y2, (R10) + ADDQ $0x20, R10 + VMOVDQU Y3, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x4Xor_loop + VZEROUPPER + +mulAvxTwo_3x4Xor_end: + RET + +// func mulAvxTwo_3x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_3x5(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 40 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x5_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), DI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R8 + ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, R11 ADDQ R12, DI @@ -4051,13 +7684,6 @@ TEXT ·mulAvxTwo_3x5(SB), NOSPLIT, $0-88 VPBROADCASTB X5, Y5 mulAvxTwo_3x5_loop: - // Clear 5 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX @@ -4068,32 +7694,27 @@ mulAvxTwo_3x5_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + VPXOR Y6, Y7, Y0 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + VPXOR Y6, Y7, Y1 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + VPXOR Y6, Y7, Y2 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + VPXOR Y6, Y7, Y3 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPXOR Y6, Y7, Y4 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 @@ -4189,17 +7810,17 @@ mulAvxTwo_3x5_loop: mulAvxTwo_3x5_end: RET -// func mulAvxTwo_3x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_3x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_3x6(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_3x5Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers - // Full registers estimated 47 YMM used + // Full registers estimated 40 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_3x6_end + JZ mulAvxTwo_3x5Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI @@ -4209,71 +7830,430 @@ TEXT ·mulAvxTwo_3x6(SB), NOSPLIT, $0-88 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 - MOVQ 96(DI), R12 - MOVQ 120(DI), DI - MOVQ start+72(FP), R13 + MOVQ 96(DI), DI + MOVQ start+72(FP), R12 // Add start offset to output - ADDQ R13, R8 - ADDQ R13, R9 - ADDQ R13, R10 - ADDQ R13, R11 - ADDQ R13, R12 - ADDQ R13, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, DI // Add start offset to input - ADDQ R13, BX - ADDQ R13, SI - ADDQ R13, DX - MOVQ $0x0000000f, R13 - MOVQ R13, X6 - VPBROADCASTB X6, Y6 - -mulAvxTwo_3x6_loop: - // Clear 6 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X5 + VPBROADCASTB X5, Y5 - // Load and process 32 bytes from input 0 to 6 outputs - VMOVDQU (BX), Y9 +mulAvxTwo_3x5Xor_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y8 ADDQ $0x20, BX - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (R8), Y0 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU (R9), Y1 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU (R10), Y2 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU (R11), Y3 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU (DI), Y4 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Store 5 outputs + VMOVDQU Y0, (R8) + ADDQ $0x20, R8 + VMOVDQU Y1, (R9) + ADDQ $0x20, R9 + VMOVDQU Y2, (R10) + ADDQ $0x20, R10 + VMOVDQU Y3, (R11) + ADDQ $0x20, R11 + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x5Xor_loop + VZEROUPPER + +mulAvxTwo_3x5Xor_end: + RET + +// func mulAvxTwo_3x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_3x6(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x6_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), DI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_3x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y0 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y1 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y2 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y3 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y4 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y5 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Store 6 outputs + VMOVDQU Y0, (R8) + ADDQ $0x20, R8 + VMOVDQU Y1, (R9) + ADDQ $0x20, R9 + VMOVDQU Y2, (R10) + ADDQ $0x20, R10 + VMOVDQU Y3, (R11) + ADDQ $0x20, R11 + VMOVDQU Y4, (R12) + ADDQ $0x20, R12 + VMOVDQU Y5, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x6_loop + VZEROUPPER + +mulAvxTwo_3x6_end: + RET + +// func mulAvxTwo_3x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_3x6Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x6Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), DI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_3x6Xor_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 + VMOVDQU (R8), Y0 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 + VMOVDQU (R9), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 + VMOVDQU (R10), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 + VMOVDQU (R11), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 + VMOVDQU (R12), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 + VMOVDQU (DI), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 @@ -4383,10 +8363,10 @@ mulAvxTwo_3x6_loop: // Prepare for next loop DECQ AX - JNZ mulAvxTwo_3x6_loop + JNZ mulAvxTwo_3x6Xor_loop VZEROUPPER -mulAvxTwo_3x6_end: +mulAvxTwo_3x6Xor_end: RET // func mulAvxTwo_3x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -4432,15 +8412,6 @@ TEXT ·mulAvxTwo_3x7(SB), NOSPLIT, $0-88 VPBROADCASTB X7, Y7 mulAvxTwo_3x7_loop: - // Clear 7 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX @@ -4451,44 +8422,37 @@ mulAvxTwo_3x7_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + VPXOR Y8, Y9, Y0 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + VPXOR Y8, Y9, Y1 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + VPXOR Y8, Y9, Y2 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + VPXOR Y8, Y9, Y3 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + VPXOR Y8, Y9, Y4 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + VPXOR Y8, Y9, Y5 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPXOR Y8, Y9, Y6 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 @@ -4612,17 +8576,17 @@ mulAvxTwo_3x7_loop: mulAvxTwo_3x7_end: RET -// func mulAvxTwo_3x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_3x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_3x8(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_3x7Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers - // Full registers estimated 61 YMM used + // Full registers estimated 54 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_3x8_end + JZ mulAvxTwo_3x7Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI @@ -4634,104 +8598,307 @@ TEXT ·mulAvxTwo_3x8(SB), NOSPLIT, $0-88 MOVQ 72(DI), R11 MOVQ 96(DI), R12 MOVQ 120(DI), R13 - MOVQ 144(DI), R14 - MOVQ 168(DI), DI - MOVQ start+72(FP), R15 + MOVQ 144(DI), DI + MOVQ start+72(FP), R14 // Add start offset to output - ADDQ R15, R8 - ADDQ R15, R9 - ADDQ R15, R10 - ADDQ R15, R11 - ADDQ R15, R12 - ADDQ R15, R13 - ADDQ R15, R14 - ADDQ R15, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, DI // Add start offset to input - ADDQ R15, BX - ADDQ R15, SI - ADDQ R15, DX - MOVQ $0x0000000f, R15 - MOVQ R15, X8 - VPBROADCASTB X8, Y8 - -mulAvxTwo_3x8_loop: - // Clear 8 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X7 + VPBROADCASTB X7, Y7 - // Load and process 32 bytes from input 0 to 8 outputs - VMOVDQU (BX), Y11 +mulAvxTwo_3x7Xor_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y10 ADDQ $0x20, BX - VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU (CX), Y9 - VMOVDQU 32(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 64(CX), Y9 - VMOVDQU 96(CX), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (R8), Y0 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 - VMOVDQU 128(CX), Y9 - VMOVDQU 160(CX), Y10 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU (R9), Y1 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 192(CX), Y9 - VMOVDQU 224(CX), Y10 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU (R10), Y2 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 - VMOVDQU 256(CX), Y9 - VMOVDQU 288(CX), Y10 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU (R11), Y3 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 320(CX), Y9 - VMOVDQU 352(CX), Y10 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU (R12), Y4 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 - VMOVDQU 384(CX), Y9 - VMOVDQU 416(CX), Y10 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU (R13), Y5 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 448(CX), Y9 - VMOVDQU 480(CX), Y10 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU (DI), Y6 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Load and process 32 bytes from input 1 to 8 outputs - VMOVDQU (SI), Y11 + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y10 ADDQ $0x20, SI - VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU 512(CX), Y9 - VMOVDQU 544(CX), Y10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Store 7 outputs + VMOVDQU Y0, (R8) + ADDQ $0x20, R8 + VMOVDQU Y1, (R9) + ADDQ $0x20, R9 + VMOVDQU Y2, (R10) + ADDQ $0x20, R10 + VMOVDQU Y3, (R11) + ADDQ $0x20, R11 + VMOVDQU Y4, (R12) + ADDQ $0x20, R12 + VMOVDQU Y5, (R13) + ADDQ $0x20, R13 + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x7Xor_loop + VZEROUPPER + +mulAvxTwo_3x7Xor_end: + RET + +// func mulAvxTwo_3x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_3x8(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 61 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x8_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), DI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, DI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X8 + VPBROADCASTB X8, Y8 + +mulAvxTwo_3x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y0 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y1 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y2 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y3 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y4 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y5 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y6 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y7 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 576(CX), Y9 @@ -4858,6 +9025,250 @@ mulAvxTwo_3x8_loop: mulAvxTwo_3x8_end: RET +// func mulAvxTwo_3x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_3x8Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 61 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x8Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), DI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, DI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X8 + VPBROADCASTB X8, Y8 + +mulAvxTwo_3x8Xor_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (R8), Y0 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU (R9), Y1 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU (R10), Y2 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU (R11), Y3 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU (R12), Y4 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU (R13), Y5 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU (R14), Y6 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU (DI), Y7 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Store 8 outputs + VMOVDQU Y0, (R8) + ADDQ $0x20, R8 + VMOVDQU Y1, (R9) + ADDQ $0x20, R9 + VMOVDQU Y2, (R10) + ADDQ $0x20, R10 + VMOVDQU Y3, (R11) + ADDQ $0x20, R11 + VMOVDQU Y4, (R12) + ADDQ $0x20, R12 + VMOVDQU Y5, (R13) + ADDQ $0x20, R13 + VMOVDQU Y6, (R14) + ADDQ $0x20, R14 + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x8Xor_loop + VZEROUPPER + +mulAvxTwo_3x8Xor_end: + RET + // func mulAvxTwo_3x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x9(SB), NOSPLIT, $8-88 @@ -4905,17 +9316,6 @@ TEXT ·mulAvxTwo_3x9(SB), NOSPLIT, $8-88 VPBROADCASTB X9, Y9 mulAvxTwo_3x9_loop: - // Clear 9 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 - // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y12 ADDQ $0x20, BX @@ -4926,56 +9326,47 @@ mulAvxTwo_3x9_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + VPXOR Y10, Y11, Y0 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + VPXOR Y10, Y11, Y1 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + VPXOR Y10, Y11, Y2 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + VPXOR Y10, Y11, Y3 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + VPXOR Y10, Y11, Y4 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + VPXOR Y10, Y11, Y5 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + VPXOR Y10, Y11, Y6 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + VPXOR Y10, Y11, Y7 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPXOR Y10, Y11, Y8 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 @@ -5127,36 +9518,34 @@ mulAvxTwo_3x9_loop: mulAvxTwo_3x9_end: RET -// func mulAvxTwo_3x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_3x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_3x10(SB), NOSPLIT, $8-88 +TEXT ·mulAvxTwo_3x9Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers - // Full registers estimated 75 YMM used + // Full registers estimated 68 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_3x10_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), AX - MOVQ out_base+48(FP), SI - MOVQ (SI), DI - MOVQ 24(SI), R8 - MOVQ 48(SI), R9 - MOVQ 72(SI), R10 - MOVQ 96(SI), R11 - MOVQ 120(SI), R12 - MOVQ 144(SI), R13 - MOVQ 168(SI), R14 - MOVQ 192(SI), R15 - MOVQ 216(SI), SI + JZ mulAvxTwo_3x9Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI MOVQ start+72(FP), BP // Add start offset to output - ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 @@ -5165,97 +9554,344 @@ TEXT ·mulAvxTwo_3x10(SB), NOSPLIT, $8-88 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 - ADDQ BP, SI + ADDQ BP, DI // Add start offset to input - ADDQ BP, DX ADDQ BP, BX - ADDQ BP, AX + ADDQ BP, SI + ADDQ BP, DX MOVQ $0x0000000f, BP - MOVQ BP, X10 - VPBROADCASTB X10, Y10 - MOVQ n+80(FP), BP - SHRQ $0x05, BP - -mulAvxTwo_3x10_loop: - // Clear 10 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 - VPXOR Y9, Y9, Y9 + MOVQ BP, X9 + VPBROADCASTB X9, Y9 - // Load and process 32 bytes from input 0 to 10 outputs - VMOVDQU (DX), Y13 - ADDQ $0x20, DX - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VMOVDQU (CX), Y11 +mulAvxTwo_3x9Xor_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU (R8), Y0 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU (R9), Y1 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU (R10), Y2 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU (R11), Y3 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU (R12), Y4 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU (R13), Y5 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU (R14), Y6 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU (R15), Y7 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU (DI), Y8 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1152(CX), Y10 + VMOVDQU 1184(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 1216(CX), Y10 + VMOVDQU 1248(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 1280(CX), Y10 + VMOVDQU 1312(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 1344(CX), Y10 + VMOVDQU 1376(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 1408(CX), Y10 + VMOVDQU 1440(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 1472(CX), Y10 + VMOVDQU 1504(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 1536(CX), Y10 + VMOVDQU 1568(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1600(CX), Y10 + VMOVDQU 1632(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1664(CX), Y10 + VMOVDQU 1696(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Store 9 outputs + VMOVDQU Y0, (R8) + ADDQ $0x20, R8 + VMOVDQU Y1, (R9) + ADDQ $0x20, R9 + VMOVDQU Y2, (R10) + ADDQ $0x20, R10 + VMOVDQU Y3, (R11) + ADDQ $0x20, R11 + VMOVDQU Y4, (R12) + ADDQ $0x20, R12 + VMOVDQU Y5, (R13) + ADDQ $0x20, R13 + VMOVDQU Y6, (R14) + ADDQ $0x20, R14 + VMOVDQU Y7, (R15) + ADDQ $0x20, R15 + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x9Xor_loop + VZEROUPPER + +mulAvxTwo_3x9Xor_end: + RET + +// func mulAvxTwo_3x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_3x10(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 75 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x10_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), AX + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X10 + VPBROADCASTB X10, Y10 + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxTwo_3x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU (CX), Y11 VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + VPXOR Y11, Y12, Y0 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + VPXOR Y11, Y12, Y1 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + VPXOR Y11, Y12, Y2 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + VPXOR Y11, Y12, Y3 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + VPXOR Y11, Y12, Y4 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + VPXOR Y11, Y12, Y5 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + VPXOR Y11, Y12, Y6 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + VPXOR Y11, Y12, Y7 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + VPXOR Y11, Y12, Y8 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 + VPXOR Y11, Y12, Y9 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (BX), Y13 @@ -5421,146 +10057,432 @@ mulAvxTwo_3x10_loop: mulAvxTwo_3x10_end: RET -// func mulAvxTwo_4x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_4x1(SB), NOSPLIT, $0-88 - // Loading all tables to registers - // Destination kept in GP registers - // Full registers estimated 12 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_4x1_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VMOVDQU 192(CX), Y6 - VMOVDQU 224(CX), Y7 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), BX - MOVQ 48(CX), SI - MOVQ 72(CX), CX - MOVQ out_base+48(FP), DI - MOVQ (DI), DI - MOVQ start+72(FP), R8 - - // Add start offset to output - ADDQ R8, DI - - // Add start offset to input - ADDQ R8, DX - ADDQ R8, BX - ADDQ R8, SI - ADDQ R8, CX - MOVQ $0x0000000f, R8 - MOVQ R8, X9 - VPBROADCASTB X9, Y9 - -mulAvxTwo_4x1_loop: - // Clear 1 outputs - VPXOR Y8, Y8, Y8 - - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y10 - ADDQ $0x20, DX - VPSRLQ $0x04, Y10, Y11 - VPAND Y9, Y10, Y10 - VPAND Y9, Y11, Y11 - VPSHUFB Y10, Y0, Y10 - VPSHUFB Y11, Y1, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y10 - ADDQ $0x20, BX - VPSRLQ $0x04, Y10, Y11 - VPAND Y9, Y10, Y10 - VPAND Y9, Y11, Y11 - VPSHUFB Y10, Y2, Y10 - VPSHUFB Y11, Y3, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 - - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (SI), Y10 - ADDQ $0x20, SI - VPSRLQ $0x04, Y10, Y11 - VPAND Y9, Y10, Y10 - VPAND Y9, Y11, Y11 - VPSHUFB Y10, Y4, Y10 - VPSHUFB Y11, Y5, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 - - // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (CX), Y10 - ADDQ $0x20, CX - VPSRLQ $0x04, Y10, Y11 - VPAND Y9, Y10, Y10 - VPAND Y9, Y11, Y11 - VPSHUFB Y10, Y6, Y10 - VPSHUFB Y11, Y7, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 - - // Store 1 outputs - VMOVDQU Y8, (DI) - ADDQ $0x20, DI - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_4x1_loop - VZEROUPPER - -mulAvxTwo_4x1_end: - RET - -// func mulAvxTwo_4x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_3x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_4x1_64(SB), $0-88 +TEXT ·mulAvxTwo_3x10Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 12 YMM used + // Destination kept in GP registers + // Full registers estimated 75 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX - SHRQ $0x06, AX + SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_4x1_64_end + JZ mulAvxTwo_3x10Xor_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), AX - MOVQ out_base+48(FP), DI - MOVQ out_base+48(FP), DI - MOVQ start+72(FP), R8 + MOVQ 48(AX), AX + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI // Add start offset to input - ADDQ R8, DX - ADDQ R8, BX - ADDQ R8, SI - ADDQ R8, AX - MOVQ $0x0000000f, R9 - MOVQ R9, X2 - VPBROADCASTB X2, Y2 - MOVQ n+80(FP), R9 - SHRQ $0x06, R9 + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X10 + VPBROADCASTB X10, Y10 + MOVQ n+80(FP), BP + SHRQ $0x05, BP -mulAvxTwo_4x1_64_loop: - // Clear 1 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 +mulAvxTwo_3x10Xor_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU (DI), Y0 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU (R8), Y1 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU (R9), Y2 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU (R10), Y3 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU (R11), Y4 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU (R12), Y5 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU (R13), Y6 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU (R14), Y7 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU (R15), Y8 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU (SI), Y9 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 - // Load and process 64 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y6 - VMOVDQU 32(DX), Y5 - ADDQ $0x40, DX + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (BX), Y13 + ADDQ $0x20, BX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1024(CX), Y11 + VMOVDQU 1056(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 1088(CX), Y11 + VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (AX), Y13 + ADDQ $0x20, AX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1280(CX), Y11 + VMOVDQU 1312(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 1344(CX), Y11 + VMOVDQU 1376(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 1408(CX), Y11 + VMOVDQU 1440(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 1472(CX), Y11 + VMOVDQU 1504(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 1536(CX), Y11 + VMOVDQU 1568(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 1600(CX), Y11 + VMOVDQU 1632(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1664(CX), Y11 + VMOVDQU 1696(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 1728(CX), Y11 + VMOVDQU 1760(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1792(CX), Y11 + VMOVDQU 1824(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1856(CX), Y11 + VMOVDQU 1888(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Store 10 outputs + VMOVDQU Y0, (DI) + ADDQ $0x20, DI + VMOVDQU Y1, (R8) + ADDQ $0x20, R8 + VMOVDQU Y2, (R9) + ADDQ $0x20, R9 + VMOVDQU Y3, (R10) + ADDQ $0x20, R10 + VMOVDQU Y4, (R11) + ADDQ $0x20, R11 + VMOVDQU Y5, (R12) + ADDQ $0x20, R12 + VMOVDQU Y6, (R13) + ADDQ $0x20, R13 + VMOVDQU Y7, (R14) + ADDQ $0x20, R14 + VMOVDQU Y8, (R15) + ADDQ $0x20, R15 + VMOVDQU Y9, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ BP + JNZ mulAvxTwo_3x10Xor_loop + VZEROUPPER + +mulAvxTwo_3x10Xor_end: + RET + +// func mulAvxTwo_4x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_4x1(SB), NOSPLIT, $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x1_end + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y1 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ (DI), DI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, CX + MOVQ $0x0000000f, R8 + MOVQ R8, X9 + VPBROADCASTB X9, Y9 + +mulAvxTwo_4x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y9, Y10, Y10 + VPAND Y9, Y11, Y11 + VPSHUFB Y10, Y0, Y10 + VPSHUFB Y11, Y1, Y11 + VPXOR Y10, Y11, Y8 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VPSRLQ $0x04, Y10, Y11 + VPAND Y9, Y10, Y10 + VPAND Y9, Y11, Y11 + VPSHUFB Y10, Y2, Y10 + VPSHUFB Y11, Y3, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VPSRLQ $0x04, Y10, Y11 + VPAND Y9, Y10, Y10 + VPAND Y9, Y11, Y11 + VPSHUFB Y10, Y4, Y10 + VPSHUFB Y11, Y5, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VPSRLQ $0x04, Y10, Y11 + VPAND Y9, Y10, Y10 + VPAND Y9, Y11, Y11 + VPSHUFB Y10, Y6, Y10 + VPSHUFB Y11, Y7, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Store 1 outputs + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x1_loop + VZEROUPPER + +mulAvxTwo_4x1_end: + RET + +// func mulAvxTwo_4x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_4x1_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_4x1_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R8 + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, DX + MOVQ $0x0000000f, R9 + MOVQ R9, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_4x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -5573,15 +10495,13 @@ mulAvxTwo_4x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 + VPXOR Y3, Y4, Y0 + VPXOR Y5, Y6, Y1 // Load and process 64 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y6 - VMOVDQU 32(BX), Y5 - ADDQ $0x40, BX + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -5600,9 +10520,9 @@ mulAvxTwo_4x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 2 to 1 outputs - VMOVDQU (SI), Y6 - VMOVDQU 32(SI), Y5 - ADDQ $0x40, SI + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -5621,9 +10541,9 @@ mulAvxTwo_4x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 3 to 1 outputs - VMOVDQU (AX), Y6 - VMOVDQU 32(AX), Y5 - ADDQ $0x40, AX + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -5642,95 +10562,321 @@ mulAvxTwo_4x1_64_loop: VPXOR Y5, Y1, Y1 // Store 1 outputs - MOVQ (DI), R10 - VMOVDQU Y0, (R10)(R8*1) - VMOVDQU Y1, 32(R10)(R8*1) + VMOVDQU Y0, (R8) + VMOVDQU Y1, 32(R8) + ADDQ $0x40, R8 // Prepare for next loop - ADDQ $0x40, R8 - DECQ R9 + DECQ AX JNZ mulAvxTwo_4x1_64_loop VZEROUPPER mulAvxTwo_4x1_64_end: RET -// func mulAvxTwo_4x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_4x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_4x2(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_4x1Xor(SB), NOSPLIT, $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x1Xor_end + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y1 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ (DI), DI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, CX + MOVQ $0x0000000f, R8 + MOVQ R8, X9 + VPBROADCASTB X9, Y9 + +mulAvxTwo_4x1Xor_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y9, Y10, Y10 + VPAND Y9, Y11, Y11 + VMOVDQU (DI), Y8 + VPSHUFB Y10, Y0, Y10 + VPSHUFB Y11, Y1, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VPSRLQ $0x04, Y10, Y11 + VPAND Y9, Y10, Y10 + VPAND Y9, Y11, Y11 + VPSHUFB Y10, Y2, Y10 + VPSHUFB Y11, Y3, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VPSRLQ $0x04, Y10, Y11 + VPAND Y9, Y10, Y10 + VPAND Y9, Y11, Y11 + VPSHUFB Y10, Y4, Y10 + VPSHUFB Y11, Y5, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VPSRLQ $0x04, Y10, Y11 + VPAND Y9, Y10, Y10 + VPAND Y9, Y11, Y11 + VPSHUFB Y10, Y6, Y10 + VPSHUFB Y11, Y7, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Store 1 outputs + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x1Xor_loop + VZEROUPPER + +mulAvxTwo_4x1Xor_end: + RET + +// func mulAvxTwo_4x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_4x1_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers - // Full registers estimated 23 YMM used + // Full registers estimated 22 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX + SHRQ $0x06, AX TESTQ AX, AX - JZ mulAvxTwo_4x2_end + JZ mulAvxTwo_4x1_64Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 - MOVQ (R8), R9 - MOVQ 24(R8), R8 - MOVQ start+72(FP), R10 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R8 + MOVQ start+72(FP), R9 // Add start offset to output - ADDQ R10, R9 - ADDQ R10, R8 + ADDQ R9, R8 // Add start offset to input - ADDQ R10, BX - ADDQ R10, SI - ADDQ R10, DI - ADDQ R10, DX - MOVQ $0x0000000f, R10 - MOVQ R10, X2 + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, DX + MOVQ $0x0000000f, R9 + MOVQ R9, X2 VPBROADCASTB X2, Y2 -mulAvxTwo_4x2_loop: - // Clear 2 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 +mulAvxTwo_4x1_64Xor_loop: + // Load 1 outputs + VMOVDQU (R8), Y0 + VMOVDQU 32(R8), Y1 - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 VMOVDQU (CX), Y3 VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 VMOVDQU 128(CX), Y3 VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Store 1 outputs + VMOVDQU Y0, (R8) + VMOVDQU Y1, 32(R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x1_64Xor_loop + VZEROUPPER + +mulAvxTwo_4x1_64Xor_end: + RET + +// func mulAvxTwo_4x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_4x2(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x2_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R8 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + ADDQ R10, R8 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, DX + MOVQ $0x0000000f, R10 + MOVQ R10, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_4x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y5 + ADDQ $0x20, BX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y0 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y1 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y5 + ADDQ $0x20, SI + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 @@ -5788,41 +10934,128 @@ mulAvxTwo_4x2_end: // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x2_64(SB), $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 23 YMM used + // Destination kept in GP registers + // Full registers estimated 41 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_4x2_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), AX - MOVQ out_base+48(FP), DI - MOVQ out_base+48(FP), DI - MOVQ start+72(FP), R8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R8 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + ADDQ R10, R8 // Add start offset to input - ADDQ R8, DX - ADDQ R8, BX - ADDQ R8, SI - ADDQ R8, AX - MOVQ $0x0000000f, R9 - MOVQ R9, X4 + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, DX + MOVQ $0x0000000f, R10 + MOVQ R10, X4 VPBROADCASTB X4, Y4 - MOVQ n+80(FP), R9 - SHRQ $0x06, R9 mulAvxTwo_4x2_64_loop: - // Clear 2 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y0 + VPXOR Y7, Y8, Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y2 + VPXOR Y7, Y8, Y3 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 ADDQ $0x40, DX @@ -5832,6 +11065,222 @@ mulAvxTwo_4x2_64_loop: VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Store 2 outputs + VMOVDQU Y0, (R9) + VMOVDQU Y1, 32(R9) + ADDQ $0x40, R9 + VMOVDQU Y2, (R8) + VMOVDQU Y3, 32(R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x2_64_loop + VZEROUPPER + +mulAvxTwo_4x2_64_end: + RET + +// func mulAvxTwo_4x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_4x2Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x2Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R8 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + ADDQ R10, R8 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, DX + MOVQ $0x0000000f, R10 + MOVQ R10, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_4x2Xor_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y5 + ADDQ $0x20, BX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU (R9), Y0 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU (R8), Y1 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y5 + ADDQ $0x20, SI + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y5 + ADDQ $0x20, DI + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (DX), Y5 + ADDQ $0x20, DX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Store 2 outputs + VMOVDQU Y0, (R9) + ADDQ $0x20, R9 + VMOVDQU Y1, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x2Xor_loop + VZEROUPPER + +mulAvxTwo_4x2Xor_end: + RET + +// func mulAvxTwo_4x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_4x2_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 41 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_4x2_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R8 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + ADDQ R10, R8 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, DX + MOVQ $0x0000000f, R10 + MOVQ R10, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_4x2_64Xor_loop: + // Load 2 outputs + VMOVDQU (R9), Y0 + VMOVDQU 32(R9), Y1 + VMOVDQU (R8), Y2 + VMOVDQU 32(R8), Y3 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y11, Y5, Y7 @@ -5854,9 +11303,9 @@ mulAvxTwo_4x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 1 to 2 outputs - VMOVDQU (BX), Y9 - VMOVDQU 32(BX), Y11 - ADDQ $0x40, BX + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -5885,9 +11334,9 @@ mulAvxTwo_4x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 2 to 2 outputs - VMOVDQU (SI), Y9 - VMOVDQU 32(SI), Y11 - ADDQ $0x40, SI + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -5916,9 +11365,9 @@ mulAvxTwo_4x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 3 to 2 outputs - VMOVDQU (AX), Y9 - VMOVDQU 32(AX), Y11 - ADDQ $0x40, AX + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -5947,20 +11396,19 @@ mulAvxTwo_4x2_64_loop: VPXOR Y7, Y3, Y3 // Store 2 outputs - MOVQ (DI), R10 - VMOVDQU Y0, (R10)(R8*1) - VMOVDQU Y1, 32(R10)(R8*1) - MOVQ 24(DI), R10 - VMOVDQU Y2, (R10)(R8*1) - VMOVDQU Y3, 32(R10)(R8*1) + VMOVDQU Y0, (R9) + VMOVDQU Y1, 32(R9) + ADDQ $0x40, R9 + VMOVDQU Y2, (R8) + VMOVDQU Y3, 32(R8) + ADDQ $0x40, R8 // Prepare for next loop - ADDQ $0x40, R8 - DECQ R9 - JNZ mulAvxTwo_4x2_64_loop + DECQ AX + JNZ mulAvxTwo_4x2_64Xor_loop VZEROUPPER -mulAvxTwo_4x2_64_end: +mulAvxTwo_4x2_64Xor_end: RET // func mulAvxTwo_4x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -6000,11 +11448,6 @@ TEXT ·mulAvxTwo_4x3(SB), NOSPLIT, $0-88 VPBROADCASTB X3, Y3 mulAvxTwo_4x3_loop: - // Clear 3 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - // Load and process 32 bytes from input 0 to 3 outputs VMOVDQU (BX), Y6 ADDQ $0x20, BX @@ -6015,20 +11458,17 @@ mulAvxTwo_4x3_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + VPXOR Y4, Y5, Y0 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + VPXOR Y4, Y5, Y1 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 + VPXOR Y4, Y5, Y2 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y6 @@ -6125,46 +11565,44 @@ mulAvxTwo_4x3_end: // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x3_64(SB), $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 32 YMM used + // Destination kept in GP registers + // Full registers estimated 58 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_4x3_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), AX - MOVQ out_base+48(FP), DI - MOVQ out_base+48(FP), DI - MOVQ start+72(FP), R8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R8 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, R8 // Add start offset to input - ADDQ R8, DX - ADDQ R8, BX - ADDQ R8, SI - ADDQ R8, AX - MOVQ $0x0000000f, R9 - MOVQ R9, X6 + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X6 VPBROADCASTB X6, Y6 - MOVQ n+80(FP), R9 - SHRQ $0x06, R9 mulAvxTwo_4x3_64_loop: - // Clear 3 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - // Load and process 64 bytes from input 0 to 3 outputs - VMOVDQU (DX), Y11 - VMOVDQU 32(DX), Y13 - ADDQ $0x40, DX + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -6177,36 +11615,30 @@ mulAvxTwo_4x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + VPXOR Y7, Y8, Y0 + VPXOR Y9, Y10, Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + VPXOR Y7, Y8, Y2 + VPXOR Y9, Y10, Y3 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 + VPXOR Y7, Y8, Y4 + VPXOR Y9, Y10, Y5 // Load and process 64 bytes from input 1 to 3 outputs - VMOVDQU (BX), Y11 - VMOVDQU 32(BX), Y13 - ADDQ $0x40, BX - VPSRLQ $0x04, Y11, Y12 + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI + VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 @@ -6244,6 +11676,350 @@ mulAvxTwo_4x3_64_loop: VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y11 + VMOVDQU 32(DI), Y13 + ADDQ $0x40, DI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Store 3 outputs + VMOVDQU Y0, (R9) + VMOVDQU Y1, 32(R9) + ADDQ $0x40, R9 + VMOVDQU Y2, (R10) + VMOVDQU Y3, 32(R10) + ADDQ $0x40, R10 + VMOVDQU Y4, (R8) + VMOVDQU Y5, 32(R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x3_64_loop + VZEROUPPER + +mulAvxTwo_4x3_64_end: + RET + +// func mulAvxTwo_4x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_4x3Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x3Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R8 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, R8 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X3 + VPBROADCASTB X3, Y3 + +mulAvxTwo_4x3Xor_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y6 + ADDQ $0x20, BX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU (R9), Y0 + VMOVDQU (CX), Y4 + VMOVDQU 32(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU (R10), Y1 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU (R8), Y2 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y6 + ADDQ $0x20, SI + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 192(CX), Y4 + VMOVDQU 224(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 256(CX), Y4 + VMOVDQU 288(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 320(CX), Y4 + VMOVDQU 352(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y6 + ADDQ $0x20, DI + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 384(CX), Y4 + VMOVDQU 416(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 448(CX), Y4 + VMOVDQU 480(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 512(CX), Y4 + VMOVDQU 544(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 576(CX), Y4 + VMOVDQU 608(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 640(CX), Y4 + VMOVDQU 672(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 704(CX), Y4 + VMOVDQU 736(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Store 3 outputs + VMOVDQU Y0, (R9) + ADDQ $0x20, R9 + VMOVDQU Y1, (R10) + ADDQ $0x20, R10 + VMOVDQU Y2, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x3Xor_loop + VZEROUPPER + +mulAvxTwo_4x3Xor_end: + RET + +// func mulAvxTwo_4x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_4x3_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_4x3_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R8 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, R8 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_4x3_64Xor_loop: + // Load 3 outputs + VMOVDQU (R9), Y0 + VMOVDQU 32(R9), Y1 + VMOVDQU (R10), Y2 + VMOVDQU 32(R10), Y3 + VMOVDQU (R8), Y4 + VMOVDQU 32(R8), Y5 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (SI), Y11 VMOVDQU 32(SI), Y13 ADDQ $0x40, SI @@ -6253,6 +12029,47 @@ mulAvxTwo_4x3_64_loop: VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y11 + VMOVDQU 32(DI), Y13 + ADDQ $0x40, DI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y13, Y7, Y9 @@ -6285,9 +12102,9 @@ mulAvxTwo_4x3_64_loop: VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 3 to 3 outputs - VMOVDQU (AX), Y11 - VMOVDQU 32(AX), Y13 - ADDQ $0x40, AX + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -6326,23 +12143,22 @@ mulAvxTwo_4x3_64_loop: VPXOR Y9, Y5, Y5 // Store 3 outputs - MOVQ (DI), R10 - VMOVDQU Y0, (R10)(R8*1) - VMOVDQU Y1, 32(R10)(R8*1) - MOVQ 24(DI), R10 - VMOVDQU Y2, (R10)(R8*1) - VMOVDQU Y3, 32(R10)(R8*1) - MOVQ 48(DI), R10 - VMOVDQU Y4, (R10)(R8*1) - VMOVDQU Y5, 32(R10)(R8*1) - - // Prepare for next loop - ADDQ $0x40, R8 - DECQ R9 - JNZ mulAvxTwo_4x3_64_loop + VMOVDQU Y0, (R9) + VMOVDQU Y1, 32(R9) + ADDQ $0x40, R9 + VMOVDQU Y2, (R10) + VMOVDQU Y3, 32(R10) + ADDQ $0x40, R10 + VMOVDQU Y4, (R8) + VMOVDQU Y5, 32(R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x3_64Xor_loop VZEROUPPER -mulAvxTwo_4x3_64_end: +mulAvxTwo_4x3_64Xor_end: RET // func mulAvxTwo_4x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -6384,12 +12200,6 @@ TEXT ·mulAvxTwo_4x4(SB), NOSPLIT, $0-88 VPBROADCASTB X4, Y4 mulAvxTwo_4x4_loop: - // Clear 4 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y7 ADDQ $0x20, BX @@ -6400,26 +12210,22 @@ mulAvxTwo_4x4_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + VPXOR Y5, Y6, Y0 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + VPXOR Y5, Y6, Y1 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + VPXOR Y5, Y6, Y2 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPXOR Y5, Y6, Y3 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 @@ -6532,17 +12338,17 @@ mulAvxTwo_4x4_loop: mulAvxTwo_4x4_end: RET -// func mulAvxTwo_4x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_4x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_4x5(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_4x4Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers - // Full registers estimated 50 YMM used + // Full registers estimated 41 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_4x5_end + JZ mulAvxTwo_4x4Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI @@ -6552,89 +12358,262 @@ TEXT ·mulAvxTwo_4x5(SB), NOSPLIT, $0-88 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 48(R8), R11 - MOVQ 72(R8), R12 - MOVQ 96(R8), R8 - MOVQ start+72(FP), R13 + MOVQ 72(R8), R8 + MOVQ start+72(FP), R12 // Add start offset to output - ADDQ R13, R9 - ADDQ R13, R10 - ADDQ R13, R11 - ADDQ R13, R12 - ADDQ R13, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R8 // Add start offset to input - ADDQ R13, BX - ADDQ R13, SI - ADDQ R13, DI - ADDQ R13, DX - MOVQ $0x0000000f, R13 - MOVQ R13, X5 - VPBROADCASTB X5, Y5 - -mulAvxTwo_4x5_loop: - // Clear 5 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X4 + VPBROADCASTB X4, Y4 - // Load and process 32 bytes from input 0 to 5 outputs - VMOVDQU (BX), Y8 +mulAvxTwo_4x4Xor_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y7 ADDQ $0x20, BX - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU (CX), Y6 - VMOVDQU 32(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 64(CX), Y6 - VMOVDQU 96(CX), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (R9), Y0 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 128(CX), Y6 - VMOVDQU 160(CX), Y7 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU (R10), Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 192(CX), Y6 - VMOVDQU 224(CX), Y7 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU (R11), Y2 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 256(CX), Y6 - VMOVDQU 288(CX), Y7 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU (R8), Y3 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 - // Load and process 32 bytes from input 1 to 5 outputs - VMOVDQU (SI), Y8 + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y7 ADDQ $0x20, SI - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU 320(CX), Y6 - VMOVDQU 352(CX), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 384(CX), Y6 - VMOVDQU 416(CX), Y7 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Store 4 outputs + VMOVDQU Y0, (R9) + ADDQ $0x20, R9 + VMOVDQU Y1, (R10) + ADDQ $0x20, R10 + VMOVDQU Y2, (R11) + ADDQ $0x20, R11 + VMOVDQU Y3, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x4Xor_loop + VZEROUPPER + +mulAvxTwo_4x4Xor_end: + RET + +// func mulAvxTwo_4x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_4x5(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x5_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R8 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R8 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X5 + VPBROADCASTB X5, Y5 + +mulAvxTwo_4x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y0 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y1 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y2 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y3 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y4 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 @@ -6748,6 +12727,220 @@ mulAvxTwo_4x5_loop: mulAvxTwo_4x5_end: RET +// func mulAvxTwo_4x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_4x5Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x5Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R8 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R8 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X5 + VPBROADCASTB X5, Y5 + +mulAvxTwo_4x5Xor_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (R9), Y0 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU (R10), Y1 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU (R11), Y2 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU (R12), Y3 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU (R8), Y4 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Store 5 outputs + VMOVDQU Y0, (R9) + ADDQ $0x20, R9 + VMOVDQU Y1, (R10) + ADDQ $0x20, R10 + VMOVDQU Y2, (R11) + ADDQ $0x20, R11 + VMOVDQU Y3, (R12) + ADDQ $0x20, R12 + VMOVDQU Y4, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x5Xor_loop + VZEROUPPER + +mulAvxTwo_4x5Xor_end: + RET + // func mulAvxTwo_4x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x6(SB), NOSPLIT, $0-88 @@ -6791,14 +12984,6 @@ TEXT ·mulAvxTwo_4x6(SB), NOSPLIT, $0-88 VPBROADCASTB X6, Y6 mulAvxTwo_4x6_loop: - // Clear 6 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y9 ADDQ $0x20, BX @@ -6809,38 +12994,32 @@ mulAvxTwo_4x6_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + VPXOR Y7, Y8, Y0 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + VPXOR Y7, Y8, Y1 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + VPXOR Y7, Y8, Y2 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + VPXOR Y7, Y8, Y3 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + VPXOR Y7, Y8, Y4 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPXOR Y7, Y8, Y5 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 @@ -6993,17 +13172,17 @@ mulAvxTwo_4x6_loop: mulAvxTwo_4x6_end: RET -// func mulAvxTwo_4x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_4x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_4x7(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_4x6Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers - // Full registers estimated 68 YMM used + // Full registers estimated 59 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_4x7_end + JZ mulAvxTwo_4x6Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI @@ -7015,87 +13194,314 @@ TEXT ·mulAvxTwo_4x7(SB), NOSPLIT, $0-88 MOVQ 48(R8), R11 MOVQ 72(R8), R12 MOVQ 96(R8), R13 - MOVQ 120(R8), R14 - MOVQ 144(R8), R8 - MOVQ start+72(FP), R15 + MOVQ 120(R8), R8 + MOVQ start+72(FP), R14 // Add start offset to output - ADDQ R15, R9 - ADDQ R15, R10 - ADDQ R15, R11 - ADDQ R15, R12 - ADDQ R15, R13 - ADDQ R15, R14 - ADDQ R15, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R8 // Add start offset to input - ADDQ R15, BX - ADDQ R15, SI - ADDQ R15, DI - ADDQ R15, DX - MOVQ $0x0000000f, R15 - MOVQ R15, X7 - VPBROADCASTB X7, Y7 - -mulAvxTwo_4x7_loop: - // Clear 7 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X6 + VPBROADCASTB X6, Y6 - // Load and process 32 bytes from input 0 to 7 outputs - VMOVDQU (BX), Y10 +mulAvxTwo_4x6Xor_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y9 ADDQ $0x20, BX - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU (CX), Y8 - VMOVDQU 32(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 64(CX), Y8 - VMOVDQU 96(CX), Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (R9), Y0 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 128(CX), Y8 - VMOVDQU 160(CX), Y9 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU (R10), Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 192(CX), Y8 - VMOVDQU 224(CX), Y9 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU (R11), Y2 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 256(CX), Y8 - VMOVDQU 288(CX), Y9 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU (R12), Y3 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 320(CX), Y8 - VMOVDQU 352(CX), Y9 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU (R13), Y4 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 384(CX), Y8 - VMOVDQU 416(CX), Y9 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU (R8), Y5 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 - + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Store 6 outputs + VMOVDQU Y0, (R9) + ADDQ $0x20, R9 + VMOVDQU Y1, (R10) + ADDQ $0x20, R10 + VMOVDQU Y2, (R11) + ADDQ $0x20, R11 + VMOVDQU Y3, (R12) + ADDQ $0x20, R12 + VMOVDQU Y4, (R13) + ADDQ $0x20, R13 + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x6Xor_loop + VZEROUPPER + +mulAvxTwo_4x6Xor_end: + RET + +// func mulAvxTwo_4x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_4x7(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 68 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x7_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R8 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R8 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X7 + VPBROADCASTB X7, Y7 + +mulAvxTwo_4x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y0 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y1 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y2 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y3 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y4 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y5 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y6 + // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI @@ -7267,6 +13673,278 @@ mulAvxTwo_4x7_loop: mulAvxTwo_4x7_end: RET +// func mulAvxTwo_4x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_4x7Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 68 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x7Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R8 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R8 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X7 + VPBROADCASTB X7, Y7 + +mulAvxTwo_4x7Xor_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (R9), Y0 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU (R10), Y1 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU (R11), Y2 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU (R12), Y3 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU (R13), Y4 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU (R14), Y5 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU (R8), Y6 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1472(CX), Y8 + VMOVDQU 1504(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1728(CX), Y8 + VMOVDQU 1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Store 7 outputs + VMOVDQU Y0, (R9) + ADDQ $0x20, R9 + VMOVDQU Y1, (R10) + ADDQ $0x20, R10 + VMOVDQU Y2, (R11) + ADDQ $0x20, R11 + VMOVDQU Y3, (R12) + ADDQ $0x20, R12 + VMOVDQU Y4, (R13) + ADDQ $0x20, R13 + VMOVDQU Y5, (R14) + ADDQ $0x20, R14 + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x7Xor_loop + VZEROUPPER + +mulAvxTwo_4x7Xor_end: + RET + // func mulAvxTwo_4x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x8(SB), NOSPLIT, $8-88 @@ -7314,16 +13992,6 @@ TEXT ·mulAvxTwo_4x8(SB), NOSPLIT, $8-88 VPBROADCASTB X8, Y8 mulAvxTwo_4x8_loop: - // Clear 8 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y11 ADDQ $0x20, BX @@ -7334,50 +14002,42 @@ mulAvxTwo_4x8_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + VPXOR Y9, Y10, Y0 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + VPXOR Y9, Y10, Y1 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + VPXOR Y9, Y10, Y2 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + VPXOR Y9, Y10, Y3 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + VPXOR Y9, Y10, Y4 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + VPXOR Y9, Y10, Y5 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + VPXOR Y9, Y10, Y6 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 + VPXOR Y9, Y10, Y7 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 @@ -7570,9 +14230,310 @@ mulAvxTwo_4x8_loop: mulAvxTwo_4x8_end: RET -// func mulAvxTwo_4x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_4x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_4x9(SB), NOSPLIT, $8-88 +TEXT ·mulAvxTwo_4x8Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 77 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x8Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X8 + VPBROADCASTB X8, Y8 + +mulAvxTwo_4x8Xor_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (R9), Y0 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU (R10), Y1 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU (R11), Y2 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU (R12), Y3 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU (R13), Y4 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU (R14), Y5 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU (R15), Y6 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU (R8), Y7 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1600(CX), Y9 + VMOVDQU 1632(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1664(CX), Y9 + VMOVDQU 1696(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Store 8 outputs + VMOVDQU Y0, (R9) + ADDQ $0x20, R9 + VMOVDQU Y1, (R10) + ADDQ $0x20, R10 + VMOVDQU Y2, (R11) + ADDQ $0x20, R11 + VMOVDQU Y3, (R12) + ADDQ $0x20, R12 + VMOVDQU Y4, (R13) + ADDQ $0x20, R13 + VMOVDQU Y5, (R14) + ADDQ $0x20, R14 + VMOVDQU Y6, (R15) + ADDQ $0x20, R15 + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x8Xor_loop + VZEROUPPER + +mulAvxTwo_4x8Xor_end: + RET + +// func mulAvxTwo_4x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_4x9(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 86 YMM used @@ -7621,17 +14582,6 @@ TEXT ·mulAvxTwo_4x9(SB), NOSPLIT, $8-88 SHRQ $0x05, BP mulAvxTwo_4x9_loop: - // Clear 9 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 - // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX @@ -7642,56 +14592,47 @@ mulAvxTwo_4x9_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + VPXOR Y10, Y11, Y0 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + VPXOR Y10, Y11, Y1 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + VPXOR Y10, Y11, Y2 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + VPXOR Y10, Y11, Y3 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + VPXOR Y10, Y11, Y4 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + VPXOR Y10, Y11, Y5 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + VPXOR Y10, Y11, Y6 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + VPXOR Y10, Y11, Y7 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPXOR Y10, Y11, Y8 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (BX), Y12 @@ -7904,134 +14845,444 @@ mulAvxTwo_4x9_loop: mulAvxTwo_4x9_end: RET -// func mulAvxTwo_4x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_4x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_4x10(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_4x9Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 95 YMM used + // Destination kept in GP registers + // Full registers estimated 86 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_4x10_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), DX - MOVQ out_base+48(FP), R8 - MOVQ start+72(FP), R9 + JZ mulAvxTwo_4x9Xor_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), AX + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP - // Add start offset to input - ADDQ R9, BX - ADDQ R9, SI - ADDQ R9, DI - ADDQ R9, DX - MOVQ $0x0000000f, R10 - MOVQ R10, X10 - VPBROADCASTB X10, Y10 + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI -mulAvxTwo_4x10_loop: - // Clear 10 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 - VPXOR Y9, Y9, Y9 + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X9 + VPBROADCASTB X9, Y9 + MOVQ n+80(FP), BP + SHRQ $0x05, BP - // Load and process 32 bytes from input 0 to 10 outputs - VMOVDQU (BX), Y13 - ADDQ $0x20, BX - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VMOVDQU (CX), Y11 - VMOVDQU 32(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQU 64(CX), Y11 - VMOVDQU 96(CX), Y12 +mulAvxTwo_4x9Xor_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU (R8), Y0 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 - VMOVDQU 128(CX), Y11 - VMOVDQU 160(CX), Y12 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU (R9), Y1 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 - VMOVDQU 192(CX), Y11 - VMOVDQU 224(CX), Y12 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU (R10), Y2 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 - VMOVDQU 256(CX), Y11 - VMOVDQU 288(CX), Y12 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU (R11), Y3 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 - VMOVDQU 320(CX), Y11 - VMOVDQU 352(CX), Y12 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU (R12), Y4 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 - VMOVDQU 384(CX), Y11 - VMOVDQU 416(CX), Y12 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU (R13), Y5 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 - VMOVDQU 448(CX), Y11 - VMOVDQU 480(CX), Y12 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU (R14), Y6 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 - VMOVDQU 512(CX), Y11 - VMOVDQU 544(CX), Y12 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU (R15), Y7 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 - VMOVDQU 576(CX), Y11 - VMOVDQU 608(CX), Y12 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU (DI), Y8 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 - // Load and process 32 bytes from input 1 to 10 outputs - VMOVDQU (SI), Y13 - ADDQ $0x20, SI - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VMOVDQU 640(CX), Y11 - VMOVDQU 672(CX), Y12 + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQU 704(CX), Y11 - VMOVDQU 736(CX), Y12 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 - VMOVDQU 768(CX), Y11 - VMOVDQU 800(CX), Y12 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1152(CX), Y10 + VMOVDQU 1184(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 1216(CX), Y10 + VMOVDQU 1248(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 1280(CX), Y10 + VMOVDQU 1312(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 1344(CX), Y10 + VMOVDQU 1376(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 1408(CX), Y10 + VMOVDQU 1440(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 1472(CX), Y10 + VMOVDQU 1504(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 1536(CX), Y10 + VMOVDQU 1568(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1600(CX), Y10 + VMOVDQU 1632(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1664(CX), Y10 + VMOVDQU 1696(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (AX), Y12 + ADDQ $0x20, AX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1728(CX), Y10 + VMOVDQU 1760(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 1792(CX), Y10 + VMOVDQU 1824(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 1856(CX), Y10 + VMOVDQU 1888(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 1920(CX), Y10 + VMOVDQU 1952(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 1984(CX), Y10 + VMOVDQU 2016(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 2048(CX), Y10 + VMOVDQU 2080(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 2112(CX), Y10 + VMOVDQU 2144(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 2176(CX), Y10 + VMOVDQU 2208(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 2240(CX), Y10 + VMOVDQU 2272(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Store 9 outputs + VMOVDQU Y0, (R8) + ADDQ $0x20, R8 + VMOVDQU Y1, (R9) + ADDQ $0x20, R9 + VMOVDQU Y2, (R10) + ADDQ $0x20, R10 + VMOVDQU Y3, (R11) + ADDQ $0x20, R11 + VMOVDQU Y4, (R12) + ADDQ $0x20, R12 + VMOVDQU Y5, (R13) + ADDQ $0x20, R13 + VMOVDQU Y6, (R14) + ADDQ $0x20, R14 + VMOVDQU Y7, (R15) + ADDQ $0x20, R15 + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ BP + JNZ mulAvxTwo_4x9Xor_loop + VZEROUPPER + +mulAvxTwo_4x9Xor_end: + RET + +// func mulAvxTwo_4x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_4x10(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 95 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x10_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ start+72(FP), R9 + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, DX + MOVQ $0x0000000f, R10 + MOVQ R10, X10 + VPBROADCASTB X10, Y10 + +mulAvxTwo_4x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y13 + ADDQ $0x20, BX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y0 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y1 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y2 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y3 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y4 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y5 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y6 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y7 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y8 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y9 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y13 + ADDQ $0x20, SI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 @@ -8244,167 +15495,509 @@ mulAvxTwo_4x10_loop: mulAvxTwo_4x10_end: RET -// func mulAvxTwo_5x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_5x1(SB), NOSPLIT, $0-88 - // Loading all tables to registers - // Destination kept in GP registers - // Full registers estimated 14 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_5x1_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VMOVDQU 192(CX), Y6 - VMOVDQU 224(CX), Y7 - VMOVDQU 256(CX), Y8 - VMOVDQU 288(CX), Y9 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), BX - MOVQ 48(CX), SI - MOVQ 72(CX), DI - MOVQ 96(CX), CX - MOVQ out_base+48(FP), R8 - MOVQ (R8), R8 - MOVQ start+72(FP), R9 - - // Add start offset to output - ADDQ R9, R8 - - // Add start offset to input - ADDQ R9, DX - ADDQ R9, BX - ADDQ R9, SI - ADDQ R9, DI - ADDQ R9, CX - MOVQ $0x0000000f, R9 - MOVQ R9, X11 - VPBROADCASTB X11, Y11 - -mulAvxTwo_5x1_loop: - // Clear 1 outputs - VPXOR Y10, Y10, Y10 - - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y12 - ADDQ $0x20, DX - VPSRLQ $0x04, Y12, Y13 - VPAND Y11, Y12, Y12 - VPAND Y11, Y13, Y13 - VPSHUFB Y12, Y0, Y12 - VPSHUFB Y13, Y1, Y13 - VPXOR Y12, Y13, Y12 - VPXOR Y12, Y10, Y10 - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y12 - ADDQ $0x20, BX - VPSRLQ $0x04, Y12, Y13 - VPAND Y11, Y12, Y12 - VPAND Y11, Y13, Y13 - VPSHUFB Y12, Y2, Y12 - VPSHUFB Y13, Y3, Y13 - VPXOR Y12, Y13, Y12 - VPXOR Y12, Y10, Y10 - - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (SI), Y12 - ADDQ $0x20, SI - VPSRLQ $0x04, Y12, Y13 - VPAND Y11, Y12, Y12 - VPAND Y11, Y13, Y13 - VPSHUFB Y12, Y4, Y12 - VPSHUFB Y13, Y5, Y13 - VPXOR Y12, Y13, Y12 - VPXOR Y12, Y10, Y10 - - // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (DI), Y12 - ADDQ $0x20, DI - VPSRLQ $0x04, Y12, Y13 - VPAND Y11, Y12, Y12 - VPAND Y11, Y13, Y13 - VPSHUFB Y12, Y6, Y12 - VPSHUFB Y13, Y7, Y13 - VPXOR Y12, Y13, Y12 - VPXOR Y12, Y10, Y10 - - // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (CX), Y12 - ADDQ $0x20, CX - VPSRLQ $0x04, Y12, Y13 - VPAND Y11, Y12, Y12 - VPAND Y11, Y13, Y13 - VPSHUFB Y12, Y8, Y12 - VPSHUFB Y13, Y9, Y13 - VPXOR Y12, Y13, Y12 - VPXOR Y12, Y10, Y10 - - // Store 1 outputs - VMOVDQU Y10, (R8) - ADDQ $0x20, R8 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_5x1_loop - VZEROUPPER - -mulAvxTwo_5x1_end: - RET - -// func mulAvxTwo_5x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_4x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_5x1_64(SB), $0-88 +TEXT ·mulAvxTwo_4x10Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack - // Full registers estimated 14 YMM used + // Full registers estimated 95 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX - SHRQ $0x06, AX + SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_5x1_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), AX - MOVQ out_base+48(FP), R8 + JZ mulAvxTwo_4x10Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ start+72(FP), R9 // Add start offset to input - ADDQ R9, DX ADDQ R9, BX ADDQ R9, SI ADDQ R9, DI - ADDQ R9, AX + ADDQ R9, DX MOVQ $0x0000000f, R10 - MOVQ R10, X2 - VPBROADCASTB X2, Y2 - MOVQ n+80(FP), R10 - SHRQ $0x06, R10 + MOVQ R10, X10 + VPBROADCASTB X10, Y10 -mulAvxTwo_5x1_64_loop: - // Clear 1 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 +mulAvxTwo_4x10Xor_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y13 + ADDQ $0x20, BX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + MOVQ (R8), R10 + VMOVDQU (R10)(R9*1), Y0 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + MOVQ 24(R8), R10 + VMOVDQU (R10)(R9*1), Y1 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + MOVQ 48(R8), R10 + VMOVDQU (R10)(R9*1), Y2 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + MOVQ 72(R8), R10 + VMOVDQU (R10)(R9*1), Y3 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + MOVQ 96(R8), R10 + VMOVDQU (R10)(R9*1), Y4 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + MOVQ 120(R8), R10 + VMOVDQU (R10)(R9*1), Y5 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + MOVQ 144(R8), R10 + VMOVDQU (R10)(R9*1), Y6 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + MOVQ 168(R8), R10 + VMOVDQU (R10)(R9*1), Y7 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + MOVQ 192(R8), R10 + VMOVDQU (R10)(R9*1), Y8 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + MOVQ 216(R8), R10 + VMOVDQU (R10)(R9*1), Y9 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 - // Load and process 64 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y6 - VMOVDQU 32(DX), Y5 - ADDQ $0x40, DX - VPSRLQ $0x04, Y6, Y7 - VPSRLQ $0x04, Y5, Y8 - VPAND Y2, Y6, Y6 - VPAND Y2, Y5, Y5 + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y13 + ADDQ $0x20, SI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1024(CX), Y11 + VMOVDQU 1056(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 1088(CX), Y11 + VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y13 + ADDQ $0x20, DI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1280(CX), Y11 + VMOVDQU 1312(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 1344(CX), Y11 + VMOVDQU 1376(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 1408(CX), Y11 + VMOVDQU 1440(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 1472(CX), Y11 + VMOVDQU 1504(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 1536(CX), Y11 + VMOVDQU 1568(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 1600(CX), Y11 + VMOVDQU 1632(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1664(CX), Y11 + VMOVDQU 1696(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 1728(CX), Y11 + VMOVDQU 1760(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1792(CX), Y11 + VMOVDQU 1824(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1856(CX), Y11 + VMOVDQU 1888(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1920(CX), Y11 + VMOVDQU 1952(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 1984(CX), Y11 + VMOVDQU 2016(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 2048(CX), Y11 + VMOVDQU 2080(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 2112(CX), Y11 + VMOVDQU 2144(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 2176(CX), Y11 + VMOVDQU 2208(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 2240(CX), Y11 + VMOVDQU 2272(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 2304(CX), Y11 + VMOVDQU 2336(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 2368(CX), Y11 + VMOVDQU 2400(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 2432(CX), Y11 + VMOVDQU 2464(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 2496(CX), Y11 + VMOVDQU 2528(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Store 10 outputs + MOVQ (R8), R10 + VMOVDQU Y0, (R10)(R9*1) + MOVQ 24(R8), R10 + VMOVDQU Y1, (R10)(R9*1) + MOVQ 48(R8), R10 + VMOVDQU Y2, (R10)(R9*1) + MOVQ 72(R8), R10 + VMOVDQU Y3, (R10)(R9*1) + MOVQ 96(R8), R10 + VMOVDQU Y4, (R10)(R9*1) + MOVQ 120(R8), R10 + VMOVDQU Y5, (R10)(R9*1) + MOVQ 144(R8), R10 + VMOVDQU Y6, (R10)(R9*1) + MOVQ 168(R8), R10 + VMOVDQU Y7, (R10)(R9*1) + MOVQ 192(R8), R10 + VMOVDQU Y8, (R10)(R9*1) + MOVQ 216(R8), R10 + VMOVDQU Y9, (R10)(R9*1) + + // Prepare for next loop + ADDQ $0x20, R9 + DECQ AX + JNZ mulAvxTwo_4x10Xor_loop + VZEROUPPER + +mulAvxTwo_4x10Xor_end: + RET + +// func mulAvxTwo_5x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_5x1(SB), NOSPLIT, $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x1_end + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y1 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ (R8), R8 + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, CX + MOVQ $0x0000000f, R9 + MOVQ R9, X11 + VPBROADCASTB X11, Y11 + +mulAvxTwo_5x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y11, Y12, Y12 + VPAND Y11, Y13, Y13 + VPSHUFB Y12, Y0, Y12 + VPSHUFB Y13, Y1, Y13 + VPXOR Y12, Y13, Y10 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VPSRLQ $0x04, Y12, Y13 + VPAND Y11, Y12, Y12 + VPAND Y11, Y13, Y13 + VPSHUFB Y12, Y2, Y12 + VPSHUFB Y13, Y3, Y13 + VPXOR Y12, Y13, Y12 + VPXOR Y12, Y10, Y10 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VPSRLQ $0x04, Y12, Y13 + VPAND Y11, Y12, Y12 + VPAND Y11, Y13, Y13 + VPSHUFB Y12, Y4, Y12 + VPSHUFB Y13, Y5, Y13 + VPXOR Y12, Y13, Y12 + VPXOR Y12, Y10, Y10 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VPSRLQ $0x04, Y12, Y13 + VPAND Y11, Y12, Y12 + VPAND Y11, Y13, Y13 + VPSHUFB Y12, Y6, Y12 + VPSHUFB Y13, Y7, Y13 + VPXOR Y12, Y13, Y12 + VPXOR Y12, Y10, Y10 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VPSRLQ $0x04, Y12, Y13 + VPAND Y11, Y12, Y12 + VPAND Y11, Y13, Y13 + VPSHUFB Y12, Y8, Y12 + VPSHUFB Y13, Y9, Y13 + VPXOR Y12, Y13, Y12 + VPXOR Y12, Y10, Y10 + + // Store 1 outputs + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_5x1_loop + VZEROUPPER + +mulAvxTwo_5x1_end: + RET + +// func mulAvxTwo_5x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_5x1_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_5x1_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R9 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + MOVQ $0x0000000f, R10 + MOVQ R10, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_5x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU (CX), Y3 @@ -8413,15 +16006,13 @@ mulAvxTwo_5x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 + VPXOR Y3, Y4, Y0 + VPXOR Y5, Y6, Y1 // Load and process 64 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y6 - VMOVDQU 32(BX), Y5 - ADDQ $0x40, BX + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -8440,9 +16031,9 @@ mulAvxTwo_5x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 2 to 1 outputs - VMOVDQU (SI), Y6 - VMOVDQU 32(SI), Y5 - ADDQ $0x40, SI + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -8461,9 +16052,9 @@ mulAvxTwo_5x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 3 to 1 outputs - VMOVDQU (DI), Y6 - VMOVDQU 32(DI), Y5 - ADDQ $0x40, DI + VMOVDQU (R8), Y6 + VMOVDQU 32(R8), Y5 + ADDQ $0x40, R8 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -8482,9 +16073,9 @@ mulAvxTwo_5x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 4 to 1 outputs - VMOVDQU (AX), Y6 - VMOVDQU 32(AX), Y5 - ADDQ $0x40, AX + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -8503,83 +16094,347 @@ mulAvxTwo_5x1_64_loop: VPXOR Y5, Y1, Y1 // Store 1 outputs - MOVQ (R8), R11 - VMOVDQU Y0, (R11)(R9*1) - VMOVDQU Y1, 32(R11)(R9*1) + VMOVDQU Y0, (R9) + VMOVDQU Y1, 32(R9) + ADDQ $0x40, R9 // Prepare for next loop - ADDQ $0x40, R9 - DECQ R10 + DECQ AX JNZ mulAvxTwo_5x1_64_loop VZEROUPPER mulAvxTwo_5x1_64_end: RET -// func mulAvxTwo_5x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_5x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_5x2(SB), NOSPLIT, $0-88 - // Loading no tables to registers +TEXT ·mulAvxTwo_5x1Xor(SB), NOSPLIT, $0-88 + // Loading all tables to registers // Destination kept in GP registers - // Full registers estimated 27 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_5x2_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), DX - MOVQ out_base+48(FP), R9 - MOVQ (R9), R10 - MOVQ 24(R9), R9 - MOVQ start+72(FP), R11 + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x1Xor_end + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y1 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ (R8), R8 + MOVQ start+72(FP), R9 // Add start offset to output - ADDQ R11, R10 - ADDQ R11, R9 + ADDQ R9, R8 // Add start offset to input - ADDQ R11, BX - ADDQ R11, SI - ADDQ R11, DI - ADDQ R11, R8 - ADDQ R11, DX - MOVQ $0x0000000f, R11 - MOVQ R11, X2 - VPBROADCASTB X2, Y2 + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, CX + MOVQ $0x0000000f, R9 + MOVQ R9, X11 + VPBROADCASTB X11, Y11 -mulAvxTwo_5x2_loop: - // Clear 2 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 +mulAvxTwo_5x1Xor_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y11, Y12, Y12 + VPAND Y11, Y13, Y13 + VMOVDQU (R8), Y10 + VPSHUFB Y12, Y0, Y12 + VPSHUFB Y13, Y1, Y13 + VPXOR Y12, Y13, Y12 + VPXOR Y12, Y10, Y10 - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y12 ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 - - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 + VPSRLQ $0x04, Y12, Y13 + VPAND Y11, Y12, Y12 + VPAND Y11, Y13, Y13 + VPSHUFB Y12, Y2, Y12 + VPSHUFB Y13, Y3, Y13 + VPXOR Y12, Y13, Y12 + VPXOR Y12, Y10, Y10 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VPSRLQ $0x04, Y12, Y13 + VPAND Y11, Y12, Y12 + VPAND Y11, Y13, Y13 + VPSHUFB Y12, Y4, Y12 + VPSHUFB Y13, Y5, Y13 + VPXOR Y12, Y13, Y12 + VPXOR Y12, Y10, Y10 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VPSRLQ $0x04, Y12, Y13 + VPAND Y11, Y12, Y12 + VPAND Y11, Y13, Y13 + VPSHUFB Y12, Y6, Y12 + VPSHUFB Y13, Y7, Y13 + VPXOR Y12, Y13, Y12 + VPXOR Y12, Y10, Y10 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VPSRLQ $0x04, Y12, Y13 + VPAND Y11, Y12, Y12 + VPAND Y11, Y13, Y13 + VPSHUFB Y12, Y8, Y12 + VPSHUFB Y13, Y9, Y13 + VPXOR Y12, Y13, Y12 + VPXOR Y12, Y10, Y10 + + // Store 1 outputs + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_5x1Xor_loop + VZEROUPPER + +mulAvxTwo_5x1Xor_end: + RET + +// func mulAvxTwo_5x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_5x1_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_5x1_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R9 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + MOVQ $0x0000000f, R10 + MOVQ R10, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_5x1_64Xor_loop: + // Load 1 outputs + VMOVDQU (R9), Y0 + VMOVDQU 32(R9), Y1 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU (R8), Y6 + VMOVDQU 32(R8), Y5 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Store 1 outputs + VMOVDQU Y0, (R9) + VMOVDQU Y1, 32(R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_5x1_64Xor_loop + VZEROUPPER + +mulAvxTwo_5x1_64Xor_end: + RET + +// func mulAvxTwo_5x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_5x2(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 27 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x2_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R9 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + ADDQ R11, R9 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_5x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y5 + ADDQ $0x20, BX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y0 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y1 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y5 + ADDQ $0x20, SI + VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 128(CX), Y3 @@ -8670,46 +16525,44 @@ mulAvxTwo_5x2_end: // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x2_64(SB), $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 27 YMM used + // Destination kept in GP registers + // Full registers estimated 49 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_5x2_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), AX - MOVQ out_base+48(FP), R8 - MOVQ out_base+48(FP), R8 - MOVQ start+72(FP), R9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R9 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + ADDQ R11, R9 // Add start offset to input - ADDQ R9, DX - ADDQ R9, BX - ADDQ R9, SI - ADDQ R9, DI - ADDQ R9, AX - MOVQ $0x0000000f, R10 - MOVQ R10, X4 + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X4 VPBROADCASTB X4, Y4 - MOVQ n+80(FP), R10 - SHRQ $0x06, R10 mulAvxTwo_5x2_64_loop: - // Clear 2 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - // Load and process 64 bytes from input 0 to 2 outputs - VMOVDQU (DX), Y9 - VMOVDQU 32(DX), Y11 - ADDQ $0x40, DX + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -8722,25 +16575,21 @@ mulAvxTwo_5x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + VPXOR Y5, Y6, Y0 + VPXOR Y7, Y8, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 + VPXOR Y5, Y6, Y2 + VPXOR Y7, Y8, Y3 // Load and process 64 bytes from input 1 to 2 outputs - VMOVDQU (BX), Y9 - VMOVDQU 32(BX), Y11 - ADDQ $0x40, BX + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -8769,9 +16618,9 @@ mulAvxTwo_5x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 2 to 2 outputs - VMOVDQU (SI), Y9 - VMOVDQU 32(SI), Y11 - ADDQ $0x40, SI + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -8800,9 +16649,9 @@ mulAvxTwo_5x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 3 to 2 outputs - VMOVDQU (DI), Y9 - VMOVDQU 32(DI), Y11 - ADDQ $0x40, DI + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y11 + ADDQ $0x40, R8 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -8831,9 +16680,9 @@ mulAvxTwo_5x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 4 to 2 outputs - VMOVDQU (AX), Y9 - VMOVDQU 32(AX), Y11 - ADDQ $0x40, AX + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -8862,33 +16711,32 @@ mulAvxTwo_5x2_64_loop: VPXOR Y7, Y3, Y3 // Store 2 outputs - MOVQ (R8), R11 - VMOVDQU Y0, (R11)(R9*1) - VMOVDQU Y1, 32(R11)(R9*1) - MOVQ 24(R8), R11 - VMOVDQU Y2, (R11)(R9*1) - VMOVDQU Y3, 32(R11)(R9*1) + VMOVDQU Y0, (R10) + VMOVDQU Y1, 32(R10) + ADDQ $0x40, R10 + VMOVDQU Y2, (R9) + VMOVDQU Y3, 32(R9) + ADDQ $0x40, R9 // Prepare for next loop - ADDQ $0x40, R9 - DECQ R10 + DECQ AX JNZ mulAvxTwo_5x2_64_loop VZEROUPPER mulAvxTwo_5x2_64_end: RET -// func mulAvxTwo_5x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_5x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_5x3(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_5x2Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers - // Full registers estimated 38 YMM used + // Full registers estimated 27 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_5x3_end + JZ mulAvxTwo_5x2Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI @@ -8897,129 +16745,484 @@ TEXT ·mulAvxTwo_5x3(SB), NOSPLIT, $0-88 MOVQ 96(DX), DX MOVQ out_base+48(FP), R9 MOVQ (R9), R10 - MOVQ 24(R9), R11 - MOVQ 48(R9), R9 - MOVQ start+72(FP), R12 + MOVQ 24(R9), R9 + MOVQ start+72(FP), R11 // Add start offset to output - ADDQ R12, R10 - ADDQ R12, R11 - ADDQ R12, R9 + ADDQ R11, R10 + ADDQ R11, R9 // Add start offset to input - ADDQ R12, BX - ADDQ R12, SI - ADDQ R12, DI - ADDQ R12, R8 - ADDQ R12, DX - MOVQ $0x0000000f, R12 - MOVQ R12, X3 - VPBROADCASTB X3, Y3 - -mulAvxTwo_5x3_loop: - // Clear 3 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X2 + VPBROADCASTB X2, Y2 - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 +mulAvxTwo_5x2Xor_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y5 ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU (R10), Y0 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU (R9), Y1 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y5 ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 - // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y5 ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 - // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (R8), Y6 + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y5 ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y4 + VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (DX), Y5 + ADDQ $0x20, DX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 512(CX), Y3 + VMOVDQU 544(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 576(CX), Y3 + VMOVDQU 608(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Store 2 outputs + VMOVDQU Y0, (R10) + ADDQ $0x20, R10 + VMOVDQU Y1, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_5x2Xor_loop + VZEROUPPER + +mulAvxTwo_5x2Xor_end: + RET + +// func mulAvxTwo_5x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_5x2_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 49 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_5x2_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R9 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + ADDQ R11, R9 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_5x2_64Xor_loop: + // Load 2 outputs + VMOVDQU (R10), Y0 + VMOVDQU 32(R10), Y1 + VMOVDQU (R9), Y2 + VMOVDQU 32(R9), Y3 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y11 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Store 2 outputs + VMOVDQU Y0, (R10) + VMOVDQU Y1, 32(R10) + ADDQ $0x40, R10 + VMOVDQU Y2, (R9) + VMOVDQU Y3, 32(R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_5x2_64Xor_loop + VZEROUPPER + +mulAvxTwo_5x2_64Xor_end: + RET + +// func mulAvxTwo_5x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_5x3(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x3_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R9 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R9 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X3 + VPBROADCASTB X3, Y3 + +mulAvxTwo_5x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y6 + ADDQ $0x20, BX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU (CX), Y4 + VMOVDQU 32(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y0 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y1 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y2 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y6 + ADDQ $0x20, SI + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 192(CX), Y4 + VMOVDQU 224(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 256(CX), Y4 + VMOVDQU 288(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 320(CX), Y4 + VMOVDQU 352(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y6 + ADDQ $0x20, DI + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 384(CX), Y4 + VMOVDQU 416(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 448(CX), Y4 + VMOVDQU 480(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 512(CX), Y4 + VMOVDQU 544(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y6 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 576(CX), Y4 + VMOVDQU 608(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 640(CX), Y4 + VMOVDQU 672(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 704(CX), Y4 + VMOVDQU 736(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 4 to 3 outputs @@ -9067,48 +17270,46 @@ mulAvxTwo_5x3_end: // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x3_64(SB), $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 38 YMM used + // Destination kept in GP registers + // Full registers estimated 70 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_5x3_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), AX - MOVQ out_base+48(FP), R8 - MOVQ out_base+48(FP), R8 - MOVQ start+72(FP), R9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R9 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R9 // Add start offset to input - ADDQ R9, DX - ADDQ R9, BX - ADDQ R9, SI - ADDQ R9, DI - ADDQ R9, AX - MOVQ $0x0000000f, R10 - MOVQ R10, X6 + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X6 VPBROADCASTB X6, Y6 - MOVQ n+80(FP), R10 - SHRQ $0x06, R10 mulAvxTwo_5x3_64_loop: - // Clear 3 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - // Load and process 64 bytes from input 0 to 3 outputs - VMOVDQU (DX), Y11 - VMOVDQU 32(DX), Y13 - ADDQ $0x40, DX + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -9121,35 +17322,29 @@ mulAvxTwo_5x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + VPXOR Y7, Y8, Y0 + VPXOR Y9, Y10, Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + VPXOR Y7, Y8, Y2 + VPXOR Y9, Y10, Y3 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 + VPXOR Y7, Y8, Y4 + VPXOR Y9, Y10, Y5 // Load and process 64 bytes from input 1 to 3 outputs - VMOVDQU (BX), Y11 - VMOVDQU 32(BX), Y13 - ADDQ $0x40, BX + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -9188,9 +17383,9 @@ mulAvxTwo_5x3_64_loop: VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 2 to 3 outputs - VMOVDQU (SI), Y11 - VMOVDQU 32(SI), Y13 - ADDQ $0x40, SI + VMOVDQU (DI), Y11 + VMOVDQU 32(DI), Y13 + ADDQ $0x40, DI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -9229,9 +17424,9 @@ mulAvxTwo_5x3_64_loop: VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 3 to 3 outputs - VMOVDQU (DI), Y11 - VMOVDQU 32(DI), Y13 - ADDQ $0x40, DI + VMOVDQU (R8), Y11 + VMOVDQU 32(R8), Y13 + ADDQ $0x40, R8 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -9270,9 +17465,9 @@ mulAvxTwo_5x3_64_loop: VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 4 to 3 outputs - VMOVDQU (AX), Y11 - VMOVDQU 32(AX), Y13 - ADDQ $0x40, AX + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -9311,36 +17506,35 @@ mulAvxTwo_5x3_64_loop: VPXOR Y9, Y5, Y5 // Store 3 outputs - MOVQ (R8), R11 - VMOVDQU Y0, (R11)(R9*1) - VMOVDQU Y1, 32(R11)(R9*1) - MOVQ 24(R8), R11 - VMOVDQU Y2, (R11)(R9*1) - VMOVDQU Y3, 32(R11)(R9*1) - MOVQ 48(R8), R11 - VMOVDQU Y4, (R11)(R9*1) - VMOVDQU Y5, 32(R11)(R9*1) - - // Prepare for next loop - ADDQ $0x40, R9 - DECQ R10 + VMOVDQU Y0, (R10) + VMOVDQU Y1, 32(R10) + ADDQ $0x40, R10 + VMOVDQU Y2, (R11) + VMOVDQU Y3, 32(R11) + ADDQ $0x40, R11 + VMOVDQU Y4, (R9) + VMOVDQU Y5, 32(R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX JNZ mulAvxTwo_5x3_64_loop VZEROUPPER mulAvxTwo_5x3_64_end: RET -// func mulAvxTwo_5x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_5x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_5x4(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_5x3Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers - // Full registers estimated 49 YMM used + // Full registers estimated 38 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_5x4_end + JZ mulAvxTwo_5x3Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI @@ -9350,63 +17544,508 @@ TEXT ·mulAvxTwo_5x4(SB), NOSPLIT, $0-88 MOVQ out_base+48(FP), R9 MOVQ (R9), R10 MOVQ 24(R9), R11 - MOVQ 48(R9), R12 - MOVQ 72(R9), R9 - MOVQ start+72(FP), R13 + MOVQ 48(R9), R9 + MOVQ start+72(FP), R12 // Add start offset to output - ADDQ R13, R10 - ADDQ R13, R11 - ADDQ R13, R12 - ADDQ R13, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R9 // Add start offset to input - ADDQ R13, BX - ADDQ R13, SI - ADDQ R13, DI - ADDQ R13, R8 - ADDQ R13, DX - MOVQ $0x0000000f, R13 - MOVQ R13, X4 - VPBROADCASTB X4, Y4 - -mulAvxTwo_5x4_loop: - // Clear 4 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X3 + VPBROADCASTB X3, Y3 - // Load and process 32 bytes from input 0 to 4 outputs - VMOVDQU (BX), Y7 +mulAvxTwo_5x3Xor_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y6 ADDQ $0x20, BX - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU (CX), Y5 - VMOVDQU 32(CX), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU (R10), Y0 + VMOVDQU (CX), Y4 + VMOVDQU 32(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU (R11), Y1 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU (R9), Y2 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y6 + ADDQ $0x20, SI + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 192(CX), Y4 + VMOVDQU 224(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 256(CX), Y4 + VMOVDQU 288(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 320(CX), Y4 + VMOVDQU 352(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y6 + ADDQ $0x20, DI + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 384(CX), Y4 + VMOVDQU 416(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 448(CX), Y4 + VMOVDQU 480(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 512(CX), Y4 + VMOVDQU 544(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y6 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 576(CX), Y4 + VMOVDQU 608(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 640(CX), Y4 + VMOVDQU 672(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 704(CX), Y4 + VMOVDQU 736(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 768(CX), Y4 + VMOVDQU 800(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 832(CX), Y4 + VMOVDQU 864(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 896(CX), Y4 + VMOVDQU 928(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Store 3 outputs + VMOVDQU Y0, (R10) + ADDQ $0x20, R10 + VMOVDQU Y1, (R11) + ADDQ $0x20, R11 + VMOVDQU Y2, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_5x3Xor_loop + VZEROUPPER + +mulAvxTwo_5x3Xor_end: + RET + +// func mulAvxTwo_5x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_5x3_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 70 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_5x3_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R9 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R9 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_5x3_64Xor_loop: + // Load 3 outputs + VMOVDQU (R10), Y0 + VMOVDQU 32(R10), Y1 + VMOVDQU (R11), Y2 + VMOVDQU 32(R11), Y3 + VMOVDQU (R9), Y4 + VMOVDQU 32(R9), Y5 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y11 + VMOVDQU 32(DI), Y13 + ADDQ $0x40, DI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y11 + VMOVDQU 32(R8), Y13 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Store 3 outputs + VMOVDQU Y0, (R10) + VMOVDQU Y1, 32(R10) + ADDQ $0x40, R10 + VMOVDQU Y2, (R11) + VMOVDQU Y3, 32(R11) + ADDQ $0x40, R11 + VMOVDQU Y4, (R9) + VMOVDQU Y5, 32(R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_5x3_64Xor_loop + VZEROUPPER + +mulAvxTwo_5x3_64Xor_end: + RET + +// func mulAvxTwo_5x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_5x4(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 49 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x4_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R9 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R9 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_5x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + VPXOR Y5, Y6, Y0 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + VPXOR Y5, Y6, Y1 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + VPXOR Y5, Y6, Y2 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPXOR Y5, Y6, Y3 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 @@ -9550,17 +18189,17 @@ mulAvxTwo_5x4_loop: mulAvxTwo_5x4_end: RET -// func mulAvxTwo_5x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_5x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_5x5(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_5x4Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers - // Full registers estimated 60 YMM used + // Full registers estimated 49 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_5x5_end + JZ mulAvxTwo_5x4Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI @@ -9571,65 +18210,524 @@ TEXT ·mulAvxTwo_5x5(SB), NOSPLIT, $0-88 MOVQ (R9), R10 MOVQ 24(R9), R11 MOVQ 48(R9), R12 - MOVQ 72(R9), R13 - MOVQ 96(R9), R9 - MOVQ start+72(FP), R14 + MOVQ 72(R9), R9 + MOVQ start+72(FP), R13 // Add start offset to output - ADDQ R14, R10 - ADDQ R14, R11 - ADDQ R14, R12 - ADDQ R14, R13 - ADDQ R14, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R9 // Add start offset to input - ADDQ R14, BX - ADDQ R14, SI - ADDQ R14, DI - ADDQ R14, R8 - ADDQ R14, DX - MOVQ $0x0000000f, R14 - MOVQ R14, X5 - VPBROADCASTB X5, Y5 - -mulAvxTwo_5x5_loop: - // Clear 5 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X4 + VPBROADCASTB X4, Y4 - // Load and process 32 bytes from input 0 to 5 outputs - VMOVDQU (BX), Y8 +mulAvxTwo_5x4Xor_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y7 ADDQ $0x20, BX - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU (CX), Y6 - VMOVDQU 32(CX), Y7 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (R10), Y0 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 64(CX), Y6 - VMOVDQU 96(CX), Y7 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU (R11), Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 128(CX), Y6 - VMOVDQU 160(CX), Y7 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU (R12), Y2 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU (R9), Y3 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Store 4 outputs + VMOVDQU Y0, (R10) + ADDQ $0x20, R10 + VMOVDQU Y1, (R11) + ADDQ $0x20, R11 + VMOVDQU Y2, (R12) + ADDQ $0x20, R12 + VMOVDQU Y3, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_5x4Xor_loop + VZEROUPPER + +mulAvxTwo_5x4Xor_end: + RET + +// func mulAvxTwo_5x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_5x5(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 60 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x5_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R9 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R9 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X5 + VPBROADCASTB X5, Y5 + +mulAvxTwo_5x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y0 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y1 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y2 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y3 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y4 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1280(CX), Y6 + VMOVDQU 1312(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1344(CX), Y6 + VMOVDQU 1376(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1408(CX), Y6 + VMOVDQU 1440(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1472(CX), Y6 + VMOVDQU 1504(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1536(CX), Y6 + VMOVDQU 1568(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Store 5 outputs + VMOVDQU Y0, (R10) + ADDQ $0x20, R10 + VMOVDQU Y1, (R11) + ADDQ $0x20, R11 + VMOVDQU Y2, (R12) + ADDQ $0x20, R12 + VMOVDQU Y3, (R13) + ADDQ $0x20, R13 + VMOVDQU Y4, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_5x5_loop + VZEROUPPER + +mulAvxTwo_5x5_end: + RET + +// func mulAvxTwo_5x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_5x5Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 60 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x5Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R9 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R9 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X5 + VPBROADCASTB X5, Y5 + +mulAvxTwo_5x5Xor_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (R10), Y0 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU (R11), Y1 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU (R12), Y2 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 + VMOVDQU (R13), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 + VMOVDQU (R9), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 @@ -9799,10 +18897,10 @@ mulAvxTwo_5x5_loop: // Prepare for next loop DECQ AX - JNZ mulAvxTwo_5x5_loop + JNZ mulAvxTwo_5x5Xor_loop VZEROUPPER -mulAvxTwo_5x5_end: +mulAvxTwo_5x5Xor_end: RET // func mulAvxTwo_5x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -9850,14 +18948,6 @@ TEXT ·mulAvxTwo_5x6(SB), NOSPLIT, $0-88 VPBROADCASTB X6, Y6 mulAvxTwo_5x6_loop: - // Clear 6 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y9 ADDQ $0x20, BX @@ -9868,38 +18958,32 @@ mulAvxTwo_5x6_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + VPXOR Y7, Y8, Y0 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + VPXOR Y7, Y8, Y1 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + VPXOR Y7, Y8, Y2 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + VPXOR Y7, Y8, Y3 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + VPXOR Y7, Y8, Y4 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPXOR Y7, Y8, Y5 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 @@ -10095,17 +19179,17 @@ mulAvxTwo_5x6_loop: mulAvxTwo_5x6_end: RET -// func mulAvxTwo_5x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_5x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_5x7(SB), NOSPLIT, $8-88 +TEXT ·mulAvxTwo_5x6Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers - // Full registers estimated 82 YMM used + // Full registers estimated 71 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_5x7_end + JZ mulAvxTwo_5x6Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI @@ -10118,42 +19202,321 @@ TEXT ·mulAvxTwo_5x7(SB), NOSPLIT, $8-88 MOVQ 48(R9), R12 MOVQ 72(R9), R13 MOVQ 96(R9), R14 - MOVQ 120(R9), R15 - MOVQ 144(R9), R9 - MOVQ start+72(FP), BP + MOVQ 120(R9), R9 + MOVQ start+72(FP), R15 // Add start offset to output - ADDQ BP, R10 - ADDQ BP, R11 - ADDQ BP, R12 - ADDQ BP, R13 - ADDQ BP, R14 - ADDQ BP, R15 - ADDQ BP, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R9 // Add start offset to input - ADDQ BP, BX - ADDQ BP, SI - ADDQ BP, DI - ADDQ BP, R8 - ADDQ BP, DX - MOVQ $0x0000000f, BP - MOVQ BP, X7 - VPBROADCASTB X7, Y7 - -mulAvxTwo_5x7_loop: - // Clear 7 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X6 + VPBROADCASTB X6, Y6 - // Load and process 32 bytes from input 0 to 7 outputs - VMOVDQU (BX), Y10 - ADDQ $0x20, BX +mulAvxTwo_5x6Xor_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (R10), Y0 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU (R11), Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU (R12), Y2 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU (R13), Y3 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU (R14), Y4 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU (R9), Y5 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1728(CX), Y7 + VMOVDQU 1760(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1792(CX), Y7 + VMOVDQU 1824(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1856(CX), Y7 + VMOVDQU 1888(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Store 6 outputs + VMOVDQU Y0, (R10) + ADDQ $0x20, R10 + VMOVDQU Y1, (R11) + ADDQ $0x20, R11 + VMOVDQU Y2, (R12) + ADDQ $0x20, R12 + VMOVDQU Y3, (R13) + ADDQ $0x20, R13 + VMOVDQU Y4, (R14) + ADDQ $0x20, R14 + VMOVDQU Y5, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_5x6Xor_loop + VZEROUPPER + +mulAvxTwo_5x6Xor_end: + RET + +// func mulAvxTwo_5x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_5x7(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x7_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X7 + VPBROADCASTB X7, Y7 + +mulAvxTwo_5x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 @@ -10161,44 +19524,37 @@ mulAvxTwo_5x7_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + VPXOR Y8, Y9, Y0 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + VPXOR Y8, Y9, Y1 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + VPXOR Y8, Y9, Y2 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + VPXOR Y8, Y9, Y3 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + VPXOR Y8, Y9, Y4 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + VPXOR Y8, Y9, Y5 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPXOR Y8, Y9, Y6 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 @@ -10420,168 +19776,473 @@ mulAvxTwo_5x7_loop: mulAvxTwo_5x7_end: RET -// func mulAvxTwo_5x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_5x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_5x8(SB), NOSPLIT, $8-88 +TEXT ·mulAvxTwo_5x7Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers - // Full registers estimated 93 YMM used + // Full registers estimated 82 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_5x8_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), AX - MOVQ out_base+48(FP), R8 - MOVQ (R8), R9 - MOVQ 24(R8), R10 - MOVQ 48(R8), R11 - MOVQ 72(R8), R12 - MOVQ 96(R8), R13 - MOVQ 120(R8), R14 - MOVQ 144(R8), R15 - MOVQ 168(R8), R8 + JZ mulAvxTwo_5x7Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 MOVQ start+72(FP), BP // Add start offset to output - ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 - ADDQ BP, R8 + ADDQ BP, R9 // Add start offset to input - ADDQ BP, DX ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI - ADDQ BP, AX + ADDQ BP, R8 + ADDQ BP, DX MOVQ $0x0000000f, BP - MOVQ BP, X8 - VPBROADCASTB X8, Y8 - MOVQ n+80(FP), BP - SHRQ $0x05, BP - -mulAvxTwo_5x8_loop: - // Clear 8 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 + MOVQ BP, X7 + VPBROADCASTB X7, Y7 - // Load and process 32 bytes from input 0 to 8 outputs - VMOVDQU (DX), Y11 - ADDQ $0x20, DX - VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU (CX), Y9 - VMOVDQU 32(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 64(CX), Y9 - VMOVDQU 96(CX), Y10 +mulAvxTwo_5x7Xor_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (R10), Y0 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 - VMOVDQU 128(CX), Y9 - VMOVDQU 160(CX), Y10 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU (R11), Y1 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 192(CX), Y9 - VMOVDQU 224(CX), Y10 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU (R12), Y2 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 - VMOVDQU 256(CX), Y9 - VMOVDQU 288(CX), Y10 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU (R13), Y3 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 320(CX), Y9 - VMOVDQU 352(CX), Y10 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU (R14), Y4 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 - VMOVDQU 384(CX), Y9 - VMOVDQU 416(CX), Y10 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU (R15), Y5 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 448(CX), Y9 - VMOVDQU 480(CX), Y10 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU (R9), Y6 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Load and process 32 bytes from input 1 to 8 outputs - VMOVDQU (BX), Y11 - ADDQ $0x20, BX - VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU 512(CX), Y9 - VMOVDQU 544(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 576(CX), Y9 - VMOVDQU 608(CX), Y10 + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 - VMOVDQU 640(CX), Y9 - VMOVDQU 672(CX), Y10 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 704(CX), Y9 - VMOVDQU 736(CX), Y10 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 - VMOVDQU 768(CX), Y9 - VMOVDQU 800(CX), Y10 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 832(CX), Y9 - VMOVDQU 864(CX), Y10 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 - VMOVDQU 896(CX), Y9 - VMOVDQU 928(CX), Y10 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1472(CX), Y8 + VMOVDQU 1504(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1728(CX), Y8 + VMOVDQU 1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1792(CX), Y8 + VMOVDQU 1824(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1856(CX), Y8 + VMOVDQU 1888(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1920(CX), Y8 + VMOVDQU 1952(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1984(CX), Y8 + VMOVDQU 2016(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2048(CX), Y8 + VMOVDQU 2080(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2112(CX), Y8 + VMOVDQU 2144(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2176(CX), Y8 + VMOVDQU 2208(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Store 7 outputs + VMOVDQU Y0, (R10) + ADDQ $0x20, R10 + VMOVDQU Y1, (R11) + ADDQ $0x20, R11 + VMOVDQU Y2, (R12) + ADDQ $0x20, R12 + VMOVDQU Y3, (R13) + ADDQ $0x20, R13 + VMOVDQU Y4, (R14) + ADDQ $0x20, R14 + VMOVDQU Y5, (R15) + ADDQ $0x20, R15 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_5x7Xor_loop + VZEROUPPER + +mulAvxTwo_5x7Xor_end: + RET + +// func mulAvxTwo_5x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_5x8(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 93 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x8_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), AX + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X8 + VPBROADCASTB X8, Y8 + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxTwo_5x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y0 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y1 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y2 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y3 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y4 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y5 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y6 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y7 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 960(CX), Y9 @@ -10782,129 +20443,469 @@ mulAvxTwo_5x8_loop: mulAvxTwo_5x8_end: RET -// func mulAvxTwo_5x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_5x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_5x9(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_5x8Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 104 YMM used + // Destination kept in GP registers + // Full registers estimated 93 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_5x9_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), DX - MOVQ out_base+48(FP), R9 - MOVQ start+72(FP), R10 + JZ mulAvxTwo_5x8Xor_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), AX + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP - // Add start offset to input - ADDQ R10, BX - ADDQ R10, SI - ADDQ R10, DI - ADDQ R10, R8 - ADDQ R10, DX - MOVQ $0x0000000f, R11 - MOVQ R11, X9 - VPBROADCASTB X9, Y9 + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 -mulAvxTwo_5x9_loop: - // Clear 9 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X8 + VPBROADCASTB X8, Y8 + MOVQ n+80(FP), BP + SHRQ $0x05, BP - // Load and process 32 bytes from input 0 to 9 outputs - VMOVDQU (BX), Y12 - ADDQ $0x20, BX - VPSRLQ $0x04, Y12, Y13 - VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VMOVDQU (CX), Y10 - VMOVDQU 32(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 - VMOVDQU 64(CX), Y10 - VMOVDQU 96(CX), Y11 +mulAvxTwo_5x8Xor_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (R9), Y0 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 - VMOVDQU 128(CX), Y10 - VMOVDQU 160(CX), Y11 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU (R10), Y1 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 - VMOVDQU 192(CX), Y10 - VMOVDQU 224(CX), Y11 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU (R11), Y2 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 - VMOVDQU 256(CX), Y10 - VMOVDQU 288(CX), Y11 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU (R12), Y3 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 - VMOVDQU 320(CX), Y10 - VMOVDQU 352(CX), Y11 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU (R13), Y4 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 - VMOVDQU 384(CX), Y10 - VMOVDQU 416(CX), Y11 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU (R14), Y5 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 - VMOVDQU 448(CX), Y10 - VMOVDQU 480(CX), Y11 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU (R15), Y6 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 - VMOVDQU 512(CX), Y10 - VMOVDQU 544(CX), Y11 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU (R8), Y7 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 - // Load and process 32 bytes from input 1 to 9 outputs - VMOVDQU (SI), Y12 - ADDQ $0x20, SI - VPSRLQ $0x04, Y12, Y13 - VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VMOVDQU 576(CX), Y10 - VMOVDQU 608(CX), Y11 + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 - VMOVDQU 640(CX), Y10 - VMOVDQU 672(CX), Y11 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 - VMOVDQU 704(CX), Y10 - VMOVDQU 736(CX), Y11 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1600(CX), Y9 + VMOVDQU 1632(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1664(CX), Y9 + VMOVDQU 1696(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (AX), Y11 + ADDQ $0x20, AX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2048(CX), Y9 + VMOVDQU 2080(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2112(CX), Y9 + VMOVDQU 2144(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2176(CX), Y9 + VMOVDQU 2208(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2240(CX), Y9 + VMOVDQU 2272(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2304(CX), Y9 + VMOVDQU 2336(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2368(CX), Y9 + VMOVDQU 2400(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2432(CX), Y9 + VMOVDQU 2464(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 2496(CX), Y9 + VMOVDQU 2528(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Store 8 outputs + VMOVDQU Y0, (R9) + ADDQ $0x20, R9 + VMOVDQU Y1, (R10) + ADDQ $0x20, R10 + VMOVDQU Y2, (R11) + ADDQ $0x20, R11 + VMOVDQU Y3, (R12) + ADDQ $0x20, R12 + VMOVDQU Y4, (R13) + ADDQ $0x20, R13 + VMOVDQU Y5, (R14) + ADDQ $0x20, R14 + VMOVDQU Y6, (R15) + ADDQ $0x20, R15 + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ BP + JNZ mulAvxTwo_5x8Xor_loop + VZEROUPPER + +mulAvxTwo_5x8Xor_end: + RET + +// func mulAvxTwo_5x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_5x9(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 104 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x9_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X9 + VPBROADCASTB X9, Y9 + +mulAvxTwo_5x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y0 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y1 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y2 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y3 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y4 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y5 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y6 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y7 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y8 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 @@ -11158,17 +21159,17 @@ mulAvxTwo_5x9_loop: mulAvxTwo_5x9_end: RET -// func mulAvxTwo_5x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_5x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_5x10(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_5x9Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack - // Full registers estimated 115 YMM used + // Full registers estimated 104 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_5x10_end + JZ mulAvxTwo_5x9Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI @@ -11185,132 +21186,493 @@ TEXT ·mulAvxTwo_5x10(SB), NOSPLIT, $0-88 ADDQ R10, R8 ADDQ R10, DX MOVQ $0x0000000f, R11 - MOVQ R11, X10 - VPBROADCASTB X10, Y10 - -mulAvxTwo_5x10_loop: - // Clear 10 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 - VPXOR Y9, Y9, Y9 + MOVQ R11, X9 + VPBROADCASTB X9, Y9 - // Load and process 32 bytes from input 0 to 10 outputs - VMOVDQU (BX), Y13 +mulAvxTwo_5x9Xor_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y12 ADDQ $0x20, BX - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VMOVDQU (CX), Y11 - VMOVDQU 32(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQU 64(CX), Y11 - VMOVDQU 96(CX), Y12 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + MOVQ (R9), R11 + VMOVDQU (R11)(R10*1), Y0 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 - VMOVDQU 128(CX), Y11 - VMOVDQU 160(CX), Y12 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + MOVQ 24(R9), R11 + VMOVDQU (R11)(R10*1), Y1 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 - VMOVDQU 192(CX), Y11 - VMOVDQU 224(CX), Y12 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + MOVQ 48(R9), R11 + VMOVDQU (R11)(R10*1), Y2 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 - VMOVDQU 256(CX), Y11 - VMOVDQU 288(CX), Y12 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + MOVQ 72(R9), R11 + VMOVDQU (R11)(R10*1), Y3 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 - VMOVDQU 320(CX), Y11 - VMOVDQU 352(CX), Y12 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + MOVQ 96(R9), R11 + VMOVDQU (R11)(R10*1), Y4 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 - VMOVDQU 384(CX), Y11 - VMOVDQU 416(CX), Y12 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + MOVQ 120(R9), R11 + VMOVDQU (R11)(R10*1), Y5 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 - VMOVDQU 448(CX), Y11 - VMOVDQU 480(CX), Y12 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + MOVQ 144(R9), R11 + VMOVDQU (R11)(R10*1), Y6 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 - VMOVDQU 512(CX), Y11 - VMOVDQU 544(CX), Y12 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + MOVQ 168(R9), R11 + VMOVDQU (R11)(R10*1), Y7 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 - VMOVDQU 576(CX), Y11 - VMOVDQU 608(CX), Y12 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + MOVQ 192(R9), R11 + VMOVDQU (R11)(R10*1), Y8 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 - // Load and process 32 bytes from input 1 to 10 outputs - VMOVDQU (SI), Y13 + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y12 ADDQ $0x20, SI - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VMOVDQU 640(CX), Y11 - VMOVDQU 672(CX), Y12 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQU 704(CX), Y11 - VMOVDQU 736(CX), Y12 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 - VMOVDQU 768(CX), Y11 - VMOVDQU 800(CX), Y12 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 + VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 - VMOVDQU 832(CX), Y11 - VMOVDQU 864(CX), Y12 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 + VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 - VMOVDQU 896(CX), Y11 - VMOVDQU 928(CX), Y12 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 + VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 - VMOVDQU 960(CX), Y11 - VMOVDQU 992(CX), Y12 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 + VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 - VMOVDQU 1024(CX), Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1152(CX), Y10 + VMOVDQU 1184(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 1216(CX), Y10 + VMOVDQU 1248(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 1280(CX), Y10 + VMOVDQU 1312(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 1344(CX), Y10 + VMOVDQU 1376(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 1408(CX), Y10 + VMOVDQU 1440(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 1472(CX), Y10 + VMOVDQU 1504(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 1536(CX), Y10 + VMOVDQU 1568(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1600(CX), Y10 + VMOVDQU 1632(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1664(CX), Y10 + VMOVDQU 1696(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y12 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1728(CX), Y10 + VMOVDQU 1760(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 1792(CX), Y10 + VMOVDQU 1824(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 1856(CX), Y10 + VMOVDQU 1888(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 1920(CX), Y10 + VMOVDQU 1952(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 1984(CX), Y10 + VMOVDQU 2016(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 2048(CX), Y10 + VMOVDQU 2080(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 2112(CX), Y10 + VMOVDQU 2144(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 2176(CX), Y10 + VMOVDQU 2208(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 2240(CX), Y10 + VMOVDQU 2272(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2304(CX), Y10 + VMOVDQU 2336(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 2368(CX), Y10 + VMOVDQU 2400(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 2432(CX), Y10 + VMOVDQU 2464(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 2496(CX), Y10 + VMOVDQU 2528(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 2560(CX), Y10 + VMOVDQU 2592(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 2624(CX), Y10 + VMOVDQU 2656(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 2688(CX), Y10 + VMOVDQU 2720(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 2752(CX), Y10 + VMOVDQU 2784(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 2816(CX), Y10 + VMOVDQU 2848(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Store 9 outputs + MOVQ (R9), R11 + VMOVDQU Y0, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y1, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y2, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU Y3, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y4, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y7, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvxTwo_5x9Xor_loop + VZEROUPPER + +mulAvxTwo_5x9Xor_end: + RET + +// func mulAvxTwo_5x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_5x10(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 115 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x10_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X10 + VPBROADCASTB X10, Y10 + +mulAvxTwo_5x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y13 + ADDQ $0x20, BX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y0 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y1 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y2 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y3 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y4 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y5 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y6 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y7 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y8 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y9 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y13 + ADDQ $0x20, SI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 @@ -11567,508 +21929,760 @@ mulAvxTwo_5x10_loop: mulAvxTwo_5x10_end: RET -// func mulAvxTwo_6x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_6x1(SB), NOSPLIT, $0-88 - // Loading all tables to registers - // Destination kept in GP registers - // Full registers estimated 16 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_6x1_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VMOVDQU 192(CX), Y6 - VMOVDQU 224(CX), Y7 - VMOVDQU 256(CX), Y8 - VMOVDQU 288(CX), Y9 - VMOVDQU 320(CX), Y10 - VMOVDQU 352(CX), Y11 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), BX - MOVQ 48(CX), SI - MOVQ 72(CX), DI - MOVQ 96(CX), R8 - MOVQ 120(CX), CX - MOVQ out_base+48(FP), R9 - MOVQ (R9), R9 - MOVQ start+72(FP), R10 - - // Add start offset to output - ADDQ R10, R9 - - // Add start offset to input - ADDQ R10, DX - ADDQ R10, BX - ADDQ R10, SI - ADDQ R10, DI - ADDQ R10, R8 - ADDQ R10, CX - MOVQ $0x0000000f, R10 - MOVQ R10, X13 - VPBROADCASTB X13, Y13 - -mulAvxTwo_6x1_loop: - // Clear 1 outputs - VPXOR Y12, Y12, Y12 - - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y14 - ADDQ $0x20, DX - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VPSHUFB Y14, Y0, Y14 - VPSHUFB Y15, Y1, Y15 - VPXOR Y14, Y15, Y14 - VPXOR Y14, Y12, Y12 - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y14 - ADDQ $0x20, BX - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VPSHUFB Y14, Y2, Y14 - VPSHUFB Y15, Y3, Y15 - VPXOR Y14, Y15, Y14 - VPXOR Y14, Y12, Y12 - - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (SI), Y14 - ADDQ $0x20, SI - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VPSHUFB Y14, Y4, Y14 - VPSHUFB Y15, Y5, Y15 - VPXOR Y14, Y15, Y14 - VPXOR Y14, Y12, Y12 - - // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (DI), Y14 - ADDQ $0x20, DI - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VPSHUFB Y14, Y6, Y14 - VPSHUFB Y15, Y7, Y15 - VPXOR Y14, Y15, Y14 - VPXOR Y14, Y12, Y12 - - // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (R8), Y14 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VPSHUFB Y14, Y8, Y14 - VPSHUFB Y15, Y9, Y15 - VPXOR Y14, Y15, Y14 - VPXOR Y14, Y12, Y12 - - // Load and process 32 bytes from input 5 to 1 outputs - VMOVDQU (CX), Y14 - ADDQ $0x20, CX - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VPSHUFB Y14, Y10, Y14 - VPSHUFB Y15, Y11, Y15 - VPXOR Y14, Y15, Y14 - VPXOR Y14, Y12, Y12 - - // Store 1 outputs - VMOVDQU Y12, (R9) - ADDQ $0x20, R9 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_6x1_loop - VZEROUPPER - -mulAvxTwo_6x1_end: - RET - -// func mulAvxTwo_6x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_5x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_6x1_64(SB), $0-88 +TEXT ·mulAvxTwo_5x10Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack - // Full registers estimated 16 YMM used + // Full registers estimated 115 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX - SHRQ $0x06, AX + SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_6x1_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), AX - MOVQ out_base+48(FP), R9 + JZ mulAvxTwo_5x10Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX MOVQ out_base+48(FP), R9 MOVQ start+72(FP), R10 // Add start offset to input - ADDQ R10, DX ADDQ R10, BX ADDQ R10, SI ADDQ R10, DI ADDQ R10, R8 - ADDQ R10, AX + ADDQ R10, DX MOVQ $0x0000000f, R11 - MOVQ R11, X2 - VPBROADCASTB X2, Y2 - MOVQ n+80(FP), R11 - SHRQ $0x06, R11 - -mulAvxTwo_6x1_64_loop: - // Clear 1 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 + MOVQ R11, X10 + VPBROADCASTB X10, Y10 - // Load and process 64 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y6 - VMOVDQU 32(DX), Y5 - ADDQ $0x40, DX - VPSRLQ $0x04, Y6, Y7 - VPSRLQ $0x04, Y5, Y8 - VPAND Y2, Y6, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y7, Y7 - VPAND Y2, Y8, Y8 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y5 - VPSHUFB Y6, Y3, Y3 - VPSHUFB Y8, Y4, Y6 - VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 - - // Load and process 64 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y6 - VMOVDQU 32(BX), Y5 - ADDQ $0x40, BX - VPSRLQ $0x04, Y6, Y7 - VPSRLQ $0x04, Y5, Y8 - VPAND Y2, Y6, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y7, Y7 - VPAND Y2, Y8, Y8 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y5 - VPSHUFB Y6, Y3, Y3 - VPSHUFB Y8, Y4, Y6 - VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +mulAvxTwo_5x10Xor_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y13 + ADDQ $0x20, BX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + MOVQ (R9), R11 + VMOVDQU (R11)(R10*1), Y0 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + MOVQ 24(R9), R11 + VMOVDQU (R11)(R10*1), Y1 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + MOVQ 48(R9), R11 + VMOVDQU (R11)(R10*1), Y2 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + MOVQ 72(R9), R11 + VMOVDQU (R11)(R10*1), Y3 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + MOVQ 96(R9), R11 + VMOVDQU (R11)(R10*1), Y4 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + MOVQ 120(R9), R11 + VMOVDQU (R11)(R10*1), Y5 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + MOVQ 144(R9), R11 + VMOVDQU (R11)(R10*1), Y6 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + MOVQ 168(R9), R11 + VMOVDQU (R11)(R10*1), Y7 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + MOVQ 192(R9), R11 + VMOVDQU (R11)(R10*1), Y8 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + MOVQ 216(R9), R11 + VMOVDQU (R11)(R10*1), Y9 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 - // Load and process 64 bytes from input 2 to 1 outputs - VMOVDQU (SI), Y6 - VMOVDQU 32(SI), Y5 - ADDQ $0x40, SI - VPSRLQ $0x04, Y6, Y7 - VPSRLQ $0x04, Y5, Y8 - VPAND Y2, Y6, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y7, Y7 - VPAND Y2, Y8, Y8 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y5 - VPSHUFB Y6, Y3, Y3 - VPSHUFB Y8, Y4, Y6 - VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y13 + ADDQ $0x20, SI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1024(CX), Y11 + VMOVDQU 1056(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 1088(CX), Y11 + VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 - // Load and process 64 bytes from input 3 to 1 outputs - VMOVDQU (DI), Y6 - VMOVDQU 32(DI), Y5 - ADDQ $0x40, DI - VPSRLQ $0x04, Y6, Y7 - VPSRLQ $0x04, Y5, Y8 - VPAND Y2, Y6, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y7, Y7 - VPAND Y2, Y8, Y8 - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y5 - VPSHUFB Y6, Y3, Y3 - VPSHUFB Y8, Y4, Y6 - VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y13 + ADDQ $0x20, DI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1280(CX), Y11 + VMOVDQU 1312(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 1344(CX), Y11 + VMOVDQU 1376(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 1408(CX), Y11 + VMOVDQU 1440(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 1472(CX), Y11 + VMOVDQU 1504(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 1536(CX), Y11 + VMOVDQU 1568(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 1600(CX), Y11 + VMOVDQU 1632(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1664(CX), Y11 + VMOVDQU 1696(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 1728(CX), Y11 + VMOVDQU 1760(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1792(CX), Y11 + VMOVDQU 1824(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1856(CX), Y11 + VMOVDQU 1888(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 - // Load and process 64 bytes from input 4 to 1 outputs - VMOVDQU (R8), Y6 - VMOVDQU 32(R8), Y5 - ADDQ $0x40, R8 - VPSRLQ $0x04, Y6, Y7 - VPSRLQ $0x04, Y5, Y8 - VPAND Y2, Y6, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y7, Y7 - VPAND Y2, Y8, Y8 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y5 - VPSHUFB Y6, Y3, Y3 - VPSHUFB Y8, Y4, Y6 - VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y13 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1920(CX), Y11 + VMOVDQU 1952(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 1984(CX), Y11 + VMOVDQU 2016(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 2048(CX), Y11 + VMOVDQU 2080(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 2112(CX), Y11 + VMOVDQU 2144(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 2176(CX), Y11 + VMOVDQU 2208(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 2240(CX), Y11 + VMOVDQU 2272(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 2304(CX), Y11 + VMOVDQU 2336(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 2368(CX), Y11 + VMOVDQU 2400(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 2432(CX), Y11 + VMOVDQU 2464(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 2496(CX), Y11 + VMOVDQU 2528(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 - // Load and process 64 bytes from input 5 to 1 outputs - VMOVDQU (AX), Y6 - VMOVDQU 32(AX), Y5 - ADDQ $0x40, AX - VPSRLQ $0x04, Y6, Y7 - VPSRLQ $0x04, Y5, Y8 - VPAND Y2, Y6, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y7, Y7 - VPAND Y2, Y8, Y8 - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y5 - VPSHUFB Y6, Y3, Y3 - VPSHUFB Y8, Y4, Y6 - VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 2560(CX), Y11 + VMOVDQU 2592(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 2624(CX), Y11 + VMOVDQU 2656(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 2688(CX), Y11 + VMOVDQU 2720(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 2752(CX), Y11 + VMOVDQU 2784(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 2816(CX), Y11 + VMOVDQU 2848(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 2880(CX), Y11 + VMOVDQU 2912(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 2944(CX), Y11 + VMOVDQU 2976(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 3008(CX), Y11 + VMOVDQU 3040(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 3072(CX), Y11 + VMOVDQU 3104(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 3136(CX), Y11 + VMOVDQU 3168(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 - // Store 1 outputs - MOVQ (R9), R12 - VMOVDQU Y0, (R12)(R10*1) - VMOVDQU Y1, 32(R12)(R10*1) + // Store 10 outputs + MOVQ (R9), R11 + VMOVDQU Y0, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y1, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y2, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU Y3, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y4, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y7, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + MOVQ 216(R9), R11 + VMOVDQU Y9, (R11)(R10*1) // Prepare for next loop - ADDQ $0x40, R10 - DECQ R11 - JNZ mulAvxTwo_6x1_64_loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvxTwo_5x10Xor_loop VZEROUPPER -mulAvxTwo_6x1_64_end: +mulAvxTwo_5x10Xor_end: RET -// func mulAvxTwo_6x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_6x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_6x2(SB), NOSPLIT, $0-88 - // Loading no tables to registers +TEXT ·mulAvxTwo_6x1(SB), NOSPLIT, $0-88 + // Loading all tables to registers // Destination kept in GP registers - // Full registers estimated 31 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_6x2_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), DX - MOVQ out_base+48(FP), R10 - MOVQ (R10), R11 - MOVQ 24(R10), R10 - MOVQ start+72(FP), R12 - - // Add start offset to output - ADDQ R12, R11 - ADDQ R12, R10 - - // Add start offset to input - ADDQ R12, BX - ADDQ R12, SI - ADDQ R12, DI - ADDQ R12, R8 - ADDQ R12, R9 - ADDQ R12, DX - MOVQ $0x0000000f, R12 - MOVQ R12, X2 - VPBROADCASTB X2, Y2 + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_6x1_end + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y1 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ (R9), R9 + MOVQ start+72(FP), R10 -mulAvxTwo_6x2_loop: - // Clear 2 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 + // Add start offset to output + ADDQ R10, R9 - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, CX + MOVQ $0x0000000f, R10 + MOVQ R10, X13 + VPBROADCASTB X13, Y13 + +mulAvxTwo_6x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VPSRLQ $0x04, Y14, Y15 + VPAND Y13, Y14, Y14 + VPAND Y13, Y15, Y15 + VPSHUFB Y14, Y0, Y14 + VPSHUFB Y15, Y1, Y15 + VPXOR Y14, Y15, Y12 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y14 ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 + VPSRLQ $0x04, Y14, Y15 + VPAND Y13, Y14, Y14 + VPAND Y13, Y15, Y15 + VPSHUFB Y14, Y2, Y14 + VPSHUFB Y15, Y3, Y15 + VPXOR Y14, Y15, Y14 + VPXOR Y14, Y12, Y12 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VPSRLQ $0x04, Y14, Y15 + VPAND Y13, Y14, Y14 + VPAND Y13, Y15, Y15 + VPSHUFB Y14, Y4, Y14 + VPSHUFB Y15, Y5, Y15 + VPXOR Y14, Y15, Y14 + VPXOR Y14, Y12, Y12 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VPSRLQ $0x04, Y14, Y15 + VPAND Y13, Y14, Y14 + VPAND Y13, Y15, Y15 + VPSHUFB Y14, Y6, Y14 + VPSHUFB Y15, Y7, Y15 + VPXOR Y14, Y15, Y14 + VPXOR Y14, Y12, Y12 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y14, Y15 + VPAND Y13, Y14, Y14 + VPAND Y13, Y15, Y15 + VPSHUFB Y14, Y8, Y14 + VPSHUFB Y15, Y9, Y15 + VPXOR Y14, Y15, Y14 + VPXOR Y14, Y12, Y12 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (CX), Y14 + ADDQ $0x20, CX + VPSRLQ $0x04, Y14, Y15 + VPAND Y13, Y14, Y14 + VPAND Y13, Y15, Y15 + VPSHUFB Y14, Y10, Y14 + VPSHUFB Y15, Y11, Y15 + VPXOR Y14, Y15, Y14 + VPXOR Y14, Y12, Y12 + + // Store 1 outputs + VMOVDQU Y12, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_6x1_loop + VZEROUPPER + +mulAvxTwo_6x1_end: + RET + +// func mulAvxTwo_6x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_6x1_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_6x1_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R10 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_6x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 VMOVDQU (CX), Y3 VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y0 + VPXOR Y5, Y6, Y1 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 VMOVDQU 128(CX), Y3 VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU (R8), Y6 + VMOVDQU 32(R8), Y5 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 - // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU (R9), Y6 + VMOVDQU 32(R9), Y5 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 VMOVDQU 256(CX), Y3 VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 - - // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 + VPXOR Y5, Y1, Y1 - // Load and process 32 bytes from input 4 to 2 outputs - VMOVDQU (R9), Y5 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 - - // Load and process 32 bytes from input 5 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 640(CX), Y3 - VMOVDQU 672(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 - VMOVDQU 704(CX), Y3 - VMOVDQU 736(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 + VPXOR Y5, Y1, Y1 - // Store 2 outputs - VMOVDQU Y0, (R11) - ADDQ $0x20, R11 - VMOVDQU Y1, (R10) - ADDQ $0x20, R10 + // Store 1 outputs + VMOVDQU Y0, (R10) + VMOVDQU Y1, 32(R10) + ADDQ $0x40, R10 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_6x2_loop + JNZ mulAvxTwo_6x1_64_loop VZEROUPPER -mulAvxTwo_6x2_end: +mulAvxTwo_6x1_64_end: RET -// func mulAvxTwo_6x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_6x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_6x2_64(SB), $0-88 - // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 31 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x06, AX - TESTQ AX, AX - JZ mulAvxTwo_6x2_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), AX - MOVQ out_base+48(FP), R9 - MOVQ out_base+48(FP), R9 - MOVQ start+72(FP), R10 +TEXT ·mulAvxTwo_6x1Xor(SB), NOSPLIT, $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_6x1Xor_end + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y1 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ (R9), R9 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 // Add start offset to input ADDQ R10, DX @@ -12076,86 +22690,539 @@ TEXT ·mulAvxTwo_6x2_64(SB), $0-88 ADDQ R10, SI ADDQ R10, DI ADDQ R10, R8 - ADDQ R10, AX - MOVQ $0x0000000f, R11 - MOVQ R11, X4 - VPBROADCASTB X4, Y4 - MOVQ n+80(FP), R11 - SHRQ $0x06, R11 + ADDQ R10, CX + MOVQ $0x0000000f, R10 + MOVQ R10, X13 + VPBROADCASTB X13, Y13 -mulAvxTwo_6x2_64_loop: - // Clear 2 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 +mulAvxTwo_6x1Xor_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VPSRLQ $0x04, Y14, Y15 + VPAND Y13, Y14, Y14 + VPAND Y13, Y15, Y15 + VMOVDQU (R9), Y12 + VPSHUFB Y14, Y0, Y14 + VPSHUFB Y15, Y1, Y15 + VPXOR Y14, Y15, Y14 + VPXOR Y14, Y12, Y12 - // Load and process 64 bytes from input 0 to 2 outputs - VMOVDQU (DX), Y9 - VMOVDQU 32(DX), Y11 - ADDQ $0x40, DX - VPSRLQ $0x04, Y9, Y10 - VPSRLQ $0x04, Y11, Y12 - VPAND Y4, Y9, Y9 - VPAND Y4, Y11, Y11 - VPAND Y4, Y10, Y10 - VPAND Y4, Y12, Y12 - VMOVDQU (CX), Y5 - VMOVDQU 32(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 - VMOVDQU 64(CX), Y5 - VMOVDQU 96(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VPSRLQ $0x04, Y14, Y15 + VPAND Y13, Y14, Y14 + VPAND Y13, Y15, Y15 + VPSHUFB Y14, Y2, Y14 + VPSHUFB Y15, Y3, Y15 + VPXOR Y14, Y15, Y14 + VPXOR Y14, Y12, Y12 - // Load and process 64 bytes from input 1 to 2 outputs - VMOVDQU (BX), Y9 - VMOVDQU 32(BX), Y11 - ADDQ $0x40, BX - VPSRLQ $0x04, Y9, Y10 - VPSRLQ $0x04, Y11, Y12 - VPAND Y4, Y9, Y9 - VPAND Y4, Y11, Y11 - VPAND Y4, Y10, Y10 - VPAND Y4, Y12, Y12 - VMOVDQU 128(CX), Y5 - VMOVDQU 160(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 - VMOVDQU 192(CX), Y5 - VMOVDQU 224(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VPSRLQ $0x04, Y14, Y15 + VPAND Y13, Y14, Y14 + VPAND Y13, Y15, Y15 + VPSHUFB Y14, Y4, Y14 + VPSHUFB Y15, Y5, Y15 + VPXOR Y14, Y15, Y14 + VPXOR Y14, Y12, Y12 - // Load and process 64 bytes from input 2 to 2 outputs - VMOVDQU (SI), Y9 - VMOVDQU 32(SI), Y11 - ADDQ $0x40, SI + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VPSRLQ $0x04, Y14, Y15 + VPAND Y13, Y14, Y14 + VPAND Y13, Y15, Y15 + VPSHUFB Y14, Y6, Y14 + VPSHUFB Y15, Y7, Y15 + VPXOR Y14, Y15, Y14 + VPXOR Y14, Y12, Y12 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y14, Y15 + VPAND Y13, Y14, Y14 + VPAND Y13, Y15, Y15 + VPSHUFB Y14, Y8, Y14 + VPSHUFB Y15, Y9, Y15 + VPXOR Y14, Y15, Y14 + VPXOR Y14, Y12, Y12 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (CX), Y14 + ADDQ $0x20, CX + VPSRLQ $0x04, Y14, Y15 + VPAND Y13, Y14, Y14 + VPAND Y13, Y15, Y15 + VPSHUFB Y14, Y10, Y14 + VPSHUFB Y15, Y11, Y15 + VPXOR Y14, Y15, Y14 + VPXOR Y14, Y12, Y12 + + // Store 1 outputs + VMOVDQU Y12, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_6x1Xor_loop + VZEROUPPER + +mulAvxTwo_6x1Xor_end: + RET + +// func mulAvxTwo_6x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_6x1_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_6x1_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R10 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_6x1_64Xor_loop: + // Load 1 outputs + VMOVDQU (R10), Y0 + VMOVDQU 32(R10), Y1 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU (R8), Y6 + VMOVDQU 32(R8), Y5 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU (R9), Y6 + VMOVDQU 32(R9), Y5 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Store 1 outputs + VMOVDQU Y0, (R10) + VMOVDQU Y1, 32(R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_6x1_64Xor_loop + VZEROUPPER + +mulAvxTwo_6x1_64Xor_end: + RET + +// func mulAvxTwo_6x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_6x2(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 31 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_6x2_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R10 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + ADDQ R12, R10 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_6x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y5 + ADDQ $0x20, BX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y0 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y1 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y5 + ADDQ $0x20, SI + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y5 + ADDQ $0x20, DI + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y5 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y5 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 512(CX), Y3 + VMOVDQU 544(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 576(CX), Y3 + VMOVDQU 608(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (DX), Y5 + ADDQ $0x20, DX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 640(CX), Y3 + VMOVDQU 672(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 704(CX), Y3 + VMOVDQU 736(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Store 2 outputs + VMOVDQU Y0, (R11) + ADDQ $0x20, R11 + VMOVDQU Y1, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_6x2_loop + VZEROUPPER + +mulAvxTwo_6x2_end: + RET + +// func mulAvxTwo_6x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_6x2_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 57 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_6x2_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R10 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + ADDQ R12, R10 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_6x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y0 + VPXOR Y7, Y8, Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y2 + VPXOR Y7, Y8, Y3 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -12184,9 +23251,9 @@ mulAvxTwo_6x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 3 to 2 outputs - VMOVDQU (DI), Y9 - VMOVDQU 32(DI), Y11 - ADDQ $0x40, DI + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y11 + ADDQ $0x40, R8 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -12215,9 +23282,9 @@ mulAvxTwo_6x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 4 to 2 outputs - VMOVDQU (R8), Y9 - VMOVDQU 32(R8), Y11 - ADDQ $0x40, R8 + VMOVDQU (R9), Y9 + VMOVDQU 32(R9), Y11 + ADDQ $0x40, R9 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -12246,9 +23313,9 @@ mulAvxTwo_6x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 5 to 2 outputs - VMOVDQU (AX), Y9 - VMOVDQU 32(AX), Y11 - ADDQ $0x40, AX + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -12277,33 +23344,32 @@ mulAvxTwo_6x2_64_loop: VPXOR Y7, Y3, Y3 // Store 2 outputs - MOVQ (R9), R12 - VMOVDQU Y0, (R12)(R10*1) - VMOVDQU Y1, 32(R12)(R10*1) - MOVQ 24(R9), R12 - VMOVDQU Y2, (R12)(R10*1) - VMOVDQU Y3, 32(R12)(R10*1) + VMOVDQU Y0, (R11) + VMOVDQU Y1, 32(R11) + ADDQ $0x40, R11 + VMOVDQU Y2, (R10) + VMOVDQU Y3, 32(R10) + ADDQ $0x40, R10 // Prepare for next loop - ADDQ $0x40, R10 - DECQ R11 + DECQ AX JNZ mulAvxTwo_6x2_64_loop VZEROUPPER mulAvxTwo_6x2_64_end: RET -// func mulAvxTwo_6x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_6x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_6x3(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_6x2Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers - // Full registers estimated 44 YMM used + // Full registers estimated 31 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_6x3_end + JZ mulAvxTwo_6x2Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI @@ -12313,17 +23379,434 @@ TEXT ·mulAvxTwo_6x3(SB), NOSPLIT, $0-88 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ (R10), R11 - MOVQ 24(R10), R12 - MOVQ 48(R10), R10 - MOVQ start+72(FP), R13 + MOVQ 24(R10), R10 + MOVQ start+72(FP), R12 // Add start offset to output - ADDQ R13, R11 - ADDQ R13, R12 - ADDQ R13, R10 + ADDQ R12, R11 + ADDQ R12, R10 // Add start offset to input - ADDQ R13, BX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_6x2Xor_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y5 + ADDQ $0x20, BX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU (R11), Y0 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU (R10), Y1 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y5 + ADDQ $0x20, SI + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y5 + ADDQ $0x20, DI + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y5 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y5 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 512(CX), Y3 + VMOVDQU 544(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 576(CX), Y3 + VMOVDQU 608(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (DX), Y5 + ADDQ $0x20, DX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 640(CX), Y3 + VMOVDQU 672(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 704(CX), Y3 + VMOVDQU 736(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Store 2 outputs + VMOVDQU Y0, (R11) + ADDQ $0x20, R11 + VMOVDQU Y1, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_6x2Xor_loop + VZEROUPPER + +mulAvxTwo_6x2Xor_end: + RET + +// func mulAvxTwo_6x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_6x2_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 57 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_6x2_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R10 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + ADDQ R12, R10 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_6x2_64Xor_loop: + // Load 2 outputs + VMOVDQU (R11), Y0 + VMOVDQU 32(R11), Y1 + VMOVDQU (R10), Y2 + VMOVDQU 32(R10), Y3 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y11 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y9 + VMOVDQU 32(R9), Y11 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Store 2 outputs + VMOVDQU Y0, (R11) + VMOVDQU Y1, 32(R11) + ADDQ $0x40, R11 + VMOVDQU Y2, (R10) + VMOVDQU Y3, 32(R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_6x2_64Xor_loop + VZEROUPPER + +mulAvxTwo_6x2_64Xor_end: + RET + +// func mulAvxTwo_6x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_6x3(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_6x3_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R10 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R10 + + // Add start offset to input + ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 @@ -12334,11 +23817,6 @@ TEXT ·mulAvxTwo_6x3(SB), NOSPLIT, $0-88 VPBROADCASTB X3, Y3 mulAvxTwo_6x3_loop: - // Clear 3 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - // Load and process 32 bytes from input 0 to 3 outputs VMOVDQU (BX), Y6 ADDQ $0x20, BX @@ -12349,20 +23827,17 @@ mulAvxTwo_6x3_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + VPXOR Y4, Y5, Y0 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + VPXOR Y4, Y5, Y1 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 + VPXOR Y4, Y5, Y2 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y6 @@ -12509,50 +23984,48 @@ mulAvxTwo_6x3_end: // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x3_64(SB), $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 44 YMM used + // Destination kept in GP registers + // Full registers estimated 82 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_6x3_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), AX - MOVQ out_base+48(FP), R9 - MOVQ out_base+48(FP), R9 - MOVQ start+72(FP), R10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R10 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R10 // Add start offset to input - ADDQ R10, DX - ADDQ R10, BX - ADDQ R10, SI - ADDQ R10, DI - ADDQ R10, R8 - ADDQ R10, AX - MOVQ $0x0000000f, R11 - MOVQ R11, X6 + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X6 VPBROADCASTB X6, Y6 - MOVQ n+80(FP), R11 - SHRQ $0x06, R11 mulAvxTwo_6x3_64_loop: - // Clear 3 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - // Load and process 64 bytes from input 0 to 3 outputs - VMOVDQU (DX), Y11 - VMOVDQU 32(DX), Y13 - ADDQ $0x40, DX + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -12565,35 +24038,29 @@ mulAvxTwo_6x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + VPXOR Y7, Y8, Y0 + VPXOR Y9, Y10, Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + VPXOR Y7, Y8, Y2 + VPXOR Y9, Y10, Y3 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 + VPXOR Y7, Y8, Y4 + VPXOR Y9, Y10, Y5 // Load and process 64 bytes from input 1 to 3 outputs - VMOVDQU (BX), Y11 - VMOVDQU 32(BX), Y13 - ADDQ $0x40, BX + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -12632,9 +24099,9 @@ mulAvxTwo_6x3_64_loop: VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 2 to 3 outputs - VMOVDQU (SI), Y11 - VMOVDQU 32(SI), Y13 - ADDQ $0x40, SI + VMOVDQU (DI), Y11 + VMOVDQU 32(DI), Y13 + ADDQ $0x40, DI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -12673,9 +24140,9 @@ mulAvxTwo_6x3_64_loop: VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 3 to 3 outputs - VMOVDQU (DI), Y11 - VMOVDQU 32(DI), Y13 - ADDQ $0x40, DI + VMOVDQU (R8), Y11 + VMOVDQU 32(R8), Y13 + ADDQ $0x40, R8 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -12714,9 +24181,9 @@ mulAvxTwo_6x3_64_loop: VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 4 to 3 outputs - VMOVDQU (R8), Y11 - VMOVDQU 32(R8), Y13 - ADDQ $0x40, R8 + VMOVDQU (R9), Y11 + VMOVDQU 32(R9), Y13 + ADDQ $0x40, R9 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -12755,9 +24222,9 @@ mulAvxTwo_6x3_64_loop: VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 5 to 3 outputs - VMOVDQU (AX), Y11 - VMOVDQU 32(AX), Y13 - ADDQ $0x40, AX + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -12796,36 +24263,35 @@ mulAvxTwo_6x3_64_loop: VPXOR Y9, Y5, Y5 // Store 3 outputs - MOVQ (R9), R12 - VMOVDQU Y0, (R12)(R10*1) - VMOVDQU Y1, 32(R12)(R10*1) - MOVQ 24(R9), R12 - VMOVDQU Y2, (R12)(R10*1) - VMOVDQU Y3, 32(R12)(R10*1) - MOVQ 48(R9), R12 - VMOVDQU Y4, (R12)(R10*1) - VMOVDQU Y5, 32(R12)(R10*1) - - // Prepare for next loop - ADDQ $0x40, R10 - DECQ R11 - JNZ mulAvxTwo_6x3_64_loop - VZEROUPPER - -mulAvxTwo_6x3_64_end: - RET - -// func mulAvxTwo_6x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) + VMOVDQU Y0, (R11) + VMOVDQU Y1, 32(R11) + ADDQ $0x40, R11 + VMOVDQU Y2, (R12) + VMOVDQU Y3, 32(R12) + ADDQ $0x40, R12 + VMOVDQU Y4, (R10) + VMOVDQU Y5, 32(R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_6x3_64_loop + VZEROUPPER + +mulAvxTwo_6x3_64_end: + RET + +// func mulAvxTwo_6x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_6x4(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_6x3Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers - // Full registers estimated 57 YMM used + // Full registers estimated 44 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_6x4_end + JZ mulAvxTwo_6x3Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI @@ -12836,249 +24302,206 @@ TEXT ·mulAvxTwo_6x4(SB), NOSPLIT, $0-88 MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R12 - MOVQ 48(R10), R13 - MOVQ 72(R10), R10 - MOVQ start+72(FP), R14 + MOVQ 48(R10), R10 + MOVQ start+72(FP), R13 // Add start offset to output - ADDQ R14, R11 - ADDQ R14, R12 - ADDQ R14, R13 - ADDQ R14, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R10 // Add start offset to input - ADDQ R14, BX - ADDQ R14, SI - ADDQ R14, DI - ADDQ R14, R8 - ADDQ R14, R9 - ADDQ R14, DX - MOVQ $0x0000000f, R14 - MOVQ R14, X4 - VPBROADCASTB X4, Y4 - -mulAvxTwo_6x4_loop: - // Clear 4 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X3 + VPBROADCASTB X3, Y3 - // Load and process 32 bytes from input 0 to 4 outputs - VMOVDQU (BX), Y7 +mulAvxTwo_6x3Xor_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y6 ADDQ $0x20, BX - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU (CX), Y5 - VMOVDQU 32(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 - VMOVDQU 64(CX), Y5 - VMOVDQU 96(CX), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU (R11), Y0 + VMOVDQU (CX), Y4 + VMOVDQU 32(CX), Y5 + VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 128(CX), Y5 - VMOVDQU 160(CX), Y6 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU (R12), Y1 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y5 + VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 - VMOVDQU 192(CX), Y5 - VMOVDQU 224(CX), Y6 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU (R10), Y2 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 - // Load and process 32 bytes from input 1 to 4 outputs - VMOVDQU (SI), Y7 + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y6 ADDQ $0x20, SI - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU 256(CX), Y5 - VMOVDQU 288(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 - VMOVDQU 320(CX), Y5 - VMOVDQU 352(CX), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 192(CX), Y4 + VMOVDQU 224(CX), Y5 + VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 384(CX), Y5 - VMOVDQU 416(CX), Y6 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 256(CX), Y4 + VMOVDQU 288(CX), Y5 + VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 - VMOVDQU 448(CX), Y5 - VMOVDQU 480(CX), Y6 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 320(CX), Y4 + VMOVDQU 352(CX), Y5 + VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 - // Load and process 32 bytes from input 2 to 4 outputs - VMOVDQU (DI), Y7 + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y6 ADDQ $0x20, DI - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU 512(CX), Y5 - VMOVDQU 544(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 - VMOVDQU 576(CX), Y5 - VMOVDQU 608(CX), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 384(CX), Y4 + VMOVDQU 416(CX), Y5 + VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 640(CX), Y5 - VMOVDQU 672(CX), Y6 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 448(CX), Y4 + VMOVDQU 480(CX), Y5 + VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 - VMOVDQU 704(CX), Y5 - VMOVDQU 736(CX), Y6 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 512(CX), Y4 + VMOVDQU 544(CX), Y5 + VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 - // Load and process 32 bytes from input 3 to 4 outputs - VMOVDQU (R8), Y7 + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y6 ADDQ $0x20, R8 - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU 768(CX), Y5 - VMOVDQU 800(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 - VMOVDQU 832(CX), Y5 - VMOVDQU 864(CX), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 576(CX), Y4 + VMOVDQU 608(CX), Y5 + VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 896(CX), Y5 - VMOVDQU 928(CX), Y6 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 640(CX), Y4 + VMOVDQU 672(CX), Y5 + VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 - VMOVDQU 960(CX), Y5 - VMOVDQU 992(CX), Y6 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 704(CX), Y4 + VMOVDQU 736(CX), Y5 + VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 - // Load and process 32 bytes from input 4 to 4 outputs - VMOVDQU (R9), Y7 + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y6 ADDQ $0x20, R9 - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU 1024(CX), Y5 - VMOVDQU 1056(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 - VMOVDQU 1088(CX), Y5 - VMOVDQU 1120(CX), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 768(CX), Y4 + VMOVDQU 800(CX), Y5 + VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 1152(CX), Y5 - VMOVDQU 1184(CX), Y6 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 832(CX), Y4 + VMOVDQU 864(CX), Y5 + VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 - VMOVDQU 1216(CX), Y5 - VMOVDQU 1248(CX), Y6 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 896(CX), Y4 + VMOVDQU 928(CX), Y5 + VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 - // Load and process 32 bytes from input 5 to 4 outputs - VMOVDQU (DX), Y7 + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (DX), Y6 ADDQ $0x20, DX - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU 1280(CX), Y5 - VMOVDQU 1312(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 - VMOVDQU 1344(CX), Y5 - VMOVDQU 1376(CX), Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 960(CX), Y4 + VMOVDQU 992(CX), Y5 + VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 1408(CX), Y5 - VMOVDQU 1440(CX), Y6 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1024(CX), Y4 + VMOVDQU 1056(CX), Y5 + VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 - VMOVDQU 1472(CX), Y5 - VMOVDQU 1504(CX), Y6 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1088(CX), Y4 + VMOVDQU 1120(CX), Y5 + VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 - // Store 4 outputs + // Store 3 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 VMOVDQU Y1, (R12) ADDQ $0x20, R12 - VMOVDQU Y2, (R13) - ADDQ $0x20, R13 - VMOVDQU Y3, (R10) + VMOVDQU Y2, (R10) ADDQ $0x20, R10 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_6x4_loop + JNZ mulAvxTwo_6x3Xor_loop VZEROUPPER -mulAvxTwo_6x4_end: +mulAvxTwo_6x3Xor_end: RET -// func mulAvxTwo_6x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_6x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_6x5(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_6x3_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers - // Full registers estimated 70 YMM used + // Full registers estimated 82 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX + SHRQ $0x06, AX TESTQ AX, AX - JZ mulAvxTwo_6x5_end + JZ mulAvxTwo_6x3_64Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI @@ -13087,1005 +24510,807 @@ TEXT ·mulAvxTwo_6x5(SB), NOSPLIT, $0-88 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R12 - MOVQ 48(R10), R13 - MOVQ 72(R10), R14 - MOVQ 96(R10), R10 - MOVQ start+72(FP), R15 + MOVQ 48(R10), R10 + MOVQ start+72(FP), R13 // Add start offset to output - ADDQ R15, R11 - ADDQ R15, R12 - ADDQ R15, R13 - ADDQ R15, R14 - ADDQ R15, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R10 // Add start offset to input - ADDQ R15, BX - ADDQ R15, SI - ADDQ R15, DI - ADDQ R15, R8 - ADDQ R15, R9 - ADDQ R15, DX - MOVQ $0x0000000f, R15 - MOVQ R15, X5 - VPBROADCASTB X5, Y5 - -mulAvxTwo_6x5_loop: - // Clear 5 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X6 + VPBROADCASTB X6, Y6 - // Load and process 32 bytes from input 0 to 5 outputs - VMOVDQU (BX), Y8 - ADDQ $0x20, BX - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU (CX), Y6 - VMOVDQU 32(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 64(CX), Y6 - VMOVDQU 96(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 128(CX), Y6 - VMOVDQU 160(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 192(CX), Y6 - VMOVDQU 224(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 256(CX), Y6 - VMOVDQU 288(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +mulAvxTwo_6x3_64Xor_loop: + // Load 3 outputs + VMOVDQU (R11), Y0 + VMOVDQU 32(R11), Y1 + VMOVDQU (R12), Y2 + VMOVDQU 32(R12), Y3 + VMOVDQU (R10), Y4 + VMOVDQU 32(R10), Y5 - // Load and process 32 bytes from input 1 to 5 outputs - VMOVDQU (SI), Y8 - ADDQ $0x20, SI - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU 320(CX), Y6 - VMOVDQU 352(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 384(CX), Y6 - VMOVDQU 416(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 448(CX), Y6 - VMOVDQU 480(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 512(CX), Y6 - VMOVDQU 544(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 576(CX), Y6 - VMOVDQU 608(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 - // Load and process 32 bytes from input 2 to 5 outputs - VMOVDQU (DI), Y8 - ADDQ $0x20, DI - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU 640(CX), Y6 - VMOVDQU 672(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 704(CX), Y6 - VMOVDQU 736(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 768(CX), Y6 - VMOVDQU 800(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 832(CX), Y6 - VMOVDQU 864(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 896(CX), Y6 - VMOVDQU 928(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 - - // Load and process 32 bytes from input 3 to 5 outputs - VMOVDQU (R8), Y8 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU 960(CX), Y6 - VMOVDQU 992(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 1024(CX), Y6 - VMOVDQU 1056(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 1088(CX), Y6 - VMOVDQU 1120(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 1152(CX), Y6 - VMOVDQU 1184(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 1216(CX), Y6 - VMOVDQU 1248(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 - - // Load and process 32 bytes from input 4 to 5 outputs - VMOVDQU (R9), Y8 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU 1280(CX), Y6 - VMOVDQU 1312(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 1344(CX), Y6 - VMOVDQU 1376(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 1408(CX), Y6 - VMOVDQU 1440(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 1472(CX), Y6 - VMOVDQU 1504(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 1536(CX), Y6 - VMOVDQU 1568(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 - - // Load and process 32 bytes from input 5 to 5 outputs - VMOVDQU (DX), Y8 - ADDQ $0x20, DX - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU 1600(CX), Y6 - VMOVDQU 1632(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 1664(CX), Y6 - VMOVDQU 1696(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 1728(CX), Y6 - VMOVDQU 1760(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 1792(CX), Y6 - VMOVDQU 1824(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 1856(CX), Y6 - VMOVDQU 1888(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 - - // Store 5 outputs - VMOVDQU Y0, (R11) - ADDQ $0x20, R11 - VMOVDQU Y1, (R12) - ADDQ $0x20, R12 - VMOVDQU Y2, (R13) - ADDQ $0x20, R13 - VMOVDQU Y3, (R14) - ADDQ $0x20, R14 - VMOVDQU Y4, (R10) - ADDQ $0x20, R10 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_6x5_loop - VZEROUPPER - -mulAvxTwo_6x5_end: - RET - -// func mulAvxTwo_6x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_6x6(SB), NOSPLIT, $8-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 83 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_6x6_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), DX - MOVQ out_base+48(FP), R10 - MOVQ (R10), R11 - MOVQ 24(R10), R12 - MOVQ 48(R10), R13 - MOVQ 72(R10), R14 - MOVQ 96(R10), R15 - MOVQ 120(R10), R10 - MOVQ start+72(FP), BP - - // Add start offset to output - ADDQ BP, R11 - ADDQ BP, R12 - ADDQ BP, R13 - ADDQ BP, R14 - ADDQ BP, R15 - ADDQ BP, R10 - - // Add start offset to input - ADDQ BP, BX - ADDQ BP, SI - ADDQ BP, DI - ADDQ BP, R8 - ADDQ BP, R9 - ADDQ BP, DX - MOVQ $0x0000000f, BP - MOVQ BP, X6 - VPBROADCASTB X6, Y6 - -mulAvxTwo_6x6_loop: - // Clear 6 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - - // Load and process 32 bytes from input 0 to 6 outputs - VMOVDQU (BX), Y9 - ADDQ $0x20, BX - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 - VMOVDQU (CX), Y7 - VMOVDQU 32(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 - VMOVDQU 64(CX), Y7 - VMOVDQU 96(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 - VMOVDQU 128(CX), Y7 - VMOVDQU 160(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 - // Load and process 32 bytes from input 1 to 6 outputs - VMOVDQU (SI), Y9 - ADDQ $0x20, SI - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y11 + VMOVDQU 32(DI), Y13 + ADDQ $0x40, DI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y11 + VMOVDQU 32(R8), Y13 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 - // Load and process 32 bytes from input 2 to 6 outputs - VMOVDQU (DI), Y9 - ADDQ $0x20, DI - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y11 + VMOVDQU 32(R9), Y13 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 VMOVDQU 768(CX), Y7 VMOVDQU 800(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 - - // Load and process 32 bytes from input 3 to 6 outputs - VMOVDQU (R8), Y9 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 - VMOVDQU 1152(CX), Y7 - VMOVDQU 1184(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 - VMOVDQU 1216(CX), Y7 - VMOVDQU 1248(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 - VMOVDQU 1280(CX), Y7 - VMOVDQU 1312(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 - VMOVDQU 1344(CX), Y7 - VMOVDQU 1376(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 - VMOVDQU 1408(CX), Y7 - VMOVDQU 1440(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 - VMOVDQU 1472(CX), Y7 - VMOVDQU 1504(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPXOR Y9, Y5, Y5 - // Load and process 32 bytes from input 4 to 6 outputs - VMOVDQU (R9), Y9 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 - VMOVDQU 1536(CX), Y7 - VMOVDQU 1568(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 - VMOVDQU 1600(CX), Y7 - VMOVDQU 1632(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 - VMOVDQU 1664(CX), Y7 - VMOVDQU 1696(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 - VMOVDQU 1728(CX), Y7 - VMOVDQU 1760(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 - VMOVDQU 1792(CX), Y7 - VMOVDQU 1824(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 - VMOVDQU 1856(CX), Y7 - VMOVDQU 1888(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + // Store 3 outputs + VMOVDQU Y0, (R11) + VMOVDQU Y1, 32(R11) + ADDQ $0x40, R11 + VMOVDQU Y2, (R12) + VMOVDQU Y3, 32(R12) + ADDQ $0x40, R12 + VMOVDQU Y4, (R10) + VMOVDQU Y5, 32(R10) + ADDQ $0x40, R10 - // Load and process 32 bytes from input 5 to 6 outputs - VMOVDQU (DX), Y9 + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_6x3_64Xor_loop + VZEROUPPER + +mulAvxTwo_6x3_64Xor_end: + RET + +// func mulAvxTwo_6x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_6x4(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 57 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_6x4_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R10 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R10 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_6x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y0 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y1 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y2 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y3 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y7 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (DX), Y7 ADDQ $0x20, DX - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 - VMOVDQU 1920(CX), Y7 - VMOVDQU 1952(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 - VMOVDQU 1984(CX), Y7 - VMOVDQU 2016(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 - VMOVDQU 2048(CX), Y7 - VMOVDQU 2080(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 - VMOVDQU 2112(CX), Y7 - VMOVDQU 2144(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 - VMOVDQU 2176(CX), Y7 - VMOVDQU 2208(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 - VMOVDQU 2240(CX), Y7 - VMOVDQU 2272(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1280(CX), Y5 + VMOVDQU 1312(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1344(CX), Y5 + VMOVDQU 1376(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1408(CX), Y5 + VMOVDQU 1440(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1472(CX), Y5 + VMOVDQU 1504(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 - // Store 6 outputs + // Store 4 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 VMOVDQU Y1, (R12) ADDQ $0x20, R12 VMOVDQU Y2, (R13) ADDQ $0x20, R13 - VMOVDQU Y3, (R14) - ADDQ $0x20, R14 - VMOVDQU Y4, (R15) - ADDQ $0x20, R15 - VMOVDQU Y5, (R10) + VMOVDQU Y3, (R10) ADDQ $0x20, R10 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_6x6_loop + JNZ mulAvxTwo_6x4_loop VZEROUPPER -mulAvxTwo_6x6_end: +mulAvxTwo_6x4_end: RET -// func mulAvxTwo_6x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_6x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_6x7(SB), NOSPLIT, $8-88 +TEXT ·mulAvxTwo_6x4Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers - // Full registers estimated 96 YMM used + // Full registers estimated 57 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_6x7_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), AX - MOVQ out_base+48(FP), R9 - MOVQ (R9), R10 - MOVQ 24(R9), R11 - MOVQ 48(R9), R12 - MOVQ 72(R9), R13 - MOVQ 96(R9), R14 - MOVQ 120(R9), R15 - MOVQ 144(R9), R9 - MOVQ start+72(FP), BP + JZ mulAvxTwo_6x4Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R10 + MOVQ start+72(FP), R14 // Add start offset to output - ADDQ BP, R10 - ADDQ BP, R11 - ADDQ BP, R12 - ADDQ BP, R13 - ADDQ BP, R14 - ADDQ BP, R15 - ADDQ BP, R9 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R10 // Add start offset to input - ADDQ BP, DX - ADDQ BP, BX - ADDQ BP, SI - ADDQ BP, DI - ADDQ BP, R8 - ADDQ BP, AX - MOVQ $0x0000000f, BP - MOVQ BP, X7 - VPBROADCASTB X7, Y7 - MOVQ n+80(FP), BP - SHRQ $0x05, BP - -mulAvxTwo_6x7_loop: - // Clear 7 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - - // Load and process 32 bytes from input 0 to 7 outputs - VMOVDQU (DX), Y10 - ADDQ $0x20, DX - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU (CX), Y8 - VMOVDQU 32(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 64(CX), Y8 - VMOVDQU 96(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 128(CX), Y8 - VMOVDQU 160(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 192(CX), Y8 - VMOVDQU 224(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 256(CX), Y8 - VMOVDQU 288(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 320(CX), Y8 - VMOVDQU 352(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 384(CX), Y8 - VMOVDQU 416(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X4 + VPBROADCASTB X4, Y4 - // Load and process 32 bytes from input 1 to 7 outputs - VMOVDQU (BX), Y10 +mulAvxTwo_6x4Xor_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y7 ADDQ $0x20, BX - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU 448(CX), Y8 - VMOVDQU 480(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 512(CX), Y8 - VMOVDQU 544(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 576(CX), Y8 - VMOVDQU 608(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 640(CX), Y8 - VMOVDQU 672(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 704(CX), Y8 - VMOVDQU 736(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 768(CX), Y8 - VMOVDQU 800(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 832(CX), Y8 - VMOVDQU 864(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (R11), Y0 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU (R12), Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU (R13), Y2 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU (R10), Y3 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 - // Load and process 32 bytes from input 2 to 7 outputs - VMOVDQU (SI), Y10 + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y7 ADDQ $0x20, SI - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU 896(CX), Y8 - VMOVDQU 928(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 960(CX), Y8 - VMOVDQU 992(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 1024(CX), Y8 - VMOVDQU 1056(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 1088(CX), Y8 - VMOVDQU 1120(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 1152(CX), Y8 - VMOVDQU 1184(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 1216(CX), Y8 - VMOVDQU 1248(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 1280(CX), Y8 - VMOVDQU 1312(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 - // Load and process 32 bytes from input 3 to 7 outputs - VMOVDQU (DI), Y10 + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y7 ADDQ $0x20, DI - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU 1344(CX), Y8 - VMOVDQU 1376(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 1408(CX), Y8 - VMOVDQU 1440(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 1472(CX), Y8 - VMOVDQU 1504(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 1536(CX), Y8 - VMOVDQU 1568(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 1600(CX), Y8 - VMOVDQU 1632(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 1664(CX), Y8 - VMOVDQU 1696(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 1728(CX), Y8 - VMOVDQU 1760(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 - // Load and process 32 bytes from input 4 to 7 outputs - VMOVDQU (R8), Y10 + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y7 ADDQ $0x20, R8 - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU 1792(CX), Y8 - VMOVDQU 1824(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 1856(CX), Y8 - VMOVDQU 1888(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 1920(CX), Y8 - VMOVDQU 1952(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 1984(CX), Y8 - VMOVDQU 2016(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 2048(CX), Y8 - VMOVDQU 2080(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 2112(CX), Y8 - VMOVDQU 2144(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 2176(CX), Y8 - VMOVDQU 2208(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 - // Load and process 32 bytes from input 5 to 7 outputs - VMOVDQU (AX), Y10 - ADDQ $0x20, AX - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU 2240(CX), Y8 - VMOVDQU 2272(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 2304(CX), Y8 - VMOVDQU 2336(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 2368(CX), Y8 - VMOVDQU 2400(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 2432(CX), Y8 - VMOVDQU 2464(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 2496(CX), Y8 - VMOVDQU 2528(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 2560(CX), Y8 - VMOVDQU 2592(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 2624(CX), Y8 - VMOVDQU 2656(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y7 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 - // Store 7 outputs - VMOVDQU Y0, (R10) - ADDQ $0x20, R10 - VMOVDQU Y1, (R11) + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1280(CX), Y5 + VMOVDQU 1312(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1344(CX), Y5 + VMOVDQU 1376(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1408(CX), Y5 + VMOVDQU 1440(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1472(CX), Y5 + VMOVDQU 1504(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Store 4 outputs + VMOVDQU Y0, (R11) ADDQ $0x20, R11 - VMOVDQU Y2, (R12) + VMOVDQU Y1, (R12) ADDQ $0x20, R12 - VMOVDQU Y3, (R13) + VMOVDQU Y2, (R13) ADDQ $0x20, R13 - VMOVDQU Y4, (R14) - ADDQ $0x20, R14 - VMOVDQU Y5, (R15) - ADDQ $0x20, R15 - VMOVDQU Y6, (R9) - ADDQ $0x20, R9 + VMOVDQU Y3, (R10) + ADDQ $0x20, R10 // Prepare for next loop - DECQ BP - JNZ mulAvxTwo_6x7_loop + DECQ AX + JNZ mulAvxTwo_6x4Xor_loop VZEROUPPER -mulAvxTwo_6x7_end: +mulAvxTwo_6x4Xor_end: RET -// func mulAvxTwo_6x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_6x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_6x8(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_6x5(SB), NOSPLIT, $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 109 YMM used + // Destination kept in GP registers + // Full registers estimated 70 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_6x8_end + JZ mulAvxTwo_6x5_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI @@ -14094,398 +25319,280 @@ TEXT ·mulAvxTwo_6x8(SB), NOSPLIT, $0-88 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 - MOVQ start+72(FP), R11 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R10 + MOVQ start+72(FP), R15 - // Add start offset to input - ADDQ R11, BX - ADDQ R11, SI - ADDQ R11, DI - ADDQ R11, R8 - ADDQ R11, R9 - ADDQ R11, DX - MOVQ $0x0000000f, R12 - MOVQ R12, X8 - VPBROADCASTB X8, Y8 + // Add start offset to output + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R10 -mulAvxTwo_6x8_loop: - // Clear 8 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X5 + VPBROADCASTB X5, Y5 - // Load and process 32 bytes from input 0 to 8 outputs - VMOVDQU (BX), Y11 +mulAvxTwo_6x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y8 ADDQ $0x20, BX - VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU (CX), Y9 - VMOVDQU 32(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 64(CX), Y9 - VMOVDQU 96(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 - VMOVDQU 128(CX), Y9 - VMOVDQU 160(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 192(CX), Y9 - VMOVDQU 224(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 - VMOVDQU 256(CX), Y9 - VMOVDQU 288(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 320(CX), Y9 - VMOVDQU 352(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 - VMOVDQU 384(CX), Y9 - VMOVDQU 416(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 448(CX), Y9 - VMOVDQU 480(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y0 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y1 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y2 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y3 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y4 - // Load and process 32 bytes from input 1 to 8 outputs - VMOVDQU (SI), Y11 + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y8 ADDQ $0x20, SI - VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU 512(CX), Y9 - VMOVDQU 544(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 576(CX), Y9 - VMOVDQU 608(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 - VMOVDQU 640(CX), Y9 - VMOVDQU 672(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 704(CX), Y9 - VMOVDQU 736(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 - VMOVDQU 768(CX), Y9 - VMOVDQU 800(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 832(CX), Y9 - VMOVDQU 864(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 - VMOVDQU 896(CX), Y9 - VMOVDQU 928(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 960(CX), Y9 - VMOVDQU 992(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 - - // Load and process 32 bytes from input 2 to 8 outputs - VMOVDQU (DI), Y11 - ADDQ $0x20, DI - VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU 1024(CX), Y9 - VMOVDQU 1056(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 1088(CX), Y9 - VMOVDQU 1120(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 - VMOVDQU 1152(CX), Y9 - VMOVDQU 1184(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 1216(CX), Y9 - VMOVDQU 1248(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 - VMOVDQU 1280(CX), Y9 - VMOVDQU 1312(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 1344(CX), Y9 - VMOVDQU 1376(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 - VMOVDQU 1408(CX), Y9 - VMOVDQU 1440(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 1472(CX), Y9 - VMOVDQU 1504(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Load and process 32 bytes from input 3 to 8 outputs - VMOVDQU (R8), Y11 + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y8 ADDQ $0x20, R8 - VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU 1536(CX), Y9 - VMOVDQU 1568(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 1600(CX), Y9 - VMOVDQU 1632(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 - VMOVDQU 1664(CX), Y9 - VMOVDQU 1696(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 1728(CX), Y9 - VMOVDQU 1760(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 - VMOVDQU 1792(CX), Y9 - VMOVDQU 1824(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 1856(CX), Y9 - VMOVDQU 1888(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 - VMOVDQU 1920(CX), Y9 - VMOVDQU 1952(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 1984(CX), Y9 - VMOVDQU 2016(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Load and process 32 bytes from input 4 to 8 outputs - VMOVDQU (R9), Y11 + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y8 ADDQ $0x20, R9 - VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU 2048(CX), Y9 - VMOVDQU 2080(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 2112(CX), Y9 - VMOVDQU 2144(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 - VMOVDQU 2176(CX), Y9 - VMOVDQU 2208(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 2240(CX), Y9 - VMOVDQU 2272(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 - VMOVDQU 2304(CX), Y9 - VMOVDQU 2336(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 2368(CX), Y9 - VMOVDQU 2400(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 - VMOVDQU 2432(CX), Y9 - VMOVDQU 2464(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 2496(CX), Y9 - VMOVDQU 2528(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1280(CX), Y6 + VMOVDQU 1312(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1344(CX), Y6 + VMOVDQU 1376(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1408(CX), Y6 + VMOVDQU 1440(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1472(CX), Y6 + VMOVDQU 1504(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1536(CX), Y6 + VMOVDQU 1568(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Load and process 32 bytes from input 5 to 8 outputs - VMOVDQU (DX), Y11 + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (DX), Y8 ADDQ $0x20, DX - VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU 2560(CX), Y9 - VMOVDQU 2592(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 2624(CX), Y9 - VMOVDQU 2656(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 - VMOVDQU 2688(CX), Y9 - VMOVDQU 2720(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 2752(CX), Y9 - VMOVDQU 2784(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 - VMOVDQU 2816(CX), Y9 - VMOVDQU 2848(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 2880(CX), Y9 - VMOVDQU 2912(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 - VMOVDQU 2944(CX), Y9 - VMOVDQU 2976(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 3008(CX), Y9 - VMOVDQU 3040(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1600(CX), Y6 + VMOVDQU 1632(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1664(CX), Y6 + VMOVDQU 1696(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1728(CX), Y6 + VMOVDQU 1760(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1792(CX), Y6 + VMOVDQU 1824(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1856(CX), Y6 + VMOVDQU 1888(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Store 8 outputs - MOVQ (R10), R12 - VMOVDQU Y0, (R12)(R11*1) - MOVQ 24(R10), R12 - VMOVDQU Y1, (R12)(R11*1) - MOVQ 48(R10), R12 - VMOVDQU Y2, (R12)(R11*1) - MOVQ 72(R10), R12 - VMOVDQU Y3, (R12)(R11*1) - MOVQ 96(R10), R12 - VMOVDQU Y4, (R12)(R11*1) - MOVQ 120(R10), R12 - VMOVDQU Y5, (R12)(R11*1) - MOVQ 144(R10), R12 - VMOVDQU Y6, (R12)(R11*1) - MOVQ 168(R10), R12 - VMOVDQU Y7, (R12)(R11*1) + // Store 5 outputs + VMOVDQU Y0, (R11) + ADDQ $0x20, R11 + VMOVDQU Y1, (R12) + ADDQ $0x20, R12 + VMOVDQU Y2, (R13) + ADDQ $0x20, R13 + VMOVDQU Y3, (R14) + ADDQ $0x20, R14 + VMOVDQU Y4, (R10) + ADDQ $0x20, R10 // Prepare for next loop - ADDQ $0x20, R11 DECQ AX - JNZ mulAvxTwo_6x8_loop + JNZ mulAvxTwo_6x5_loop VZEROUPPER -mulAvxTwo_6x8_end: +mulAvxTwo_6x5_end: RET -// func mulAvxTwo_6x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_6x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_6x9(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_6x5Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 122 YMM used + // Destination kept in GP registers + // Full registers estimated 70 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_6x9_end + JZ mulAvxTwo_6x5Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI @@ -14494,437 +25601,290 @@ TEXT ·mulAvxTwo_6x9(SB), NOSPLIT, $0-88 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 - MOVQ start+72(FP), R11 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R10 + MOVQ start+72(FP), R15 - // Add start offset to input - ADDQ R11, BX - ADDQ R11, SI - ADDQ R11, DI - ADDQ R11, R8 - ADDQ R11, R9 - ADDQ R11, DX - MOVQ $0x0000000f, R12 - MOVQ R12, X9 - VPBROADCASTB X9, Y9 + // Add start offset to output + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R10 -mulAvxTwo_6x9_loop: - // Clear 9 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X5 + VPBROADCASTB X5, Y5 - // Load and process 32 bytes from input 0 to 9 outputs - VMOVDQU (BX), Y12 +mulAvxTwo_6x5Xor_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y8 ADDQ $0x20, BX - VPSRLQ $0x04, Y12, Y13 - VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VMOVDQU (CX), Y10 - VMOVDQU 32(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 - VMOVDQU 64(CX), Y10 - VMOVDQU 96(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 - VMOVDQU 128(CX), Y10 - VMOVDQU 160(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 - VMOVDQU 192(CX), Y10 - VMOVDQU 224(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 - VMOVDQU 256(CX), Y10 - VMOVDQU 288(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 - VMOVDQU 320(CX), Y10 - VMOVDQU 352(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 - VMOVDQU 384(CX), Y10 - VMOVDQU 416(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 - VMOVDQU 448(CX), Y10 - VMOVDQU 480(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 - VMOVDQU 512(CX), Y10 - VMOVDQU 544(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (R11), Y0 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU (R12), Y1 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU (R13), Y2 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU (R14), Y3 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU (R10), Y4 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Load and process 32 bytes from input 1 to 9 outputs - VMOVDQU (SI), Y12 + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y8 ADDQ $0x20, SI - VPSRLQ $0x04, Y12, Y13 - VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VMOVDQU 576(CX), Y10 - VMOVDQU 608(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 - VMOVDQU 640(CX), Y10 - VMOVDQU 672(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 - VMOVDQU 704(CX), Y10 - VMOVDQU 736(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 - VMOVDQU 768(CX), Y10 - VMOVDQU 800(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 - VMOVDQU 832(CX), Y10 - VMOVDQU 864(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 - VMOVDQU 896(CX), Y10 - VMOVDQU 928(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 - VMOVDQU 960(CX), Y10 - VMOVDQU 992(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 - VMOVDQU 1024(CX), Y10 - VMOVDQU 1056(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 - VMOVDQU 1088(CX), Y10 - VMOVDQU 1120(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Load and process 32 bytes from input 2 to 9 outputs - VMOVDQU (DI), Y12 + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y8 ADDQ $0x20, DI - VPSRLQ $0x04, Y12, Y13 - VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VMOVDQU 1152(CX), Y10 - VMOVDQU 1184(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 - VMOVDQU 1216(CX), Y10 - VMOVDQU 1248(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 - VMOVDQU 1280(CX), Y10 - VMOVDQU 1312(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 - VMOVDQU 1344(CX), Y10 - VMOVDQU 1376(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 - VMOVDQU 1408(CX), Y10 - VMOVDQU 1440(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 - VMOVDQU 1472(CX), Y10 - VMOVDQU 1504(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 - VMOVDQU 1536(CX), Y10 - VMOVDQU 1568(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 - VMOVDQU 1600(CX), Y10 - VMOVDQU 1632(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 - VMOVDQU 1664(CX), Y10 - VMOVDQU 1696(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Load and process 32 bytes from input 3 to 9 outputs - VMOVDQU (R8), Y12 + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y8 ADDQ $0x20, R8 - VPSRLQ $0x04, Y12, Y13 - VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VMOVDQU 1728(CX), Y10 - VMOVDQU 1760(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 - VMOVDQU 1792(CX), Y10 - VMOVDQU 1824(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 - VMOVDQU 1856(CX), Y10 - VMOVDQU 1888(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 - VMOVDQU 1920(CX), Y10 - VMOVDQU 1952(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 - VMOVDQU 1984(CX), Y10 - VMOVDQU 2016(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 - VMOVDQU 2048(CX), Y10 - VMOVDQU 2080(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 - VMOVDQU 2112(CX), Y10 - VMOVDQU 2144(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 - VMOVDQU 2176(CX), Y10 - VMOVDQU 2208(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 - VMOVDQU 2240(CX), Y10 - VMOVDQU 2272(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Load and process 32 bytes from input 4 to 9 outputs - VMOVDQU (R9), Y12 + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y8 ADDQ $0x20, R9 - VPSRLQ $0x04, Y12, Y13 - VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VMOVDQU 2304(CX), Y10 - VMOVDQU 2336(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 - VMOVDQU 2368(CX), Y10 - VMOVDQU 2400(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 - VMOVDQU 2432(CX), Y10 - VMOVDQU 2464(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 - VMOVDQU 2496(CX), Y10 - VMOVDQU 2528(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 - VMOVDQU 2560(CX), Y10 - VMOVDQU 2592(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 - VMOVDQU 2624(CX), Y10 - VMOVDQU 2656(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 - VMOVDQU 2688(CX), Y10 - VMOVDQU 2720(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 - VMOVDQU 2752(CX), Y10 - VMOVDQU 2784(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 - VMOVDQU 2816(CX), Y10 - VMOVDQU 2848(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1280(CX), Y6 + VMOVDQU 1312(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1344(CX), Y6 + VMOVDQU 1376(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1408(CX), Y6 + VMOVDQU 1440(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1472(CX), Y6 + VMOVDQU 1504(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1536(CX), Y6 + VMOVDQU 1568(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Load and process 32 bytes from input 5 to 9 outputs - VMOVDQU (DX), Y12 + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (DX), Y8 ADDQ $0x20, DX - VPSRLQ $0x04, Y12, Y13 - VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VMOVDQU 2880(CX), Y10 - VMOVDQU 2912(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 - VMOVDQU 2944(CX), Y10 - VMOVDQU 2976(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 - VMOVDQU 3008(CX), Y10 - VMOVDQU 3040(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 - VMOVDQU 3072(CX), Y10 - VMOVDQU 3104(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 - VMOVDQU 3136(CX), Y10 - VMOVDQU 3168(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 - VMOVDQU 3200(CX), Y10 - VMOVDQU 3232(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 - VMOVDQU 3264(CX), Y10 - VMOVDQU 3296(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 - VMOVDQU 3328(CX), Y10 - VMOVDQU 3360(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 - VMOVDQU 3392(CX), Y10 - VMOVDQU 3424(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1600(CX), Y6 + VMOVDQU 1632(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1664(CX), Y6 + VMOVDQU 1696(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1728(CX), Y6 + VMOVDQU 1760(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1792(CX), Y6 + VMOVDQU 1824(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1856(CX), Y6 + VMOVDQU 1888(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Store 9 outputs - MOVQ (R10), R12 - VMOVDQU Y0, (R12)(R11*1) - MOVQ 24(R10), R12 - VMOVDQU Y1, (R12)(R11*1) - MOVQ 48(R10), R12 - VMOVDQU Y2, (R12)(R11*1) - MOVQ 72(R10), R12 - VMOVDQU Y3, (R12)(R11*1) - MOVQ 96(R10), R12 - VMOVDQU Y4, (R12)(R11*1) - MOVQ 120(R10), R12 - VMOVDQU Y5, (R12)(R11*1) - MOVQ 144(R10), R12 - VMOVDQU Y6, (R12)(R11*1) - MOVQ 168(R10), R12 - VMOVDQU Y7, (R12)(R11*1) - MOVQ 192(R10), R12 - VMOVDQU Y8, (R12)(R11*1) + // Store 5 outputs + VMOVDQU Y0, (R11) + ADDQ $0x20, R11 + VMOVDQU Y1, (R12) + ADDQ $0x20, R12 + VMOVDQU Y2, (R13) + ADDQ $0x20, R13 + VMOVDQU Y3, (R14) + ADDQ $0x20, R14 + VMOVDQU Y4, (R10) + ADDQ $0x20, R10 // Prepare for next loop - ADDQ $0x20, R11 DECQ AX - JNZ mulAvxTwo_6x9_loop + JNZ mulAvxTwo_6x5Xor_loop VZEROUPPER -mulAvxTwo_6x9_end: +mulAvxTwo_6x5Xor_end: RET -// func mulAvxTwo_6x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_6x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_6x10(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_6x6(SB), NOSPLIT, $8-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 135 YMM used + // Destination kept in GP registers + // Full registers estimated 83 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_6x10_end + JZ mulAvxTwo_6x6_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI @@ -14933,3299 +25893,24330 @@ TEXT ·mulAvxTwo_6x10(SB), NOSPLIT, $0-88 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 - MOVQ start+72(FP), R11 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP - // Add start offset to input - ADDQ R11, BX - ADDQ R11, SI - ADDQ R11, DI - ADDQ R11, R8 - ADDQ R11, R9 - ADDQ R11, DX - MOVQ $0x0000000f, R12 - MOVQ R12, X10 - VPBROADCASTB X10, Y10 + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 -mulAvxTwo_6x10_loop: - // Clear 10 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 - VPXOR Y9, Y9, Y9 + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X6 + VPBROADCASTB X6, Y6 - // Load and process 32 bytes from input 0 to 10 outputs - VMOVDQU (BX), Y13 +mulAvxTwo_6x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y9 ADDQ $0x20, BX - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VMOVDQU (CX), Y11 - VMOVDQU 32(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQU 64(CX), Y11 - VMOVDQU 96(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 - VMOVDQU 128(CX), Y11 - VMOVDQU 160(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 - VMOVDQU 192(CX), Y11 - VMOVDQU 224(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 - VMOVDQU 256(CX), Y11 - VMOVDQU 288(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 - VMOVDQU 320(CX), Y11 - VMOVDQU 352(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 - VMOVDQU 384(CX), Y11 - VMOVDQU 416(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 - VMOVDQU 448(CX), Y11 - VMOVDQU 480(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 - VMOVDQU 512(CX), Y11 - VMOVDQU 544(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 - VMOVDQU 576(CX), Y11 - VMOVDQU 608(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y0 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y1 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y2 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y3 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y4 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y5 - // Load and process 32 bytes from input 1 to 10 outputs - VMOVDQU (SI), Y13 + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y9 ADDQ $0x20, SI - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VMOVDQU 640(CX), Y11 - VMOVDQU 672(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQU 704(CX), Y11 - VMOVDQU 736(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 - VMOVDQU 768(CX), Y11 - VMOVDQU 800(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 - VMOVDQU 832(CX), Y11 - VMOVDQU 864(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 - VMOVDQU 896(CX), Y11 - VMOVDQU 928(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 - VMOVDQU 960(CX), Y11 - VMOVDQU 992(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 - VMOVDQU 1024(CX), Y11 - VMOVDQU 1056(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 - VMOVDQU 1088(CX), Y11 - VMOVDQU 1120(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 - VMOVDQU 1152(CX), Y11 - VMOVDQU 1184(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 - VMOVDQU 1216(CX), Y11 - VMOVDQU 1248(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 - // Load and process 32 bytes from input 2 to 10 outputs - VMOVDQU (DI), Y13 + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y9 ADDQ $0x20, DI - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VMOVDQU 1280(CX), Y11 - VMOVDQU 1312(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQU 1344(CX), Y11 - VMOVDQU 1376(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 - VMOVDQU 1408(CX), Y11 - VMOVDQU 1440(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 - VMOVDQU 1472(CX), Y11 - VMOVDQU 1504(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 - VMOVDQU 1536(CX), Y11 - VMOVDQU 1568(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 - VMOVDQU 1600(CX), Y11 - VMOVDQU 1632(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 - VMOVDQU 1664(CX), Y11 - VMOVDQU 1696(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 - VMOVDQU 1728(CX), Y11 - VMOVDQU 1760(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 - VMOVDQU 1792(CX), Y11 - VMOVDQU 1824(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 - VMOVDQU 1856(CX), Y11 - VMOVDQU 1888(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 - // Load and process 32 bytes from input 3 to 10 outputs - VMOVDQU (R8), Y13 + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y9 ADDQ $0x20, R8 - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VMOVDQU 1920(CX), Y11 - VMOVDQU 1952(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQU 1984(CX), Y11 - VMOVDQU 2016(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 - VMOVDQU 2048(CX), Y11 - VMOVDQU 2080(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 - VMOVDQU 2112(CX), Y11 - VMOVDQU 2144(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 - VMOVDQU 2176(CX), Y11 - VMOVDQU 2208(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 - VMOVDQU 2240(CX), Y11 - VMOVDQU 2272(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 - VMOVDQU 2304(CX), Y11 - VMOVDQU 2336(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 - VMOVDQU 2368(CX), Y11 - VMOVDQU 2400(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 - VMOVDQU 2432(CX), Y11 - VMOVDQU 2464(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 - VMOVDQU 2496(CX), Y11 - VMOVDQU 2528(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 - // Load and process 32 bytes from input 4 to 10 outputs - VMOVDQU (R9), Y13 + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y9 ADDQ $0x20, R9 - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VMOVDQU 2560(CX), Y11 - VMOVDQU 2592(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQU 2624(CX), Y11 - VMOVDQU 2656(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 - VMOVDQU 2688(CX), Y11 - VMOVDQU 2720(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 - VMOVDQU 2752(CX), Y11 - VMOVDQU 2784(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 - VMOVDQU 2816(CX), Y11 - VMOVDQU 2848(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 - VMOVDQU 2880(CX), Y11 - VMOVDQU 2912(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 - VMOVDQU 2944(CX), Y11 - VMOVDQU 2976(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 - VMOVDQU 3008(CX), Y11 - VMOVDQU 3040(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 - VMOVDQU 3072(CX), Y11 - VMOVDQU 3104(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 - VMOVDQU 3136(CX), Y11 - VMOVDQU 3168(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1728(CX), Y7 + VMOVDQU 1760(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1792(CX), Y7 + VMOVDQU 1824(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1856(CX), Y7 + VMOVDQU 1888(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 - // Load and process 32 bytes from input 5 to 10 outputs - VMOVDQU (DX), Y13 + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (DX), Y9 ADDQ $0x20, DX - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VMOVDQU 3200(CX), Y11 - VMOVDQU 3232(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQU 3264(CX), Y11 - VMOVDQU 3296(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 - VMOVDQU 3328(CX), Y11 - VMOVDQU 3360(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 - VMOVDQU 3392(CX), Y11 - VMOVDQU 3424(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 - VMOVDQU 3456(CX), Y11 - VMOVDQU 3488(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 - VMOVDQU 3520(CX), Y11 - VMOVDQU 3552(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 - VMOVDQU 3584(CX), Y11 - VMOVDQU 3616(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 - VMOVDQU 3648(CX), Y11 - VMOVDQU 3680(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 - VMOVDQU 3712(CX), Y11 - VMOVDQU 3744(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 - VMOVDQU 3776(CX), Y11 - VMOVDQU 3808(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1920(CX), Y7 + VMOVDQU 1952(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1984(CX), Y7 + VMOVDQU 2016(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2048(CX), Y7 + VMOVDQU 2080(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2112(CX), Y7 + VMOVDQU 2144(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2176(CX), Y7 + VMOVDQU 2208(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 2240(CX), Y7 + VMOVDQU 2272(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 - // Store 10 outputs - MOVQ (R10), R12 - VMOVDQU Y0, (R12)(R11*1) - MOVQ 24(R10), R12 - VMOVDQU Y1, (R12)(R11*1) - MOVQ 48(R10), R12 - VMOVDQU Y2, (R12)(R11*1) - MOVQ 72(R10), R12 - VMOVDQU Y3, (R12)(R11*1) - MOVQ 96(R10), R12 - VMOVDQU Y4, (R12)(R11*1) - MOVQ 120(R10), R12 - VMOVDQU Y5, (R12)(R11*1) - MOVQ 144(R10), R12 - VMOVDQU Y6, (R12)(R11*1) - MOVQ 168(R10), R12 - VMOVDQU Y7, (R12)(R11*1) - MOVQ 192(R10), R12 - VMOVDQU Y8, (R12)(R11*1) - MOVQ 216(R10), R12 - VMOVDQU Y9, (R12)(R11*1) + // Store 6 outputs + VMOVDQU Y0, (R11) + ADDQ $0x20, R11 + VMOVDQU Y1, (R12) + ADDQ $0x20, R12 + VMOVDQU Y2, (R13) + ADDQ $0x20, R13 + VMOVDQU Y3, (R14) + ADDQ $0x20, R14 + VMOVDQU Y4, (R15) + ADDQ $0x20, R15 + VMOVDQU Y5, (R10) + ADDQ $0x20, R10 // Prepare for next loop - ADDQ $0x20, R11 DECQ AX - JNZ mulAvxTwo_6x10_loop + JNZ mulAvxTwo_6x6_loop VZEROUPPER -mulAvxTwo_6x10_end: +mulAvxTwo_6x6_end: RET -// func mulAvxTwo_7x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_6x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_7x1(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_6x6Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers - // Full registers estimated 18 YMM used + // Full registers estimated 83 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_7x1_end + JZ mulAvxTwo_6x6Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), DX - MOVQ out_base+48(FP), R11 - MOVQ (R11), R11 - MOVQ start+72(FP), R12 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP // Add start offset to output - ADDQ R12, R11 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 // Add start offset to input - ADDQ R12, BX - ADDQ R12, SI - ADDQ R12, DI - ADDQ R12, R8 - ADDQ R12, R9 - ADDQ R12, R10 - ADDQ R12, DX - MOVQ $0x0000000f, R12 - MOVQ R12, X1 - VPBROADCASTB X1, Y1 - -mulAvxTwo_7x1_loop: - // Clear 1 outputs - VPXOR Y0, Y0, Y0 + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X6 + VPBROADCASTB X6, Y6 - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (BX), Y4 +mulAvxTwo_6x6Xor_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y9 ADDQ $0x20, BX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU (CX), Y2 - VMOVDQU 32(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (R11), Y0 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU (R12), Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU (R13), Y2 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU (R14), Y3 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU (R15), Y4 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU (R10), Y5 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (SI), Y4 + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y9 ADDQ $0x20, SI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (DI), Y4 + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y9 ADDQ $0x20, DI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 128(CX), Y2 - VMOVDQU 160(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 - // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (R8), Y4 + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y9 ADDQ $0x20, R8 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 192(CX), Y2 - VMOVDQU 224(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 - // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (R9), Y4 + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y9 ADDQ $0x20, R9 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 256(CX), Y2 - VMOVDQU 288(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 - - // Load and process 32 bytes from input 5 to 1 outputs - VMOVDQU (R10), Y4 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 320(CX), Y2 - VMOVDQU 352(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 - - // Load and process 32 bytes from input 6 to 1 outputs - VMOVDQU (DX), Y4 - ADDQ $0x20, DX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 384(CX), Y2 - VMOVDQU 416(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1728(CX), Y7 + VMOVDQU 1760(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1792(CX), Y7 + VMOVDQU 1824(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1856(CX), Y7 + VMOVDQU 1888(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 - // Store 1 outputs + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1920(CX), Y7 + VMOVDQU 1952(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1984(CX), Y7 + VMOVDQU 2016(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2048(CX), Y7 + VMOVDQU 2080(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2112(CX), Y7 + VMOVDQU 2144(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2176(CX), Y7 + VMOVDQU 2208(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 2240(CX), Y7 + VMOVDQU 2272(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Store 6 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 + VMOVDQU Y1, (R12) + ADDQ $0x20, R12 + VMOVDQU Y2, (R13) + ADDQ $0x20, R13 + VMOVDQU Y3, (R14) + ADDQ $0x20, R14 + VMOVDQU Y4, (R15) + ADDQ $0x20, R15 + VMOVDQU Y5, (R10) + ADDQ $0x20, R10 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_7x1_loop + JNZ mulAvxTwo_6x6Xor_loop VZEROUPPER -mulAvxTwo_7x1_end: +mulAvxTwo_6x6Xor_end: RET -// func mulAvxTwo_7x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_6x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_7x1_64(SB), $0-88 +TEXT ·mulAvxTwo_6x7(SB), NOSPLIT, $8-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 18 YMM used + // Destination kept in GP registers + // Full registers estimated 96 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX - SHRQ $0x06, AX + SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_7x1_64_end + JZ mulAvxTwo_6x7_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 - MOVQ 120(AX), R9 - MOVQ 144(AX), AX - MOVQ out_base+48(FP), R10 - MOVQ out_base+48(FP), R10 - MOVQ start+72(FP), R11 - - // Add start offset to input - ADDQ R11, DX - ADDQ R11, BX - ADDQ R11, SI - ADDQ R11, DI - ADDQ R11, R8 - ADDQ R11, R9 - ADDQ R11, AX - MOVQ $0x0000000f, R12 - MOVQ R12, X2 - VPBROADCASTB X2, Y2 - MOVQ n+80(FP), R12 - SHRQ $0x06, R12 - -mulAvxTwo_7x1_64_loop: - // Clear 1 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - - // Load and process 64 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y6 - VMOVDQU 32(DX), Y5 - ADDQ $0x40, DX - VPSRLQ $0x04, Y6, Y7 - VPSRLQ $0x04, Y5, Y8 - VPAND Y2, Y6, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y7, Y7 - VPAND Y2, Y8, Y8 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y5 - VPSHUFB Y6, Y3, Y3 - VPSHUFB Y8, Y4, Y6 - VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 - - // Load and process 64 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y6 - VMOVDQU 32(BX), Y5 - ADDQ $0x40, BX - VPSRLQ $0x04, Y6, Y7 - VPSRLQ $0x04, Y5, Y8 - VPAND Y2, Y6, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y7, Y7 - VPAND Y2, Y8, Y8 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y5 - VPSHUFB Y6, Y3, Y3 - VPSHUFB Y8, Y4, Y6 - VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 - - // Load and process 64 bytes from input 2 to 1 outputs - VMOVDQU (SI), Y6 - VMOVDQU 32(SI), Y5 - ADDQ $0x40, SI - VPSRLQ $0x04, Y6, Y7 - VPSRLQ $0x04, Y5, Y8 - VPAND Y2, Y6, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y7, Y7 - VPAND Y2, Y8, Y8 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y5 - VPSHUFB Y6, Y3, Y3 - VPSHUFB Y8, Y4, Y6 - VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 - - // Load and process 64 bytes from input 3 to 1 outputs - VMOVDQU (DI), Y6 - VMOVDQU 32(DI), Y5 - ADDQ $0x40, DI - VPSRLQ $0x04, Y6, Y7 - VPSRLQ $0x04, Y5, Y8 - VPAND Y2, Y6, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y7, Y7 - VPAND Y2, Y8, Y8 - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y5 - VPSHUFB Y6, Y3, Y3 - VPSHUFB Y8, Y4, Y6 - VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 - - // Load and process 64 bytes from input 4 to 1 outputs - VMOVDQU (R8), Y6 - VMOVDQU 32(R8), Y5 - ADDQ $0x40, R8 - VPSRLQ $0x04, Y6, Y7 - VPSRLQ $0x04, Y5, Y8 - VPAND Y2, Y6, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y7, Y7 - VPAND Y2, Y8, Y8 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y5 - VPSHUFB Y6, Y3, Y3 - VPSHUFB Y8, Y4, Y6 - VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 - - // Load and process 64 bytes from input 5 to 1 outputs - VMOVDQU (R9), Y6 - VMOVDQU 32(R9), Y5 - ADDQ $0x40, R9 - VPSRLQ $0x04, Y6, Y7 - VPSRLQ $0x04, Y5, Y8 - VPAND Y2, Y6, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y7, Y7 - VPAND Y2, Y8, Y8 - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y5 - VPSHUFB Y6, Y3, Y3 - VPSHUFB Y8, Y4, Y6 - VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 - - // Load and process 64 bytes from input 6 to 1 outputs - VMOVDQU (AX), Y6 - VMOVDQU 32(AX), Y5 - ADDQ $0x40, AX - VPSRLQ $0x04, Y6, Y7 - VPSRLQ $0x04, Y5, Y8 - VPAND Y2, Y6, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y7, Y7 - VPAND Y2, Y8, Y8 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y5 - VPSHUFB Y6, Y3, Y3 - VPSHUFB Y8, Y4, Y6 - VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 - - // Store 1 outputs - MOVQ (R10), R13 - VMOVDQU Y0, (R13)(R11*1) - VMOVDQU Y1, 32(R13)(R11*1) - - // Prepare for next loop - ADDQ $0x40, R11 - DECQ R12 - JNZ mulAvxTwo_7x1_64_loop - VZEROUPPER - -mulAvxTwo_7x1_64_end: - RET - -// func mulAvxTwo_7x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_7x2(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 35 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_7x2_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), DX - MOVQ out_base+48(FP), R11 - MOVQ (R11), R12 - MOVQ 24(R11), R11 - MOVQ start+72(FP), R13 + MOVQ 120(AX), AX + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP // Add start offset to output - ADDQ R13, R12 - ADDQ R13, R11 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 // Add start offset to input - ADDQ R13, BX - ADDQ R13, SI - ADDQ R13, DI - ADDQ R13, R8 - ADDQ R13, R9 - ADDQ R13, R10 - ADDQ R13, DX - MOVQ $0x0000000f, R13 - MOVQ R13, X2 - VPBROADCASTB X2, Y2 + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X7 + VPBROADCASTB X7, Y7 + MOVQ n+80(FP), BP + SHRQ $0x05, BP -mulAvxTwo_7x2_loop: - // Clear 2 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 +mulAvxTwo_6x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y0 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y1 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y2 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y3 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y4 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y5 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y6 - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (BX), Y10 ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (SI), Y10 ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 - - // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 - - // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 - - // Load and process 32 bytes from input 4 to 2 outputs - VMOVDQU (R9), Y5 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Load and process 32 bytes from input 5 to 2 outputs - VMOVDQU (R10), Y5 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 640(CX), Y3 - VMOVDQU 672(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 - VMOVDQU 704(CX), Y3 - VMOVDQU 736(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1472(CX), Y8 + VMOVDQU 1504(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1728(CX), Y8 + VMOVDQU 1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Load and process 32 bytes from input 6 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 768(CX), Y3 - VMOVDQU 800(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 - VMOVDQU 832(CX), Y3 - VMOVDQU 864(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1792(CX), Y8 + VMOVDQU 1824(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1856(CX), Y8 + VMOVDQU 1888(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1920(CX), Y8 + VMOVDQU 1952(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1984(CX), Y8 + VMOVDQU 2016(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2048(CX), Y8 + VMOVDQU 2080(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2112(CX), Y8 + VMOVDQU 2144(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2176(CX), Y8 + VMOVDQU 2208(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Store 2 outputs - VMOVDQU Y0, (R12) - ADDQ $0x20, R12 + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (AX), Y10 + ADDQ $0x20, AX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2240(CX), Y8 + VMOVDQU 2272(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 2304(CX), Y8 + VMOVDQU 2336(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 2368(CX), Y8 + VMOVDQU 2400(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 2432(CX), Y8 + VMOVDQU 2464(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2496(CX), Y8 + VMOVDQU 2528(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2560(CX), Y8 + VMOVDQU 2592(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2624(CX), Y8 + VMOVDQU 2656(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Store 7 outputs + VMOVDQU Y0, (R10) + ADDQ $0x20, R10 VMOVDQU Y1, (R11) ADDQ $0x20, R11 + VMOVDQU Y2, (R12) + ADDQ $0x20, R12 + VMOVDQU Y3, (R13) + ADDQ $0x20, R13 + VMOVDQU Y4, (R14) + ADDQ $0x20, R14 + VMOVDQU Y5, (R15) + ADDQ $0x20, R15 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_7x2_loop + DECQ BP + JNZ mulAvxTwo_6x7_loop VZEROUPPER -mulAvxTwo_7x2_end: +mulAvxTwo_6x7_end: RET -// func mulAvxTwo_7x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_6x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_7x2_64(SB), $0-88 +TEXT ·mulAvxTwo_6x7Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 35 YMM used + // Destination kept in GP registers + // Full registers estimated 96 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX - SHRQ $0x06, AX + SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_7x2_64_end + JZ mulAvxTwo_6x7Xor_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 - MOVQ 120(AX), R9 - MOVQ 144(AX), AX - MOVQ out_base+48(FP), R10 - MOVQ out_base+48(FP), R10 - MOVQ start+72(FP), R11 - - // Add start offset to input - ADDQ R11, DX - ADDQ R11, BX - ADDQ R11, SI - ADDQ R11, DI - ADDQ R11, R8 - ADDQ R11, R9 - ADDQ R11, AX - MOVQ $0x0000000f, R12 - MOVQ R12, X4 - VPBROADCASTB X4, Y4 - MOVQ n+80(FP), R12 - SHRQ $0x06, R12 + MOVQ 120(AX), AX + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP -mulAvxTwo_7x2_64_loop: - // Clear 2 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 - // Load and process 64 bytes from input 0 to 2 outputs - VMOVDQU (DX), Y9 - VMOVDQU 32(DX), Y11 - ADDQ $0x40, DX - VPSRLQ $0x04, Y9, Y10 - VPSRLQ $0x04, Y11, Y12 - VPAND Y4, Y9, Y9 - VPAND Y4, Y11, Y11 - VPAND Y4, Y10, Y10 - VPAND Y4, Y12, Y12 - VMOVDQU (CX), Y5 - VMOVDQU 32(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 - VMOVDQU 64(CX), Y5 - VMOVDQU 96(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X7 + VPBROADCASTB X7, Y7 + MOVQ n+80(FP), BP + SHRQ $0x05, BP - // Load and process 64 bytes from input 1 to 2 outputs - VMOVDQU (BX), Y9 - VMOVDQU 32(BX), Y11 - ADDQ $0x40, BX - VPSRLQ $0x04, Y9, Y10 - VPSRLQ $0x04, Y11, Y12 - VPAND Y4, Y9, Y9 - VPAND Y4, Y11, Y11 - VPAND Y4, Y10, Y10 - VPAND Y4, Y12, Y12 - VMOVDQU 128(CX), Y5 - VMOVDQU 160(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 - VMOVDQU 192(CX), Y5 - VMOVDQU 224(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +mulAvxTwo_6x7Xor_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (R10), Y0 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU (R11), Y1 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU (R12), Y2 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU (R13), Y3 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU (R14), Y4 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU (R15), Y5 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU (R9), Y6 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Load and process 64 bytes from input 2 to 2 outputs - VMOVDQU (SI), Y9 - VMOVDQU 32(SI), Y11 - ADDQ $0x40, SI - VPSRLQ $0x04, Y9, Y10 - VPSRLQ $0x04, Y11, Y12 - VPAND Y4, Y9, Y9 - VPAND Y4, Y11, Y11 - VPAND Y4, Y10, Y10 - VPAND Y4, Y12, Y12 - VMOVDQU 256(CX), Y5 - VMOVDQU 288(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 - VMOVDQU 320(CX), Y5 - VMOVDQU 352(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Load and process 64 bytes from input 3 to 2 outputs - VMOVDQU (DI), Y9 - VMOVDQU 32(DI), Y11 - ADDQ $0x40, DI - VPSRLQ $0x04, Y9, Y10 - VPSRLQ $0x04, Y11, Y12 - VPAND Y4, Y9, Y9 - VPAND Y4, Y11, Y11 - VPAND Y4, Y10, Y10 - VPAND Y4, Y12, Y12 - VMOVDQU 384(CX), Y5 - VMOVDQU 416(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 - VMOVDQU 448(CX), Y5 - VMOVDQU 480(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Load and process 64 bytes from input 4 to 2 outputs - VMOVDQU (R8), Y9 - VMOVDQU 32(R8), Y11 - ADDQ $0x40, R8 - VPSRLQ $0x04, Y9, Y10 - VPSRLQ $0x04, Y11, Y12 - VPAND Y4, Y9, Y9 - VPAND Y4, Y11, Y11 - VPAND Y4, Y10, Y10 - VPAND Y4, Y12, Y12 - VMOVDQU 512(CX), Y5 - VMOVDQU 544(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 - VMOVDQU 576(CX), Y5 - VMOVDQU 608(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1472(CX), Y8 + VMOVDQU 1504(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1728(CX), Y8 + VMOVDQU 1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Load and process 64 bytes from input 5 to 2 outputs - VMOVDQU (R9), Y9 - VMOVDQU 32(R9), Y11 - ADDQ $0x40, R9 - VPSRLQ $0x04, Y9, Y10 - VPSRLQ $0x04, Y11, Y12 - VPAND Y4, Y9, Y9 - VPAND Y4, Y11, Y11 - VPAND Y4, Y10, Y10 - VPAND Y4, Y12, Y12 - VMOVDQU 640(CX), Y5 - VMOVDQU 672(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 - VMOVDQU 704(CX), Y5 - VMOVDQU 736(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1792(CX), Y8 + VMOVDQU 1824(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1856(CX), Y8 + VMOVDQU 1888(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1920(CX), Y8 + VMOVDQU 1952(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1984(CX), Y8 + VMOVDQU 2016(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2048(CX), Y8 + VMOVDQU 2080(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2112(CX), Y8 + VMOVDQU 2144(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2176(CX), Y8 + VMOVDQU 2208(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Load and process 64 bytes from input 6 to 2 outputs - VMOVDQU (AX), Y9 - VMOVDQU 32(AX), Y11 - ADDQ $0x40, AX - VPSRLQ $0x04, Y9, Y10 - VPSRLQ $0x04, Y11, Y12 - VPAND Y4, Y9, Y9 - VPAND Y4, Y11, Y11 - VPAND Y4, Y10, Y10 - VPAND Y4, Y12, Y12 - VMOVDQU 768(CX), Y5 - VMOVDQU 800(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 - VMOVDQU 832(CX), Y5 - VMOVDQU 864(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (AX), Y10 + ADDQ $0x20, AX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2240(CX), Y8 + VMOVDQU 2272(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 2304(CX), Y8 + VMOVDQU 2336(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 2368(CX), Y8 + VMOVDQU 2400(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 2432(CX), Y8 + VMOVDQU 2464(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2496(CX), Y8 + VMOVDQU 2528(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2560(CX), Y8 + VMOVDQU 2592(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2624(CX), Y8 + VMOVDQU 2656(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Store 2 outputs - MOVQ (R10), R13 - VMOVDQU Y0, (R13)(R11*1) - VMOVDQU Y1, 32(R13)(R11*1) - MOVQ 24(R10), R13 - VMOVDQU Y2, (R13)(R11*1) - VMOVDQU Y3, 32(R13)(R11*1) + // Store 7 outputs + VMOVDQU Y0, (R10) + ADDQ $0x20, R10 + VMOVDQU Y1, (R11) + ADDQ $0x20, R11 + VMOVDQU Y2, (R12) + ADDQ $0x20, R12 + VMOVDQU Y3, (R13) + ADDQ $0x20, R13 + VMOVDQU Y4, (R14) + ADDQ $0x20, R14 + VMOVDQU Y5, (R15) + ADDQ $0x20, R15 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 // Prepare for next loop - ADDQ $0x40, R11 - DECQ R12 - JNZ mulAvxTwo_7x2_64_loop + DECQ BP + JNZ mulAvxTwo_6x7Xor_loop VZEROUPPER -mulAvxTwo_7x2_64_end: +mulAvxTwo_6x7Xor_end: RET -// func mulAvxTwo_7x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_6x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_7x3(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_6x8(SB), NOSPLIT, $0-88 // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 50 YMM used + // Destination kept on stack + // Full registers estimated 109 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_7x3_end + JZ mulAvxTwo_6x8_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), DX - MOVQ out_base+48(FP), R11 - MOVQ (R11), R12 - MOVQ 24(R11), R13 - MOVQ 48(R11), R11 - MOVQ start+72(FP), R14 - - // Add start offset to output - ADDQ R14, R12 - ADDQ R14, R13 - ADDQ R14, R11 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 // Add start offset to input - ADDQ R14, BX - ADDQ R14, SI - ADDQ R14, DI - ADDQ R14, R8 - ADDQ R14, R9 - ADDQ R14, R10 - ADDQ R14, DX - MOVQ $0x0000000f, R14 - MOVQ R14, X3 - VPBROADCASTB X3, Y3 - -mulAvxTwo_7x3_loop: - // Clear 3 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X8 + VPBROADCASTB X8, Y8 - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 +mulAvxTwo_6x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y11 ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y0 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y1 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y2 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y3 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y4 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y5 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y6 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y7 - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y11 ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 - // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 - // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (R8), Y6 + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y11 ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1600(CX), Y9 + VMOVDQU 1632(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1664(CX), Y9 + VMOVDQU 1696(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 - // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (R9), Y6 + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y11 ADDQ $0x20, R9 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 - - // Load and process 32 bytes from input 5 to 3 outputs - VMOVDQU (R10), Y6 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 960(CX), Y4 - VMOVDQU 992(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 - VMOVDQU 1024(CX), Y4 - VMOVDQU 1056(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 - VMOVDQU 1088(CX), Y4 - VMOVDQU 1120(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2048(CX), Y9 + VMOVDQU 2080(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2112(CX), Y9 + VMOVDQU 2144(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2176(CX), Y9 + VMOVDQU 2208(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2240(CX), Y9 + VMOVDQU 2272(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2304(CX), Y9 + VMOVDQU 2336(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2368(CX), Y9 + VMOVDQU 2400(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2432(CX), Y9 + VMOVDQU 2464(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 2496(CX), Y9 + VMOVDQU 2528(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 - // Load and process 32 bytes from input 6 to 3 outputs - VMOVDQU (DX), Y6 + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (DX), Y11 ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1152(CX), Y4 - VMOVDQU 1184(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 - VMOVDQU 1216(CX), Y4 - VMOVDQU 1248(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 - VMOVDQU 1280(CX), Y4 - VMOVDQU 1312(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2560(CX), Y9 + VMOVDQU 2592(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2624(CX), Y9 + VMOVDQU 2656(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2688(CX), Y9 + VMOVDQU 2720(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2752(CX), Y9 + VMOVDQU 2784(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2816(CX), Y9 + VMOVDQU 2848(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2880(CX), Y9 + VMOVDQU 2912(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2944(CX), Y9 + VMOVDQU 2976(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 3008(CX), Y9 + VMOVDQU 3040(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 - // Store 3 outputs - VMOVDQU Y0, (R12) - ADDQ $0x20, R12 - VMOVDQU Y1, (R13) - ADDQ $0x20, R13 - VMOVDQU Y2, (R11) - ADDQ $0x20, R11 + // Store 8 outputs + MOVQ (R10), R12 + VMOVDQU Y0, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y1, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y2, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y3, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y4, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y7, (R12)(R11*1) // Prepare for next loop + ADDQ $0x20, R11 DECQ AX - JNZ mulAvxTwo_7x3_loop + JNZ mulAvxTwo_6x8_loop VZEROUPPER -mulAvxTwo_7x3_end: +mulAvxTwo_6x8_end: RET -// func mulAvxTwo_7x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_6x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_7x3_64(SB), $0-88 +TEXT ·mulAvxTwo_6x8Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack - // Full registers estimated 50 YMM used + // Full registers estimated 109 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX - SHRQ $0x06, AX + SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_7x3_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), R9 - MOVQ 144(AX), AX - MOVQ out_base+48(FP), R10 + JZ mulAvxTwo_6x8Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ start+72(FP), R11 // Add start offset to input - ADDQ R11, DX ADDQ R11, BX ADDQ R11, SI ADDQ R11, DI ADDQ R11, R8 ADDQ R11, R9 - ADDQ R11, AX + ADDQ R11, DX MOVQ $0x0000000f, R12 - MOVQ R12, X6 - VPBROADCASTB X6, Y6 - MOVQ n+80(FP), R12 - SHRQ $0x06, R12 - -mulAvxTwo_7x3_64_loop: - // Clear 3 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 + MOVQ R12, X8 + VPBROADCASTB X8, Y8 - // Load and process 64 bytes from input 0 to 3 outputs - VMOVDQU (DX), Y11 - VMOVDQU 32(DX), Y13 - ADDQ $0x40, DX +mulAvxTwo_6x8Xor_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX VPSRLQ $0x04, Y11, Y12 - VPSRLQ $0x04, Y13, Y14 - VPAND Y6, Y11, Y11 - VPAND Y6, Y13, Y13 - VPAND Y6, Y12, Y12 - VPAND Y6, Y14, Y14 - VMOVDQU (CX), Y7 - VMOVDQU 32(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + MOVQ (R10), R12 + VMOVDQU (R12)(R11*1), Y0 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + MOVQ 24(R10), R12 + VMOVDQU (R12)(R11*1), Y1 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 - VMOVDQU 64(CX), Y7 - VMOVDQU 96(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 + MOVQ 48(R10), R12 + VMOVDQU (R12)(R11*1), Y2 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + MOVQ 72(R10), R12 + VMOVDQU (R12)(R11*1), Y3 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 - VMOVDQU 128(CX), Y7 - VMOVDQU 160(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 + MOVQ 96(R10), R12 + VMOVDQU (R12)(R11*1), Y4 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + MOVQ 120(R10), R12 + VMOVDQU (R12)(R11*1), Y5 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 + MOVQ 144(R10), R12 + VMOVDQU (R12)(R11*1), Y6 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + MOVQ 168(R10), R12 + VMOVDQU (R12)(R11*1), Y7 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 - // Load and process 64 bytes from input 1 to 3 outputs - VMOVDQU (BX), Y11 - VMOVDQU 32(BX), Y13 - ADDQ $0x40, BX + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI VPSRLQ $0x04, Y11, Y12 - VPSRLQ $0x04, Y13, Y14 - VPAND Y6, Y11, Y11 - VPAND Y6, Y13, Y13 - VPAND Y6, Y12, Y12 - VPAND Y6, Y14, Y14 - VMOVDQU 192(CX), Y7 - VMOVDQU 224(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 - VMOVDQU 256(CX), Y7 - VMOVDQU 288(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 - VMOVDQU 320(CX), Y7 - VMOVDQU 352(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 - - // Load and process 64 bytes from input 2 to 3 outputs - VMOVDQU (SI), Y11 - VMOVDQU 32(SI), Y13 - ADDQ $0x40, SI - VPSRLQ $0x04, Y11, Y12 - VPSRLQ $0x04, Y13, Y14 - VPAND Y6, Y11, Y11 - VPAND Y6, Y13, Y13 - VPAND Y6, Y12, Y12 - VPAND Y6, Y14, Y14 - VMOVDQU 384(CX), Y7 - VMOVDQU 416(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 + VPXOR Y9, Y4, Y4 + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 - VMOVDQU 448(CX), Y7 - VMOVDQU 480(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 + VPXOR Y9, Y5, Y5 + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 - VMOVDQU 512(CX), Y7 - VMOVDQU 544(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 + VPXOR Y9, Y6, Y6 + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 + VPXOR Y9, Y7, Y7 - // Load and process 64 bytes from input 3 to 3 outputs + // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 - VMOVDQU 32(DI), Y13 - ADDQ $0x40, DI + ADDQ $0x20, DI VPSRLQ $0x04, Y11, Y12 - VPSRLQ $0x04, Y13, Y14 - VPAND Y6, Y11, Y11 - VPAND Y6, Y13, Y13 - VPAND Y6, Y12, Y12 - VPAND Y6, Y14, Y14 - VMOVDQU 576(CX), Y7 - VMOVDQU 608(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 - VMOVDQU 640(CX), Y7 - VMOVDQU 672(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 - VMOVDQU 704(CX), Y7 - VMOVDQU 736(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 - // Load and process 64 bytes from input 4 to 3 outputs + // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 - VMOVDQU 32(R8), Y13 - ADDQ $0x40, R8 + ADDQ $0x20, R8 VPSRLQ $0x04, Y11, Y12 - VPSRLQ $0x04, Y13, Y14 - VPAND Y6, Y11, Y11 - VPAND Y6, Y13, Y13 - VPAND Y6, Y12, Y12 - VPAND Y6, Y14, Y14 - VMOVDQU 768(CX), Y7 - VMOVDQU 800(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1600(CX), Y9 + VMOVDQU 1632(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 - VMOVDQU 832(CX), Y7 - VMOVDQU 864(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 + VMOVDQU 1664(CX), Y9 + VMOVDQU 1696(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 - VMOVDQU 896(CX), Y7 - VMOVDQU 928(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 - // Load and process 64 bytes from input 5 to 3 outputs + // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 - VMOVDQU 32(R9), Y13 - ADDQ $0x40, R9 + ADDQ $0x20, R9 VPSRLQ $0x04, Y11, Y12 - VPSRLQ $0x04, Y13, Y14 - VPAND Y6, Y11, Y11 - VPAND Y6, Y13, Y13 - VPAND Y6, Y12, Y12 - VPAND Y6, Y14, Y14 - VMOVDQU 960(CX), Y7 - VMOVDQU 992(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2048(CX), Y9 + VMOVDQU 2080(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2112(CX), Y9 + VMOVDQU 2144(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 - VMOVDQU 1024(CX), Y7 - VMOVDQU 1056(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 + VMOVDQU 2176(CX), Y9 + VMOVDQU 2208(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2240(CX), Y9 + VMOVDQU 2272(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 - VMOVDQU 1088(CX), Y7 - VMOVDQU 1120(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 + VMOVDQU 2304(CX), Y9 + VMOVDQU 2336(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2368(CX), Y9 + VMOVDQU 2400(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 + VMOVDQU 2432(CX), Y9 + VMOVDQU 2464(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 2496(CX), Y9 + VMOVDQU 2528(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 - // Load and process 64 bytes from input 6 to 3 outputs - VMOVDQU (AX), Y11 - VMOVDQU 32(AX), Y13 - ADDQ $0x40, AX + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX VPSRLQ $0x04, Y11, Y12 - VPSRLQ $0x04, Y13, Y14 - VPAND Y6, Y11, Y11 - VPAND Y6, Y13, Y13 - VPAND Y6, Y12, Y12 - VPAND Y6, Y14, Y14 - VMOVDQU 1152(CX), Y7 - VMOVDQU 1184(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2560(CX), Y9 + VMOVDQU 2592(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2624(CX), Y9 + VMOVDQU 2656(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 - VMOVDQU 1216(CX), Y7 - VMOVDQU 1248(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 + VMOVDQU 2688(CX), Y9 + VMOVDQU 2720(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2752(CX), Y9 + VMOVDQU 2784(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 - VMOVDQU 1280(CX), Y7 - VMOVDQU 1312(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 + VMOVDQU 2816(CX), Y9 + VMOVDQU 2848(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2880(CX), Y9 + VMOVDQU 2912(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 + VMOVDQU 2944(CX), Y9 + VMOVDQU 2976(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 3008(CX), Y9 + VMOVDQU 3040(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 - // Store 3 outputs - MOVQ (R10), R13 - VMOVDQU Y0, (R13)(R11*1) - VMOVDQU Y1, 32(R13)(R11*1) - MOVQ 24(R10), R13 - VMOVDQU Y2, (R13)(R11*1) - VMOVDQU Y3, 32(R13)(R11*1) - MOVQ 48(R10), R13 - VMOVDQU Y4, (R13)(R11*1) - VMOVDQU Y5, 32(R13)(R11*1) - - // Prepare for next loop - ADDQ $0x40, R11 - DECQ R12 - JNZ mulAvxTwo_7x3_64_loop + // Store 8 outputs + MOVQ (R10), R12 + VMOVDQU Y0, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y1, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y2, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y3, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y4, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxTwo_6x8Xor_loop VZEROUPPER -mulAvxTwo_7x3_64_end: +mulAvxTwo_6x8Xor_end: RET -// func mulAvxTwo_7x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_6x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_7x4(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_6x9(SB), NOSPLIT, $0-88 // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 65 YMM used + // Destination kept on stack + // Full registers estimated 122 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_7x4_end + JZ mulAvxTwo_6x9_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), DX - MOVQ out_base+48(FP), R11 - MOVQ (R11), R12 - MOVQ 24(R11), R13 - MOVQ 48(R11), R14 - MOVQ 72(R11), R11 - MOVQ start+72(FP), R15 - - // Add start offset to output - ADDQ R15, R12 - ADDQ R15, R13 - ADDQ R15, R14 - ADDQ R15, R11 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 // Add start offset to input - ADDQ R15, BX - ADDQ R15, SI - ADDQ R15, DI - ADDQ R15, R8 - ADDQ R15, R9 - ADDQ R15, R10 - ADDQ R15, DX - MOVQ $0x0000000f, R15 - MOVQ R15, X4 - VPBROADCASTB X4, Y4 - -mulAvxTwo_7x4_loop: - // Clear 4 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X9 + VPBROADCASTB X9, Y9 - // Load and process 32 bytes from input 0 to 4 outputs - VMOVDQU (BX), Y7 +mulAvxTwo_6x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y12 ADDQ $0x20, BX - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU (CX), Y5 - VMOVDQU 32(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 - VMOVDQU 64(CX), Y5 - VMOVDQU 96(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 128(CX), Y5 - VMOVDQU 160(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 - VMOVDQU 192(CX), Y5 - VMOVDQU 224(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 - - // Load and process 32 bytes from input 1 to 4 outputs - VMOVDQU (SI), Y7 - ADDQ $0x20, SI - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU 256(CX), Y5 - VMOVDQU 288(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 - VMOVDQU 320(CX), Y5 - VMOVDQU 352(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 384(CX), Y5 - VMOVDQU 416(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 - VMOVDQU 448(CX), Y5 - VMOVDQU 480(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y0 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y1 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y2 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y3 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y4 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y5 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y6 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y7 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y8 - // Load and process 32 bytes from input 2 to 4 outputs - VMOVDQU (DI), Y7 + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y12 ADDQ $0x20, DI - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU 512(CX), Y5 - VMOVDQU 544(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 - VMOVDQU 576(CX), Y5 - VMOVDQU 608(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 640(CX), Y5 - VMOVDQU 672(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 - VMOVDQU 704(CX), Y5 - VMOVDQU 736(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1152(CX), Y10 + VMOVDQU 1184(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 1216(CX), Y10 + VMOVDQU 1248(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 1280(CX), Y10 + VMOVDQU 1312(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 1344(CX), Y10 + VMOVDQU 1376(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 1408(CX), Y10 + VMOVDQU 1440(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 1472(CX), Y10 + VMOVDQU 1504(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 1536(CX), Y10 + VMOVDQU 1568(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1600(CX), Y10 + VMOVDQU 1632(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1664(CX), Y10 + VMOVDQU 1696(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 - // Load and process 32 bytes from input 3 to 4 outputs - VMOVDQU (R8), Y7 + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y12 ADDQ $0x20, R8 - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU 768(CX), Y5 - VMOVDQU 800(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 - VMOVDQU 832(CX), Y5 - VMOVDQU 864(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 896(CX), Y5 - VMOVDQU 928(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 - VMOVDQU 960(CX), Y5 - VMOVDQU 992(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1728(CX), Y10 + VMOVDQU 1760(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 1792(CX), Y10 + VMOVDQU 1824(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 1856(CX), Y10 + VMOVDQU 1888(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 1920(CX), Y10 + VMOVDQU 1952(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 1984(CX), Y10 + VMOVDQU 2016(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 2048(CX), Y10 + VMOVDQU 2080(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 2112(CX), Y10 + VMOVDQU 2144(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 2176(CX), Y10 + VMOVDQU 2208(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 2240(CX), Y10 + VMOVDQU 2272(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 - // Load and process 32 bytes from input 4 to 4 outputs - VMOVDQU (R9), Y7 + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y12 ADDQ $0x20, R9 - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU 1024(CX), Y5 - VMOVDQU 1056(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 - VMOVDQU 1088(CX), Y5 - VMOVDQU 1120(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 1152(CX), Y5 - VMOVDQU 1184(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 - VMOVDQU 1216(CX), Y5 - VMOVDQU 1248(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 - - // Load and process 32 bytes from input 5 to 4 outputs - VMOVDQU (R10), Y7 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU 1280(CX), Y5 - VMOVDQU 1312(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 - VMOVDQU 1344(CX), Y5 - VMOVDQU 1376(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 1408(CX), Y5 - VMOVDQU 1440(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 - VMOVDQU 1472(CX), Y5 - VMOVDQU 1504(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2304(CX), Y10 + VMOVDQU 2336(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 2368(CX), Y10 + VMOVDQU 2400(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 2432(CX), Y10 + VMOVDQU 2464(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 2496(CX), Y10 + VMOVDQU 2528(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 2560(CX), Y10 + VMOVDQU 2592(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 2624(CX), Y10 + VMOVDQU 2656(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 2688(CX), Y10 + VMOVDQU 2720(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 2752(CX), Y10 + VMOVDQU 2784(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 2816(CX), Y10 + VMOVDQU 2848(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 - // Load and process 32 bytes from input 6 to 4 outputs - VMOVDQU (DX), Y7 + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (DX), Y12 ADDQ $0x20, DX - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU 1536(CX), Y5 - VMOVDQU 1568(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 - VMOVDQU 1600(CX), Y5 - VMOVDQU 1632(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 1664(CX), Y5 - VMOVDQU 1696(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 - VMOVDQU 1728(CX), Y5 - VMOVDQU 1760(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 - - // Store 4 outputs - VMOVDQU Y0, (R12) - ADDQ $0x20, R12 - VMOVDQU Y1, (R13) - ADDQ $0x20, R13 - VMOVDQU Y2, (R14) - ADDQ $0x20, R14 - VMOVDQU Y3, (R11) - ADDQ $0x20, R11 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2880(CX), Y10 + VMOVDQU 2912(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 2944(CX), Y10 + VMOVDQU 2976(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 3008(CX), Y10 + VMOVDQU 3040(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 3072(CX), Y10 + VMOVDQU 3104(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 3136(CX), Y10 + VMOVDQU 3168(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 3200(CX), Y10 + VMOVDQU 3232(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 3264(CX), Y10 + VMOVDQU 3296(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 3328(CX), Y10 + VMOVDQU 3360(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 3392(CX), Y10 + VMOVDQU 3424(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 - // Prepare for next loop + // Store 9 outputs + MOVQ (R10), R12 + VMOVDQU Y0, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y1, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y2, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y3, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y4, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 DECQ AX - JNZ mulAvxTwo_7x4_loop + JNZ mulAvxTwo_6x9_loop VZEROUPPER -mulAvxTwo_7x4_end: +mulAvxTwo_6x9_end: RET -// func mulAvxTwo_7x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_6x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_7x5(SB), NOSPLIT, $8-88 +TEXT ·mulAvxTwo_6x9Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 80 YMM used + // Destination kept on stack + // Full registers estimated 122 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_7x5_end + JZ mulAvxTwo_6x9Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), DX - MOVQ out_base+48(FP), R11 - MOVQ (R11), R12 - MOVQ 24(R11), R13 - MOVQ 48(R11), R14 - MOVQ 72(R11), R15 - MOVQ 96(R11), R11 - MOVQ start+72(FP), BP - - // Add start offset to output - ADDQ BP, R12 - ADDQ BP, R13 - ADDQ BP, R14 - ADDQ BP, R15 - ADDQ BP, R11 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 // Add start offset to input - ADDQ BP, BX - ADDQ BP, SI - ADDQ BP, DI - ADDQ BP, R8 - ADDQ BP, R9 - ADDQ BP, R10 - ADDQ BP, DX - MOVQ $0x0000000f, BP - MOVQ BP, X5 - VPBROADCASTB X5, Y5 - -mulAvxTwo_7x5_loop: - // Clear 5 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X9 + VPBROADCASTB X9, Y9 - // Load and process 32 bytes from input 0 to 5 outputs - VMOVDQU (BX), Y8 +mulAvxTwo_6x9Xor_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y12 ADDQ $0x20, BX - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU (CX), Y6 - VMOVDQU 32(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 64(CX), Y6 - VMOVDQU 96(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 128(CX), Y6 - VMOVDQU 160(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 192(CX), Y6 - VMOVDQU 224(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 256(CX), Y6 - VMOVDQU 288(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + MOVQ (R10), R12 + VMOVDQU (R12)(R11*1), Y0 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + MOVQ 24(R10), R12 + VMOVDQU (R12)(R11*1), Y1 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + MOVQ 48(R10), R12 + VMOVDQU (R12)(R11*1), Y2 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + MOVQ 72(R10), R12 + VMOVDQU (R12)(R11*1), Y3 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + MOVQ 96(R10), R12 + VMOVDQU (R12)(R11*1), Y4 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + MOVQ 120(R10), R12 + VMOVDQU (R12)(R11*1), Y5 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + MOVQ 144(R10), R12 + VMOVDQU (R12)(R11*1), Y6 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + MOVQ 168(R10), R12 + VMOVDQU (R12)(R11*1), Y7 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + MOVQ 192(R10), R12 + VMOVDQU (R12)(R11*1), Y8 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 - // Load and process 32 bytes from input 1 to 5 outputs - VMOVDQU (SI), Y8 + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y12 ADDQ $0x20, SI - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU 320(CX), Y6 - VMOVDQU 352(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 384(CX), Y6 - VMOVDQU 416(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 448(CX), Y6 - VMOVDQU 480(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 512(CX), Y6 - VMOVDQU 544(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 576(CX), Y6 - VMOVDQU 608(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 - // Load and process 32 bytes from input 2 to 5 outputs - VMOVDQU (DI), Y8 + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y12 ADDQ $0x20, DI - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU 640(CX), Y6 - VMOVDQU 672(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 704(CX), Y6 - VMOVDQU 736(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 768(CX), Y6 - VMOVDQU 800(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 832(CX), Y6 - VMOVDQU 864(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 896(CX), Y6 - VMOVDQU 928(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1152(CX), Y10 + VMOVDQU 1184(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 1216(CX), Y10 + VMOVDQU 1248(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 1280(CX), Y10 + VMOVDQU 1312(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 1344(CX), Y10 + VMOVDQU 1376(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 1408(CX), Y10 + VMOVDQU 1440(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 1472(CX), Y10 + VMOVDQU 1504(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 1536(CX), Y10 + VMOVDQU 1568(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1600(CX), Y10 + VMOVDQU 1632(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1664(CX), Y10 + VMOVDQU 1696(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 - // Load and process 32 bytes from input 3 to 5 outputs - VMOVDQU (R8), Y8 + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y12 ADDQ $0x20, R8 - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU 960(CX), Y6 - VMOVDQU 992(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 1024(CX), Y6 - VMOVDQU 1056(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 1088(CX), Y6 - VMOVDQU 1120(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 1152(CX), Y6 - VMOVDQU 1184(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 1216(CX), Y6 - VMOVDQU 1248(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1728(CX), Y10 + VMOVDQU 1760(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 1792(CX), Y10 + VMOVDQU 1824(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 1856(CX), Y10 + VMOVDQU 1888(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 1920(CX), Y10 + VMOVDQU 1952(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 1984(CX), Y10 + VMOVDQU 2016(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 2048(CX), Y10 + VMOVDQU 2080(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 2112(CX), Y10 + VMOVDQU 2144(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 2176(CX), Y10 + VMOVDQU 2208(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 2240(CX), Y10 + VMOVDQU 2272(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 - // Load and process 32 bytes from input 4 to 5 outputs - VMOVDQU (R9), Y8 + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y12 ADDQ $0x20, R9 - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU 1280(CX), Y6 - VMOVDQU 1312(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 1344(CX), Y6 - VMOVDQU 1376(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 1408(CX), Y6 - VMOVDQU 1440(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 1472(CX), Y6 - VMOVDQU 1504(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 1536(CX), Y6 - VMOVDQU 1568(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 - - // Load and process 32 bytes from input 5 to 5 outputs - VMOVDQU (R10), Y8 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU 1600(CX), Y6 - VMOVDQU 1632(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 1664(CX), Y6 - VMOVDQU 1696(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 1728(CX), Y6 - VMOVDQU 1760(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 1792(CX), Y6 - VMOVDQU 1824(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 1856(CX), Y6 - VMOVDQU 1888(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2304(CX), Y10 + VMOVDQU 2336(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 2368(CX), Y10 + VMOVDQU 2400(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 2432(CX), Y10 + VMOVDQU 2464(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 2496(CX), Y10 + VMOVDQU 2528(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 2560(CX), Y10 + VMOVDQU 2592(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 2624(CX), Y10 + VMOVDQU 2656(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 2688(CX), Y10 + VMOVDQU 2720(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 2752(CX), Y10 + VMOVDQU 2784(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 2816(CX), Y10 + VMOVDQU 2848(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 - // Load and process 32 bytes from input 6 to 5 outputs - VMOVDQU (DX), Y8 + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (DX), Y12 ADDQ $0x20, DX - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU 1920(CX), Y6 - VMOVDQU 1952(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 1984(CX), Y6 - VMOVDQU 2016(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 2048(CX), Y6 - VMOVDQU 2080(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 2112(CX), Y6 - VMOVDQU 2144(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 2176(CX), Y6 - VMOVDQU 2208(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2880(CX), Y10 + VMOVDQU 2912(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 2944(CX), Y10 + VMOVDQU 2976(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 3008(CX), Y10 + VMOVDQU 3040(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 3072(CX), Y10 + VMOVDQU 3104(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 3136(CX), Y10 + VMOVDQU 3168(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 3200(CX), Y10 + VMOVDQU 3232(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 3264(CX), Y10 + VMOVDQU 3296(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 3328(CX), Y10 + VMOVDQU 3360(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 3392(CX), Y10 + VMOVDQU 3424(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 - // Store 5 outputs - VMOVDQU Y0, (R12) - ADDQ $0x20, R12 - VMOVDQU Y1, (R13) - ADDQ $0x20, R13 - VMOVDQU Y2, (R14) - ADDQ $0x20, R14 - VMOVDQU Y3, (R15) - ADDQ $0x20, R15 - VMOVDQU Y4, (R11) - ADDQ $0x20, R11 + // Store 9 outputs + MOVQ (R10), R12 + VMOVDQU Y0, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y1, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y2, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y3, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y4, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y8, (R12)(R11*1) // Prepare for next loop + ADDQ $0x20, R11 DECQ AX - JNZ mulAvxTwo_7x5_loop + JNZ mulAvxTwo_6x9Xor_loop VZEROUPPER -mulAvxTwo_7x5_end: +mulAvxTwo_6x9Xor_end: RET -// func mulAvxTwo_7x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_6x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_7x6(SB), NOSPLIT, $8-88 +TEXT ·mulAvxTwo_6x10(SB), NOSPLIT, $0-88 // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 95 YMM used + // Destination kept on stack + // Full registers estimated 135 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_7x6_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), R9 - MOVQ 144(AX), AX + JZ mulAvxTwo_6x10_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 - MOVQ (R10), R11 - MOVQ 24(R10), R12 - MOVQ 48(R10), R13 - MOVQ 72(R10), R14 - MOVQ 96(R10), R15 - MOVQ 120(R10), R10 - MOVQ start+72(FP), BP - - // Add start offset to output - ADDQ BP, R11 - ADDQ BP, R12 - ADDQ BP, R13 - ADDQ BP, R14 - ADDQ BP, R15 - ADDQ BP, R10 + MOVQ start+72(FP), R11 // Add start offset to input - ADDQ BP, DX - ADDQ BP, BX - ADDQ BP, SI - ADDQ BP, DI - ADDQ BP, R8 - ADDQ BP, R9 - ADDQ BP, AX - MOVQ $0x0000000f, BP - MOVQ BP, X6 - VPBROADCASTB X6, Y6 - MOVQ n+80(FP), BP - SHRQ $0x05, BP - -mulAvxTwo_7x6_loop: - // Clear 6 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - - // Load and process 32 bytes from input 0 to 6 outputs - VMOVDQU (DX), Y9 - ADDQ $0x20, DX - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 - VMOVDQU (CX), Y7 - VMOVDQU 32(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 - VMOVDQU 64(CX), Y7 - VMOVDQU 96(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 - VMOVDQU 128(CX), Y7 - VMOVDQU 160(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 - VMOVDQU 192(CX), Y7 - VMOVDQU 224(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 - VMOVDQU 256(CX), Y7 - VMOVDQU 288(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 - VMOVDQU 320(CX), Y7 - VMOVDQU 352(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X10 + VPBROADCASTB X10, Y10 - // Load and process 32 bytes from input 1 to 6 outputs - VMOVDQU (BX), Y9 +mulAvxTwo_6x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y13 ADDQ $0x20, BX - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 - VMOVDQU 384(CX), Y7 - VMOVDQU 416(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 - VMOVDQU 448(CX), Y7 - VMOVDQU 480(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 - VMOVDQU 512(CX), Y7 - VMOVDQU 544(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 - VMOVDQU 576(CX), Y7 - VMOVDQU 608(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 - VMOVDQU 640(CX), Y7 - VMOVDQU 672(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 - VMOVDQU 704(CX), Y7 - VMOVDQU 736(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y0 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y1 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y2 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y3 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y4 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y5 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y6 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y7 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y8 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y9 - // Load and process 32 bytes from input 2 to 6 outputs - VMOVDQU (SI), Y9 + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y13 ADDQ $0x20, SI - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 - VMOVDQU 768(CX), Y7 - VMOVDQU 800(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 - VMOVDQU 832(CX), Y7 - VMOVDQU 864(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 - VMOVDQU 896(CX), Y7 - VMOVDQU 928(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 - VMOVDQU 960(CX), Y7 - VMOVDQU 992(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 - VMOVDQU 1024(CX), Y7 - VMOVDQU 1056(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 - VMOVDQU 1088(CX), Y7 - VMOVDQU 1120(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1024(CX), Y11 + VMOVDQU 1056(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 1088(CX), Y11 + VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 - // Load and process 32 bytes from input 3 to 6 outputs - VMOVDQU (DI), Y9 + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y13 ADDQ $0x20, DI - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 - VMOVDQU 1152(CX), Y7 - VMOVDQU 1184(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 - VMOVDQU 1216(CX), Y7 - VMOVDQU 1248(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 - VMOVDQU 1280(CX), Y7 - VMOVDQU 1312(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 - VMOVDQU 1344(CX), Y7 - VMOVDQU 1376(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 - VMOVDQU 1408(CX), Y7 - VMOVDQU 1440(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 - VMOVDQU 1472(CX), Y7 - VMOVDQU 1504(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1280(CX), Y11 + VMOVDQU 1312(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 1344(CX), Y11 + VMOVDQU 1376(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 1408(CX), Y11 + VMOVDQU 1440(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 1472(CX), Y11 + VMOVDQU 1504(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 1536(CX), Y11 + VMOVDQU 1568(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 1600(CX), Y11 + VMOVDQU 1632(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1664(CX), Y11 + VMOVDQU 1696(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 1728(CX), Y11 + VMOVDQU 1760(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1792(CX), Y11 + VMOVDQU 1824(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1856(CX), Y11 + VMOVDQU 1888(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 - // Load and process 32 bytes from input 4 to 6 outputs - VMOVDQU (R8), Y9 + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y13 ADDQ $0x20, R8 - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 - VMOVDQU 1536(CX), Y7 - VMOVDQU 1568(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 - VMOVDQU 1600(CX), Y7 - VMOVDQU 1632(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 - VMOVDQU 1664(CX), Y7 - VMOVDQU 1696(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 - VMOVDQU 1728(CX), Y7 - VMOVDQU 1760(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 - VMOVDQU 1792(CX), Y7 - VMOVDQU 1824(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 - VMOVDQU 1856(CX), Y7 - VMOVDQU 1888(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1920(CX), Y11 + VMOVDQU 1952(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 1984(CX), Y11 + VMOVDQU 2016(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 2048(CX), Y11 + VMOVDQU 2080(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 2112(CX), Y11 + VMOVDQU 2144(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 2176(CX), Y11 + VMOVDQU 2208(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 2240(CX), Y11 + VMOVDQU 2272(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 2304(CX), Y11 + VMOVDQU 2336(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 2368(CX), Y11 + VMOVDQU 2400(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 2432(CX), Y11 + VMOVDQU 2464(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 2496(CX), Y11 + VMOVDQU 2528(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 - // Load and process 32 bytes from input 5 to 6 outputs - VMOVDQU (R9), Y9 + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y13 ADDQ $0x20, R9 - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 - VMOVDQU 1920(CX), Y7 - VMOVDQU 1952(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 - VMOVDQU 1984(CX), Y7 - VMOVDQU 2016(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 - VMOVDQU 2048(CX), Y7 - VMOVDQU 2080(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 - VMOVDQU 2112(CX), Y7 - VMOVDQU 2144(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 - VMOVDQU 2176(CX), Y7 - VMOVDQU 2208(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 - VMOVDQU 2240(CX), Y7 - VMOVDQU 2272(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 2560(CX), Y11 + VMOVDQU 2592(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 2624(CX), Y11 + VMOVDQU 2656(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 2688(CX), Y11 + VMOVDQU 2720(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 2752(CX), Y11 + VMOVDQU 2784(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 2816(CX), Y11 + VMOVDQU 2848(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 2880(CX), Y11 + VMOVDQU 2912(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 2944(CX), Y11 + VMOVDQU 2976(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 3008(CX), Y11 + VMOVDQU 3040(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 3072(CX), Y11 + VMOVDQU 3104(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 3136(CX), Y11 + VMOVDQU 3168(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 - // Load and process 32 bytes from input 6 to 6 outputs - VMOVDQU (AX), Y9 - ADDQ $0x20, AX - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 - VMOVDQU 2304(CX), Y7 - VMOVDQU 2336(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 - VMOVDQU 2368(CX), Y7 - VMOVDQU 2400(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 - VMOVDQU 2432(CX), Y7 - VMOVDQU 2464(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 - VMOVDQU 2496(CX), Y7 - VMOVDQU 2528(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 - VMOVDQU 2560(CX), Y7 - VMOVDQU 2592(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 - VMOVDQU 2624(CX), Y7 - VMOVDQU 2656(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3200(CX), Y11 + VMOVDQU 3232(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 3264(CX), Y11 + VMOVDQU 3296(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 3328(CX), Y11 + VMOVDQU 3360(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 3392(CX), Y11 + VMOVDQU 3424(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 3456(CX), Y11 + VMOVDQU 3488(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 3520(CX), Y11 + VMOVDQU 3552(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 3584(CX), Y11 + VMOVDQU 3616(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 3648(CX), Y11 + VMOVDQU 3680(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 3712(CX), Y11 + VMOVDQU 3744(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 3776(CX), Y11 + VMOVDQU 3808(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 - // Store 6 outputs - VMOVDQU Y0, (R11) - ADDQ $0x20, R11 - VMOVDQU Y1, (R12) - ADDQ $0x20, R12 - VMOVDQU Y2, (R13) - ADDQ $0x20, R13 - VMOVDQU Y3, (R14) - ADDQ $0x20, R14 - VMOVDQU Y4, (R15) - ADDQ $0x20, R15 - VMOVDQU Y5, (R10) - ADDQ $0x20, R10 + // Store 10 outputs + MOVQ (R10), R12 + VMOVDQU Y0, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y1, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y2, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y3, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y4, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 216(R10), R12 + VMOVDQU Y9, (R12)(R11*1) // Prepare for next loop - DECQ BP - JNZ mulAvxTwo_7x6_loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxTwo_6x10_loop VZEROUPPER -mulAvxTwo_7x6_end: +mulAvxTwo_6x10_end: RET -// func mulAvxTwo_7x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_6x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_7x7(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_6x10Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack - // Full registers estimated 110 YMM used + // Full registers estimated 135 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_7x7_end + JZ mulAvxTwo_6x10Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), DX - MOVQ out_base+48(FP), R11 - MOVQ start+72(FP), R12 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 // Add start offset to input - ADDQ R12, BX - ADDQ R12, SI - ADDQ R12, DI - ADDQ R12, R8 - ADDQ R12, R9 - ADDQ R12, R10 - ADDQ R12, DX - MOVQ $0x0000000f, R13 - MOVQ R13, X7 - VPBROADCASTB X7, Y7 - -mulAvxTwo_7x7_loop: - // Clear 7 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X10 + VPBROADCASTB X10, Y10 - // Load and process 32 bytes from input 0 to 7 outputs - VMOVDQU (BX), Y10 +mulAvxTwo_6x10Xor_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y13 ADDQ $0x20, BX - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU (CX), Y8 - VMOVDQU 32(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 64(CX), Y8 - VMOVDQU 96(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 128(CX), Y8 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + MOVQ (R10), R12 + VMOVDQU (R12)(R11*1), Y0 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + MOVQ 24(R10), R12 + VMOVDQU (R12)(R11*1), Y1 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + MOVQ 48(R10), R12 + VMOVDQU (R12)(R11*1), Y2 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + MOVQ 72(R10), R12 + VMOVDQU (R12)(R11*1), Y3 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + MOVQ 96(R10), R12 + VMOVDQU (R12)(R11*1), Y4 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + MOVQ 120(R10), R12 + VMOVDQU (R12)(R11*1), Y5 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + MOVQ 144(R10), R12 + VMOVDQU (R12)(R11*1), Y6 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + MOVQ 168(R10), R12 + VMOVDQU (R12)(R11*1), Y7 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + MOVQ 192(R10), R12 + VMOVDQU (R12)(R11*1), Y8 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + MOVQ 216(R10), R12 + VMOVDQU (R12)(R11*1), Y9 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y13 + ADDQ $0x20, SI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1024(CX), Y11 + VMOVDQU 1056(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 1088(CX), Y11 + VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y13 + ADDQ $0x20, DI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1280(CX), Y11 + VMOVDQU 1312(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 1344(CX), Y11 + VMOVDQU 1376(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 1408(CX), Y11 + VMOVDQU 1440(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 1472(CX), Y11 + VMOVDQU 1504(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 1536(CX), Y11 + VMOVDQU 1568(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 1600(CX), Y11 + VMOVDQU 1632(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1664(CX), Y11 + VMOVDQU 1696(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 1728(CX), Y11 + VMOVDQU 1760(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1792(CX), Y11 + VMOVDQU 1824(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1856(CX), Y11 + VMOVDQU 1888(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y13 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1920(CX), Y11 + VMOVDQU 1952(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 1984(CX), Y11 + VMOVDQU 2016(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 2048(CX), Y11 + VMOVDQU 2080(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 2112(CX), Y11 + VMOVDQU 2144(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 2176(CX), Y11 + VMOVDQU 2208(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 2240(CX), Y11 + VMOVDQU 2272(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 2304(CX), Y11 + VMOVDQU 2336(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 2368(CX), Y11 + VMOVDQU 2400(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 2432(CX), Y11 + VMOVDQU 2464(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 2496(CX), Y11 + VMOVDQU 2528(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y13 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 2560(CX), Y11 + VMOVDQU 2592(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 2624(CX), Y11 + VMOVDQU 2656(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 2688(CX), Y11 + VMOVDQU 2720(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 2752(CX), Y11 + VMOVDQU 2784(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 2816(CX), Y11 + VMOVDQU 2848(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 2880(CX), Y11 + VMOVDQU 2912(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 2944(CX), Y11 + VMOVDQU 2976(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 3008(CX), Y11 + VMOVDQU 3040(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 3072(CX), Y11 + VMOVDQU 3104(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 3136(CX), Y11 + VMOVDQU 3168(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3200(CX), Y11 + VMOVDQU 3232(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 3264(CX), Y11 + VMOVDQU 3296(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 3328(CX), Y11 + VMOVDQU 3360(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 3392(CX), Y11 + VMOVDQU 3424(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 3456(CX), Y11 + VMOVDQU 3488(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 3520(CX), Y11 + VMOVDQU 3552(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 3584(CX), Y11 + VMOVDQU 3616(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 3648(CX), Y11 + VMOVDQU 3680(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 3712(CX), Y11 + VMOVDQU 3744(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 3776(CX), Y11 + VMOVDQU 3808(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Store 10 outputs + MOVQ (R10), R12 + VMOVDQU Y0, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y1, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y2, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y3, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y4, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 216(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxTwo_6x10Xor_loop + VZEROUPPER + +mulAvxTwo_6x10Xor_end: + RET + +// func mulAvxTwo_7x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x1(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x1_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ (R11), R11 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X1 + VPBROADCASTB X1, Y1 + +mulAvxTwo_7x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y4 + ADDQ $0x20, BX + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU (CX), Y2 + VMOVDQU 32(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y0 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y4 + ADDQ $0x20, SI + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y4 + ADDQ $0x20, DI + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 128(CX), Y2 + VMOVDQU 160(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (R8), Y4 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 192(CX), Y2 + VMOVDQU 224(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R9), Y4 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 256(CX), Y2 + VMOVDQU 288(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R10), Y4 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 320(CX), Y2 + VMOVDQU 352(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (DX), Y4 + ADDQ $0x20, DX + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 384(CX), Y2 + VMOVDQU 416(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Store 1 outputs + VMOVDQU Y0, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_7x1_loop + VZEROUPPER + +mulAvxTwo_7x1_end: + RET + +// func mulAvxTwo_7x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x1_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_7x1_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R11 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_7x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y0 + VPXOR Y5, Y6, Y1 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU (R8), Y6 + VMOVDQU 32(R8), Y5 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU (R9), Y6 + VMOVDQU 32(R9), Y5 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU (R10), Y6 + VMOVDQU 32(R10), Y5 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Store 1 outputs + VMOVDQU Y0, (R11) + VMOVDQU Y1, 32(R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_7x1_64_loop + VZEROUPPER + +mulAvxTwo_7x1_64_end: + RET + +// func mulAvxTwo_7x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x1Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x1Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ (R11), R11 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X1 + VPBROADCASTB X1, Y1 + +mulAvxTwo_7x1Xor_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y4 + ADDQ $0x20, BX + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU (R11), Y0 + VMOVDQU (CX), Y2 + VMOVDQU 32(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y4 + ADDQ $0x20, SI + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y4 + ADDQ $0x20, DI + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 128(CX), Y2 + VMOVDQU 160(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (R8), Y4 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 192(CX), Y2 + VMOVDQU 224(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R9), Y4 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 256(CX), Y2 + VMOVDQU 288(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R10), Y4 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 320(CX), Y2 + VMOVDQU 352(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (DX), Y4 + ADDQ $0x20, DX + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 384(CX), Y2 + VMOVDQU 416(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Store 1 outputs + VMOVDQU Y0, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_7x1Xor_loop + VZEROUPPER + +mulAvxTwo_7x1Xor_end: + RET + +// func mulAvxTwo_7x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x1_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_7x1_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R11 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_7x1_64Xor_loop: + // Load 1 outputs + VMOVDQU (R11), Y0 + VMOVDQU 32(R11), Y1 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU (R8), Y6 + VMOVDQU 32(R8), Y5 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU (R9), Y6 + VMOVDQU 32(R9), Y5 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU (R10), Y6 + VMOVDQU 32(R10), Y5 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Store 1 outputs + VMOVDQU Y0, (R11) + VMOVDQU Y1, 32(R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_7x1_64Xor_loop + VZEROUPPER + +mulAvxTwo_7x1_64Xor_end: + RET + +// func mulAvxTwo_7x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x2(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 35 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x2_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R11 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + ADDQ R13, R11 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_7x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y5 + ADDQ $0x20, BX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y0 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y1 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y5 + ADDQ $0x20, SI + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y5 + ADDQ $0x20, DI + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y5 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y5 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 512(CX), Y3 + VMOVDQU 544(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 576(CX), Y3 + VMOVDQU 608(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y5 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 640(CX), Y3 + VMOVDQU 672(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 704(CX), Y3 + VMOVDQU 736(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (DX), Y5 + ADDQ $0x20, DX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 768(CX), Y3 + VMOVDQU 800(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 832(CX), Y3 + VMOVDQU 864(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Store 2 outputs + VMOVDQU Y0, (R12) + ADDQ $0x20, R12 + VMOVDQU Y1, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_7x2_loop + VZEROUPPER + +mulAvxTwo_7x2_end: + RET + +// func mulAvxTwo_7x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x2_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_7x2_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R11 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + ADDQ R13, R11 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_7x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y0 + VPXOR Y7, Y8, Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y2 + VPXOR Y7, Y8, Y3 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y11 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y9 + VMOVDQU 32(R9), Y11 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y9 + VMOVDQU 32(R10), Y11 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Store 2 outputs + VMOVDQU Y0, (R12) + VMOVDQU Y1, 32(R12) + ADDQ $0x40, R12 + VMOVDQU Y2, (R11) + VMOVDQU Y3, 32(R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_7x2_64_loop + VZEROUPPER + +mulAvxTwo_7x2_64_end: + RET + +// func mulAvxTwo_7x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x2Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 35 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x2Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R11 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + ADDQ R13, R11 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_7x2Xor_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y5 + ADDQ $0x20, BX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU (R12), Y0 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU (R11), Y1 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y5 + ADDQ $0x20, SI + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y5 + ADDQ $0x20, DI + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y5 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y5 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 512(CX), Y3 + VMOVDQU 544(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 576(CX), Y3 + VMOVDQU 608(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y5 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 640(CX), Y3 + VMOVDQU 672(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 704(CX), Y3 + VMOVDQU 736(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (DX), Y5 + ADDQ $0x20, DX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 768(CX), Y3 + VMOVDQU 800(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 832(CX), Y3 + VMOVDQU 864(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Store 2 outputs + VMOVDQU Y0, (R12) + ADDQ $0x20, R12 + VMOVDQU Y1, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_7x2Xor_loop + VZEROUPPER + +mulAvxTwo_7x2Xor_end: + RET + +// func mulAvxTwo_7x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x2_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_7x2_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R11 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + ADDQ R13, R11 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_7x2_64Xor_loop: + // Load 2 outputs + VMOVDQU (R12), Y0 + VMOVDQU 32(R12), Y1 + VMOVDQU (R11), Y2 + VMOVDQU 32(R11), Y3 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y11 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y9 + VMOVDQU 32(R9), Y11 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y9 + VMOVDQU 32(R10), Y11 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Store 2 outputs + VMOVDQU Y0, (R12) + VMOVDQU Y1, 32(R12) + ADDQ $0x40, R12 + VMOVDQU Y2, (R11) + VMOVDQU Y3, 32(R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_7x2_64Xor_loop + VZEROUPPER + +mulAvxTwo_7x2_64Xor_end: + RET + +// func mulAvxTwo_7x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x3(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x3_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R11 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R11 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X3 + VPBROADCASTB X3, Y3 + +mulAvxTwo_7x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y6 + ADDQ $0x20, BX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU (CX), Y4 + VMOVDQU 32(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y0 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y1 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y2 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y6 + ADDQ $0x20, SI + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 192(CX), Y4 + VMOVDQU 224(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 256(CX), Y4 + VMOVDQU 288(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 320(CX), Y4 + VMOVDQU 352(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y6 + ADDQ $0x20, DI + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 384(CX), Y4 + VMOVDQU 416(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 448(CX), Y4 + VMOVDQU 480(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 512(CX), Y4 + VMOVDQU 544(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y6 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 576(CX), Y4 + VMOVDQU 608(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 640(CX), Y4 + VMOVDQU 672(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 704(CX), Y4 + VMOVDQU 736(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y6 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 768(CX), Y4 + VMOVDQU 800(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 832(CX), Y4 + VMOVDQU 864(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 896(CX), Y4 + VMOVDQU 928(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y6 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 960(CX), Y4 + VMOVDQU 992(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1024(CX), Y4 + VMOVDQU 1056(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1088(CX), Y4 + VMOVDQU 1120(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1152(CX), Y4 + VMOVDQU 1184(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1216(CX), Y4 + VMOVDQU 1248(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1280(CX), Y4 + VMOVDQU 1312(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Store 3 outputs + VMOVDQU Y0, (R12) + ADDQ $0x20, R12 + VMOVDQU Y1, (R13) + ADDQ $0x20, R13 + VMOVDQU Y2, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_7x3_loop + VZEROUPPER + +mulAvxTwo_7x3_end: + RET + +// func mulAvxTwo_7x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x3_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 94 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_7x3_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R11 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R11 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_7x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y0 + VPXOR Y9, Y10, Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y2 + VPXOR Y9, Y10, Y3 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y4 + VPXOR Y9, Y10, Y5 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y11 + VMOVDQU 32(DI), Y13 + ADDQ $0x40, DI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y11 + VMOVDQU 32(R8), Y13 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y11 + VMOVDQU 32(R9), Y13 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y11 + VMOVDQU 32(R10), Y13 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Store 3 outputs + VMOVDQU Y0, (R12) + VMOVDQU Y1, 32(R12) + ADDQ $0x40, R12 + VMOVDQU Y2, (R13) + VMOVDQU Y3, 32(R13) + ADDQ $0x40, R13 + VMOVDQU Y4, (R11) + VMOVDQU Y5, 32(R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_7x3_64_loop + VZEROUPPER + +mulAvxTwo_7x3_64_end: + RET + +// func mulAvxTwo_7x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x3Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x3Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R11 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R11 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X3 + VPBROADCASTB X3, Y3 + +mulAvxTwo_7x3Xor_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y6 + ADDQ $0x20, BX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU (R12), Y0 + VMOVDQU (CX), Y4 + VMOVDQU 32(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU (R13), Y1 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU (R11), Y2 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y6 + ADDQ $0x20, SI + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 192(CX), Y4 + VMOVDQU 224(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 256(CX), Y4 + VMOVDQU 288(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 320(CX), Y4 + VMOVDQU 352(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y6 + ADDQ $0x20, DI + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 384(CX), Y4 + VMOVDQU 416(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 448(CX), Y4 + VMOVDQU 480(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 512(CX), Y4 + VMOVDQU 544(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y6 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 576(CX), Y4 + VMOVDQU 608(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 640(CX), Y4 + VMOVDQU 672(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 704(CX), Y4 + VMOVDQU 736(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y6 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 768(CX), Y4 + VMOVDQU 800(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 832(CX), Y4 + VMOVDQU 864(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 896(CX), Y4 + VMOVDQU 928(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y6 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 960(CX), Y4 + VMOVDQU 992(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1024(CX), Y4 + VMOVDQU 1056(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1088(CX), Y4 + VMOVDQU 1120(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1152(CX), Y4 + VMOVDQU 1184(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1216(CX), Y4 + VMOVDQU 1248(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1280(CX), Y4 + VMOVDQU 1312(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Store 3 outputs + VMOVDQU Y0, (R12) + ADDQ $0x20, R12 + VMOVDQU Y1, (R13) + ADDQ $0x20, R13 + VMOVDQU Y2, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_7x3Xor_loop + VZEROUPPER + +mulAvxTwo_7x3Xor_end: + RET + +// func mulAvxTwo_7x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x3_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 94 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_7x3_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R11 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R11 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_7x3_64Xor_loop: + // Load 3 outputs + VMOVDQU (R12), Y0 + VMOVDQU 32(R12), Y1 + VMOVDQU (R13), Y2 + VMOVDQU 32(R13), Y3 + VMOVDQU (R11), Y4 + VMOVDQU 32(R11), Y5 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y11 + VMOVDQU 32(DI), Y13 + ADDQ $0x40, DI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y11 + VMOVDQU 32(R8), Y13 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y11 + VMOVDQU 32(R9), Y13 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y11 + VMOVDQU 32(R10), Y13 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Store 3 outputs + VMOVDQU Y0, (R12) + VMOVDQU Y1, 32(R12) + ADDQ $0x40, R12 + VMOVDQU Y2, (R13) + VMOVDQU Y3, 32(R13) + ADDQ $0x40, R13 + VMOVDQU Y4, (R11) + VMOVDQU Y5, 32(R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_7x3_64Xor_loop + VZEROUPPER + +mulAvxTwo_7x3_64Xor_end: + RET + +// func mulAvxTwo_7x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x4(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x4_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R11 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R11 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_7x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y0 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y1 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y2 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y3 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y7 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y7 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1280(CX), Y5 + VMOVDQU 1312(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1344(CX), Y5 + VMOVDQU 1376(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1408(CX), Y5 + VMOVDQU 1440(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1472(CX), Y5 + VMOVDQU 1504(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1536(CX), Y5 + VMOVDQU 1568(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1600(CX), Y5 + VMOVDQU 1632(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1664(CX), Y5 + VMOVDQU 1696(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1728(CX), Y5 + VMOVDQU 1760(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Store 4 outputs + VMOVDQU Y0, (R12) + ADDQ $0x20, R12 + VMOVDQU Y1, (R13) + ADDQ $0x20, R13 + VMOVDQU Y2, (R14) + ADDQ $0x20, R14 + VMOVDQU Y3, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_7x4_loop + VZEROUPPER + +mulAvxTwo_7x4_end: + RET + +// func mulAvxTwo_7x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x4Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x4Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R11 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R11 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_7x4Xor_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (R12), Y0 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU (R13), Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU (R14), Y2 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU (R11), Y3 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y7 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y7 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1280(CX), Y5 + VMOVDQU 1312(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1344(CX), Y5 + VMOVDQU 1376(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1408(CX), Y5 + VMOVDQU 1440(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1472(CX), Y5 + VMOVDQU 1504(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1536(CX), Y5 + VMOVDQU 1568(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1600(CX), Y5 + VMOVDQU 1632(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1664(CX), Y5 + VMOVDQU 1696(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1728(CX), Y5 + VMOVDQU 1760(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Store 4 outputs + VMOVDQU Y0, (R12) + ADDQ $0x20, R12 + VMOVDQU Y1, (R13) + ADDQ $0x20, R13 + VMOVDQU Y2, (R14) + ADDQ $0x20, R14 + VMOVDQU Y3, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_7x4Xor_loop + VZEROUPPER + +mulAvxTwo_7x4Xor_end: + RET + +// func mulAvxTwo_7x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x5(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 80 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x5_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X5 + VPBROADCASTB X5, Y5 + +mulAvxTwo_7x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y0 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y1 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y2 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y3 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y4 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y8 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1280(CX), Y6 + VMOVDQU 1312(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1344(CX), Y6 + VMOVDQU 1376(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1408(CX), Y6 + VMOVDQU 1440(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1472(CX), Y6 + VMOVDQU 1504(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1536(CX), Y6 + VMOVDQU 1568(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y8 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1600(CX), Y6 + VMOVDQU 1632(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1664(CX), Y6 + VMOVDQU 1696(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1728(CX), Y6 + VMOVDQU 1760(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1792(CX), Y6 + VMOVDQU 1824(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1856(CX), Y6 + VMOVDQU 1888(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1920(CX), Y6 + VMOVDQU 1952(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1984(CX), Y6 + VMOVDQU 2016(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 2048(CX), Y6 + VMOVDQU 2080(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 2112(CX), Y6 + VMOVDQU 2144(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 2176(CX), Y6 + VMOVDQU 2208(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Store 5 outputs + VMOVDQU Y0, (R12) + ADDQ $0x20, R12 + VMOVDQU Y1, (R13) + ADDQ $0x20, R13 + VMOVDQU Y2, (R14) + ADDQ $0x20, R14 + VMOVDQU Y3, (R15) + ADDQ $0x20, R15 + VMOVDQU Y4, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_7x5_loop + VZEROUPPER + +mulAvxTwo_7x5_end: + RET + +// func mulAvxTwo_7x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x5Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 80 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x5Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X5 + VPBROADCASTB X5, Y5 + +mulAvxTwo_7x5Xor_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (R12), Y0 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU (R13), Y1 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU (R14), Y2 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU (R15), Y3 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU (R11), Y4 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y8 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1280(CX), Y6 + VMOVDQU 1312(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1344(CX), Y6 + VMOVDQU 1376(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1408(CX), Y6 + VMOVDQU 1440(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1472(CX), Y6 + VMOVDQU 1504(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1536(CX), Y6 + VMOVDQU 1568(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y8 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1600(CX), Y6 + VMOVDQU 1632(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1664(CX), Y6 + VMOVDQU 1696(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1728(CX), Y6 + VMOVDQU 1760(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1792(CX), Y6 + VMOVDQU 1824(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1856(CX), Y6 + VMOVDQU 1888(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1920(CX), Y6 + VMOVDQU 1952(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1984(CX), Y6 + VMOVDQU 2016(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 2048(CX), Y6 + VMOVDQU 2080(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 2112(CX), Y6 + VMOVDQU 2144(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 2176(CX), Y6 + VMOVDQU 2208(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Store 5 outputs + VMOVDQU Y0, (R12) + ADDQ $0x20, R12 + VMOVDQU Y1, (R13) + ADDQ $0x20, R13 + VMOVDQU Y2, (R14) + ADDQ $0x20, R14 + VMOVDQU Y3, (R15) + ADDQ $0x20, R15 + VMOVDQU Y4, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_7x5Xor_loop + VZEROUPPER + +mulAvxTwo_7x5Xor_end: + RET + +// func mulAvxTwo_7x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x6(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 95 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x6_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), AX + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X6 + VPBROADCASTB X6, Y6 + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxTwo_7x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y0 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y1 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y2 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y3 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y4 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y5 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1728(CX), Y7 + VMOVDQU 1760(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1792(CX), Y7 + VMOVDQU 1824(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1856(CX), Y7 + VMOVDQU 1888(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R9), Y9 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1920(CX), Y7 + VMOVDQU 1952(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1984(CX), Y7 + VMOVDQU 2016(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2048(CX), Y7 + VMOVDQU 2080(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2112(CX), Y7 + VMOVDQU 2144(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2176(CX), Y7 + VMOVDQU 2208(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 2240(CX), Y7 + VMOVDQU 2272(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (AX), Y9 + ADDQ $0x20, AX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2304(CX), Y7 + VMOVDQU 2336(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 2368(CX), Y7 + VMOVDQU 2400(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2432(CX), Y7 + VMOVDQU 2464(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2496(CX), Y7 + VMOVDQU 2528(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2560(CX), Y7 + VMOVDQU 2592(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 2624(CX), Y7 + VMOVDQU 2656(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Store 6 outputs + VMOVDQU Y0, (R11) + ADDQ $0x20, R11 + VMOVDQU Y1, (R12) + ADDQ $0x20, R12 + VMOVDQU Y2, (R13) + ADDQ $0x20, R13 + VMOVDQU Y3, (R14) + ADDQ $0x20, R14 + VMOVDQU Y4, (R15) + ADDQ $0x20, R15 + VMOVDQU Y5, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ BP + JNZ mulAvxTwo_7x6_loop + VZEROUPPER + +mulAvxTwo_7x6_end: + RET + +// func mulAvxTwo_7x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x6Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 95 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x6Xor_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), AX + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X6 + VPBROADCASTB X6, Y6 + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxTwo_7x6Xor_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (R11), Y0 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU (R12), Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU (R13), Y2 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU (R14), Y3 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU (R15), Y4 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU (R10), Y5 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1728(CX), Y7 + VMOVDQU 1760(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1792(CX), Y7 + VMOVDQU 1824(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1856(CX), Y7 + VMOVDQU 1888(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R9), Y9 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1920(CX), Y7 + VMOVDQU 1952(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1984(CX), Y7 + VMOVDQU 2016(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2048(CX), Y7 + VMOVDQU 2080(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2112(CX), Y7 + VMOVDQU 2144(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2176(CX), Y7 + VMOVDQU 2208(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 2240(CX), Y7 + VMOVDQU 2272(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (AX), Y9 + ADDQ $0x20, AX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2304(CX), Y7 + VMOVDQU 2336(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 2368(CX), Y7 + VMOVDQU 2400(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2432(CX), Y7 + VMOVDQU 2464(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2496(CX), Y7 + VMOVDQU 2528(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2560(CX), Y7 + VMOVDQU 2592(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 2624(CX), Y7 + VMOVDQU 2656(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Store 6 outputs + VMOVDQU Y0, (R11) + ADDQ $0x20, R11 + VMOVDQU Y1, (R12) + ADDQ $0x20, R12 + VMOVDQU Y2, (R13) + ADDQ $0x20, R13 + VMOVDQU Y3, (R14) + ADDQ $0x20, R14 + VMOVDQU Y4, (R15) + ADDQ $0x20, R15 + VMOVDQU Y5, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ BP + JNZ mulAvxTwo_7x6Xor_loop + VZEROUPPER + +mulAvxTwo_7x6Xor_end: + RET + +// func mulAvxTwo_7x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x7(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 110 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x7_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X7 + VPBROADCASTB X7, Y7 + +mulAvxTwo_7x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y0 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y1 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y2 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y3 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y4 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y5 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y6 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1472(CX), Y8 + VMOVDQU 1504(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1728(CX), Y8 + VMOVDQU 1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y10 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1792(CX), Y8 + VMOVDQU 1824(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1856(CX), Y8 + VMOVDQU 1888(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1920(CX), Y8 + VMOVDQU 1952(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1984(CX), Y8 + VMOVDQU 2016(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2048(CX), Y8 + VMOVDQU 2080(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2112(CX), Y8 + VMOVDQU 2144(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2176(CX), Y8 + VMOVDQU 2208(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y10 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2240(CX), Y8 + VMOVDQU 2272(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 2304(CX), Y8 + VMOVDQU 2336(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 2368(CX), Y8 + VMOVDQU 2400(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 2432(CX), Y8 + VMOVDQU 2464(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2496(CX), Y8 + VMOVDQU 2528(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2560(CX), Y8 + VMOVDQU 2592(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2624(CX), Y8 + VMOVDQU 2656(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2688(CX), Y8 + VMOVDQU 2720(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 2752(CX), Y8 + VMOVDQU 2784(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 2816(CX), Y8 + VMOVDQU 2848(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 2880(CX), Y8 + VMOVDQU 2912(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2944(CX), Y8 + VMOVDQU 2976(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 3008(CX), Y8 + VMOVDQU 3040(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 3072(CX), Y8 + VMOVDQU 3104(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Store 7 outputs + MOVQ (R11), R13 + VMOVDQU Y0, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y1, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y2, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y3, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y4, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxTwo_7x7_loop + VZEROUPPER + +mulAvxTwo_7x7_end: + RET + +// func mulAvxTwo_7x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x7Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 110 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x7Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X7 + VPBROADCASTB X7, Y7 + +mulAvxTwo_7x7Xor_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y0 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y1 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y2 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y3 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y4 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y5 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y6 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1472(CX), Y8 + VMOVDQU 1504(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1728(CX), Y8 + VMOVDQU 1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y10 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1792(CX), Y8 + VMOVDQU 1824(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1856(CX), Y8 + VMOVDQU 1888(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1920(CX), Y8 + VMOVDQU 1952(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1984(CX), Y8 + VMOVDQU 2016(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2048(CX), Y8 + VMOVDQU 2080(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2112(CX), Y8 + VMOVDQU 2144(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2176(CX), Y8 + VMOVDQU 2208(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y10 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2240(CX), Y8 + VMOVDQU 2272(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 2304(CX), Y8 + VMOVDQU 2336(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 2368(CX), Y8 + VMOVDQU 2400(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 2432(CX), Y8 + VMOVDQU 2464(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2496(CX), Y8 + VMOVDQU 2528(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2560(CX), Y8 + VMOVDQU 2592(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2624(CX), Y8 + VMOVDQU 2656(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2688(CX), Y8 + VMOVDQU 2720(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 2752(CX), Y8 + VMOVDQU 2784(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 2816(CX), Y8 + VMOVDQU 2848(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 2880(CX), Y8 + VMOVDQU 2912(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2944(CX), Y8 + VMOVDQU 2976(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 3008(CX), Y8 + VMOVDQU 3040(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 3072(CX), Y8 + VMOVDQU 3104(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Store 7 outputs + MOVQ (R11), R13 + VMOVDQU Y0, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y1, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y2, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y3, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y4, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxTwo_7x7Xor_loop + VZEROUPPER + +mulAvxTwo_7x7Xor_end: + RET + +// func mulAvxTwo_7x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x8(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 125 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x8_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X8 + VPBROADCASTB X8, Y8 + +mulAvxTwo_7x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y0 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y1 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y2 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y3 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y4 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y5 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y6 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y7 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y11 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1600(CX), Y9 + VMOVDQU 1632(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1664(CX), Y9 + VMOVDQU 1696(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y11 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2048(CX), Y9 + VMOVDQU 2080(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2112(CX), Y9 + VMOVDQU 2144(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2176(CX), Y9 + VMOVDQU 2208(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2240(CX), Y9 + VMOVDQU 2272(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2304(CX), Y9 + VMOVDQU 2336(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2368(CX), Y9 + VMOVDQU 2400(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2432(CX), Y9 + VMOVDQU 2464(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 2496(CX), Y9 + VMOVDQU 2528(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y11 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2560(CX), Y9 + VMOVDQU 2592(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2624(CX), Y9 + VMOVDQU 2656(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2688(CX), Y9 + VMOVDQU 2720(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2752(CX), Y9 + VMOVDQU 2784(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2816(CX), Y9 + VMOVDQU 2848(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2880(CX), Y9 + VMOVDQU 2912(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2944(CX), Y9 + VMOVDQU 2976(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 3008(CX), Y9 + VMOVDQU 3040(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3072(CX), Y9 + VMOVDQU 3104(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 3136(CX), Y9 + VMOVDQU 3168(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 3200(CX), Y9 + VMOVDQU 3232(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 3264(CX), Y9 + VMOVDQU 3296(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 3328(CX), Y9 + VMOVDQU 3360(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 3392(CX), Y9 + VMOVDQU 3424(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 3456(CX), Y9 + VMOVDQU 3488(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 3520(CX), Y9 + VMOVDQU 3552(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Store 8 outputs + MOVQ (R11), R13 + VMOVDQU Y0, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y1, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y2, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y3, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y4, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxTwo_7x8_loop + VZEROUPPER + +mulAvxTwo_7x8_end: + RET + +// func mulAvxTwo_7x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x8Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 125 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x8Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X8 + VPBROADCASTB X8, Y8 + +mulAvxTwo_7x8Xor_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y0 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y1 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y2 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y3 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y4 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y5 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y6 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + MOVQ 168(R11), R13 + VMOVDQU (R13)(R12*1), Y7 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y11 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1600(CX), Y9 + VMOVDQU 1632(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1664(CX), Y9 + VMOVDQU 1696(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y11 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2048(CX), Y9 + VMOVDQU 2080(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2112(CX), Y9 + VMOVDQU 2144(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2176(CX), Y9 + VMOVDQU 2208(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2240(CX), Y9 + VMOVDQU 2272(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2304(CX), Y9 + VMOVDQU 2336(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2368(CX), Y9 + VMOVDQU 2400(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2432(CX), Y9 + VMOVDQU 2464(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 2496(CX), Y9 + VMOVDQU 2528(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y11 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2560(CX), Y9 + VMOVDQU 2592(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2624(CX), Y9 + VMOVDQU 2656(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2688(CX), Y9 + VMOVDQU 2720(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2752(CX), Y9 + VMOVDQU 2784(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2816(CX), Y9 + VMOVDQU 2848(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2880(CX), Y9 + VMOVDQU 2912(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2944(CX), Y9 + VMOVDQU 2976(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 3008(CX), Y9 + VMOVDQU 3040(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3072(CX), Y9 + VMOVDQU 3104(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 3136(CX), Y9 + VMOVDQU 3168(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 3200(CX), Y9 + VMOVDQU 3232(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 3264(CX), Y9 + VMOVDQU 3296(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 3328(CX), Y9 + VMOVDQU 3360(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 3392(CX), Y9 + VMOVDQU 3424(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 3456(CX), Y9 + VMOVDQU 3488(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 3520(CX), Y9 + VMOVDQU 3552(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Store 8 outputs + MOVQ (R11), R13 + VMOVDQU Y0, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y1, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y2, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y3, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y4, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxTwo_7x8Xor_loop + VZEROUPPER + +mulAvxTwo_7x8Xor_end: + RET + +// func mulAvxTwo_7x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x9(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 140 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x9_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X9 + VPBROADCASTB X9, Y9 + +mulAvxTwo_7x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y0 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y1 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y2 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y3 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y4 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y5 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y6 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y7 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y8 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1152(CX), Y10 + VMOVDQU 1184(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 1216(CX), Y10 + VMOVDQU 1248(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 1280(CX), Y10 + VMOVDQU 1312(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 1344(CX), Y10 + VMOVDQU 1376(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 1408(CX), Y10 + VMOVDQU 1440(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 1472(CX), Y10 + VMOVDQU 1504(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 1536(CX), Y10 + VMOVDQU 1568(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1600(CX), Y10 + VMOVDQU 1632(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1664(CX), Y10 + VMOVDQU 1696(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y12 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1728(CX), Y10 + VMOVDQU 1760(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 1792(CX), Y10 + VMOVDQU 1824(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 1856(CX), Y10 + VMOVDQU 1888(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 1920(CX), Y10 + VMOVDQU 1952(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 1984(CX), Y10 + VMOVDQU 2016(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 2048(CX), Y10 + VMOVDQU 2080(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 2112(CX), Y10 + VMOVDQU 2144(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 2176(CX), Y10 + VMOVDQU 2208(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 2240(CX), Y10 + VMOVDQU 2272(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y12 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2304(CX), Y10 + VMOVDQU 2336(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 2368(CX), Y10 + VMOVDQU 2400(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 2432(CX), Y10 + VMOVDQU 2464(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 2496(CX), Y10 + VMOVDQU 2528(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 2560(CX), Y10 + VMOVDQU 2592(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 2624(CX), Y10 + VMOVDQU 2656(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 2688(CX), Y10 + VMOVDQU 2720(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 2752(CX), Y10 + VMOVDQU 2784(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 2816(CX), Y10 + VMOVDQU 2848(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y12 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2880(CX), Y10 + VMOVDQU 2912(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 2944(CX), Y10 + VMOVDQU 2976(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 3008(CX), Y10 + VMOVDQU 3040(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 3072(CX), Y10 + VMOVDQU 3104(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 3136(CX), Y10 + VMOVDQU 3168(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 3200(CX), Y10 + VMOVDQU 3232(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 3264(CX), Y10 + VMOVDQU 3296(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 3328(CX), Y10 + VMOVDQU 3360(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 3392(CX), Y10 + VMOVDQU 3424(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 3456(CX), Y10 + VMOVDQU 3488(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 3520(CX), Y10 + VMOVDQU 3552(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 3584(CX), Y10 + VMOVDQU 3616(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 3648(CX), Y10 + VMOVDQU 3680(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 3712(CX), Y10 + VMOVDQU 3744(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 3776(CX), Y10 + VMOVDQU 3808(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 3840(CX), Y10 + VMOVDQU 3872(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 3904(CX), Y10 + VMOVDQU 3936(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 3968(CX), Y10 + VMOVDQU 4000(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Store 9 outputs + MOVQ (R11), R13 + VMOVDQU Y0, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y1, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y2, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y3, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y4, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxTwo_7x9_loop + VZEROUPPER + +mulAvxTwo_7x9_end: + RET + +// func mulAvxTwo_7x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x9Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 140 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x9Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X9 + VPBROADCASTB X9, Y9 + +mulAvxTwo_7x9Xor_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y0 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y1 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y2 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y3 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y4 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y5 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y6 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + MOVQ 168(R11), R13 + VMOVDQU (R13)(R12*1), Y7 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + MOVQ 192(R11), R13 + VMOVDQU (R13)(R12*1), Y8 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1152(CX), Y10 + VMOVDQU 1184(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 1216(CX), Y10 + VMOVDQU 1248(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 1280(CX), Y10 + VMOVDQU 1312(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 1344(CX), Y10 + VMOVDQU 1376(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 1408(CX), Y10 + VMOVDQU 1440(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 1472(CX), Y10 + VMOVDQU 1504(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 1536(CX), Y10 + VMOVDQU 1568(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1600(CX), Y10 + VMOVDQU 1632(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1664(CX), Y10 + VMOVDQU 1696(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y12 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1728(CX), Y10 + VMOVDQU 1760(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 1792(CX), Y10 + VMOVDQU 1824(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 1856(CX), Y10 + VMOVDQU 1888(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 1920(CX), Y10 + VMOVDQU 1952(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 1984(CX), Y10 + VMOVDQU 2016(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 2048(CX), Y10 + VMOVDQU 2080(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 2112(CX), Y10 + VMOVDQU 2144(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 2176(CX), Y10 + VMOVDQU 2208(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 2240(CX), Y10 + VMOVDQU 2272(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y12 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2304(CX), Y10 + VMOVDQU 2336(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 2368(CX), Y10 + VMOVDQU 2400(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 2432(CX), Y10 + VMOVDQU 2464(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 2496(CX), Y10 + VMOVDQU 2528(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 2560(CX), Y10 + VMOVDQU 2592(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 2624(CX), Y10 + VMOVDQU 2656(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 2688(CX), Y10 + VMOVDQU 2720(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 2752(CX), Y10 + VMOVDQU 2784(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 2816(CX), Y10 + VMOVDQU 2848(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y12 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2880(CX), Y10 + VMOVDQU 2912(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 2944(CX), Y10 + VMOVDQU 2976(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 3008(CX), Y10 + VMOVDQU 3040(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 3072(CX), Y10 + VMOVDQU 3104(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 3136(CX), Y10 + VMOVDQU 3168(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 3200(CX), Y10 + VMOVDQU 3232(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 3264(CX), Y10 + VMOVDQU 3296(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 3328(CX), Y10 + VMOVDQU 3360(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 3392(CX), Y10 + VMOVDQU 3424(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 3456(CX), Y10 + VMOVDQU 3488(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 3520(CX), Y10 + VMOVDQU 3552(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 3584(CX), Y10 + VMOVDQU 3616(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 3648(CX), Y10 + VMOVDQU 3680(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 3712(CX), Y10 + VMOVDQU 3744(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 3776(CX), Y10 + VMOVDQU 3808(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 3840(CX), Y10 + VMOVDQU 3872(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 3904(CX), Y10 + VMOVDQU 3936(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 3968(CX), Y10 + VMOVDQU 4000(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Store 9 outputs + MOVQ (R11), R13 + VMOVDQU Y0, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y1, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y2, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y3, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y4, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxTwo_7x9Xor_loop + VZEROUPPER + +mulAvxTwo_7x9Xor_end: + RET + +// func mulAvxTwo_7x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x10(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 155 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x10_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X10 + VPBROADCASTB X10, Y10 + +mulAvxTwo_7x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y13 + ADDQ $0x20, BX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y0 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y1 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y2 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y3 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y4 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y5 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y6 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y7 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y8 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y9 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y13 + ADDQ $0x20, SI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1024(CX), Y11 + VMOVDQU 1056(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 1088(CX), Y11 + VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y13 + ADDQ $0x20, DI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1280(CX), Y11 + VMOVDQU 1312(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 1344(CX), Y11 + VMOVDQU 1376(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 1408(CX), Y11 + VMOVDQU 1440(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 1472(CX), Y11 + VMOVDQU 1504(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 1536(CX), Y11 + VMOVDQU 1568(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 1600(CX), Y11 + VMOVDQU 1632(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1664(CX), Y11 + VMOVDQU 1696(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 1728(CX), Y11 + VMOVDQU 1760(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1792(CX), Y11 + VMOVDQU 1824(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1856(CX), Y11 + VMOVDQU 1888(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y13 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1920(CX), Y11 + VMOVDQU 1952(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 1984(CX), Y11 + VMOVDQU 2016(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 2048(CX), Y11 + VMOVDQU 2080(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 2112(CX), Y11 + VMOVDQU 2144(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 2176(CX), Y11 + VMOVDQU 2208(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 2240(CX), Y11 + VMOVDQU 2272(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 2304(CX), Y11 + VMOVDQU 2336(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 2368(CX), Y11 + VMOVDQU 2400(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 2432(CX), Y11 + VMOVDQU 2464(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 2496(CX), Y11 + VMOVDQU 2528(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y13 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 2560(CX), Y11 + VMOVDQU 2592(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 2624(CX), Y11 + VMOVDQU 2656(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 2688(CX), Y11 + VMOVDQU 2720(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 2752(CX), Y11 + VMOVDQU 2784(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 2816(CX), Y11 + VMOVDQU 2848(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 2880(CX), Y11 + VMOVDQU 2912(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 2944(CX), Y11 + VMOVDQU 2976(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 3008(CX), Y11 + VMOVDQU 3040(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 3072(CX), Y11 + VMOVDQU 3104(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 3136(CX), Y11 + VMOVDQU 3168(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y13 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3200(CX), Y11 + VMOVDQU 3232(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 3264(CX), Y11 + VMOVDQU 3296(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 3328(CX), Y11 + VMOVDQU 3360(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 3392(CX), Y11 + VMOVDQU 3424(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 3456(CX), Y11 + VMOVDQU 3488(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 3520(CX), Y11 + VMOVDQU 3552(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 3584(CX), Y11 + VMOVDQU 3616(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 3648(CX), Y11 + VMOVDQU 3680(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 3712(CX), Y11 + VMOVDQU 3744(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 3776(CX), Y11 + VMOVDQU 3808(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3840(CX), Y11 + VMOVDQU 3872(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 3904(CX), Y11 + VMOVDQU 3936(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 3968(CX), Y11 + VMOVDQU 4000(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 4032(CX), Y11 + VMOVDQU 4064(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 4096(CX), Y11 + VMOVDQU 4128(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 4160(CX), Y11 + VMOVDQU 4192(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 4224(CX), Y11 + VMOVDQU 4256(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 4288(CX), Y11 + VMOVDQU 4320(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 4352(CX), Y11 + VMOVDQU 4384(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 4416(CX), Y11 + VMOVDQU 4448(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Store 10 outputs + MOVQ (R11), R13 + VMOVDQU Y0, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y1, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y2, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y3, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y4, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 216(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxTwo_7x10_loop + VZEROUPPER + +mulAvxTwo_7x10_end: + RET + +// func mulAvxTwo_7x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x10Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 155 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x10Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X10 + VPBROADCASTB X10, Y10 + +mulAvxTwo_7x10Xor_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y13 + ADDQ $0x20, BX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y0 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y1 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y2 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y3 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y4 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y5 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y6 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + MOVQ 168(R11), R13 + VMOVDQU (R13)(R12*1), Y7 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + MOVQ 192(R11), R13 + VMOVDQU (R13)(R12*1), Y8 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + MOVQ 216(R11), R13 + VMOVDQU (R13)(R12*1), Y9 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y13 + ADDQ $0x20, SI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1024(CX), Y11 + VMOVDQU 1056(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 1088(CX), Y11 + VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y13 + ADDQ $0x20, DI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1280(CX), Y11 + VMOVDQU 1312(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 1344(CX), Y11 + VMOVDQU 1376(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 1408(CX), Y11 + VMOVDQU 1440(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 1472(CX), Y11 + VMOVDQU 1504(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 1536(CX), Y11 + VMOVDQU 1568(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 1600(CX), Y11 + VMOVDQU 1632(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1664(CX), Y11 + VMOVDQU 1696(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 1728(CX), Y11 + VMOVDQU 1760(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1792(CX), Y11 + VMOVDQU 1824(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1856(CX), Y11 + VMOVDQU 1888(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y13 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1920(CX), Y11 + VMOVDQU 1952(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 1984(CX), Y11 + VMOVDQU 2016(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 2048(CX), Y11 + VMOVDQU 2080(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 2112(CX), Y11 + VMOVDQU 2144(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 2176(CX), Y11 + VMOVDQU 2208(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 2240(CX), Y11 + VMOVDQU 2272(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 2304(CX), Y11 + VMOVDQU 2336(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 2368(CX), Y11 + VMOVDQU 2400(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 2432(CX), Y11 + VMOVDQU 2464(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 2496(CX), Y11 + VMOVDQU 2528(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y13 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 2560(CX), Y11 + VMOVDQU 2592(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 2624(CX), Y11 + VMOVDQU 2656(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 2688(CX), Y11 + VMOVDQU 2720(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 2752(CX), Y11 + VMOVDQU 2784(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 2816(CX), Y11 + VMOVDQU 2848(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 2880(CX), Y11 + VMOVDQU 2912(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 2944(CX), Y11 + VMOVDQU 2976(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 3008(CX), Y11 + VMOVDQU 3040(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 3072(CX), Y11 + VMOVDQU 3104(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 3136(CX), Y11 + VMOVDQU 3168(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y13 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3200(CX), Y11 + VMOVDQU 3232(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 3264(CX), Y11 + VMOVDQU 3296(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 3328(CX), Y11 + VMOVDQU 3360(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 3392(CX), Y11 + VMOVDQU 3424(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 3456(CX), Y11 + VMOVDQU 3488(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 3520(CX), Y11 + VMOVDQU 3552(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 3584(CX), Y11 + VMOVDQU 3616(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 3648(CX), Y11 + VMOVDQU 3680(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 3712(CX), Y11 + VMOVDQU 3744(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 3776(CX), Y11 + VMOVDQU 3808(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3840(CX), Y11 + VMOVDQU 3872(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 3904(CX), Y11 + VMOVDQU 3936(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 3968(CX), Y11 + VMOVDQU 4000(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 4032(CX), Y11 + VMOVDQU 4064(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 4096(CX), Y11 + VMOVDQU 4128(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 4160(CX), Y11 + VMOVDQU 4192(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 4224(CX), Y11 + VMOVDQU 4256(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 4288(CX), Y11 + VMOVDQU 4320(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 4352(CX), Y11 + VMOVDQU 4384(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 4416(CX), Y11 + VMOVDQU 4448(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Store 10 outputs + MOVQ (R11), R13 + VMOVDQU Y0, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y1, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y2, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y3, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y4, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 216(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxTwo_7x10Xor_loop + VZEROUPPER + +mulAvxTwo_7x10Xor_end: + RET + +// func mulAvxTwo_8x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x1(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x1_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ (R12), R12 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X1 + VPBROADCASTB X1, Y1 + +mulAvxTwo_8x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y4 + ADDQ $0x20, BX + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU (CX), Y2 + VMOVDQU 32(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y0 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y4 + ADDQ $0x20, SI + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y4 + ADDQ $0x20, DI + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 128(CX), Y2 + VMOVDQU 160(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (R8), Y4 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 192(CX), Y2 + VMOVDQU 224(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R9), Y4 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 256(CX), Y2 + VMOVDQU 288(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R10), Y4 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 320(CX), Y2 + VMOVDQU 352(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R11), Y4 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 384(CX), Y2 + VMOVDQU 416(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (DX), Y4 + ADDQ $0x20, DX + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 448(CX), Y2 + VMOVDQU 480(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Store 1 outputs + VMOVDQU Y0, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_8x1_loop + VZEROUPPER + +mulAvxTwo_8x1_end: + RET + +// func mulAvxTwo_8x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x1_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_8x1_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R12 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_8x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y0 + VPXOR Y5, Y6, Y1 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU (R8), Y6 + VMOVDQU 32(R8), Y5 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU (R9), Y6 + VMOVDQU 32(R9), Y5 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU (R10), Y6 + VMOVDQU 32(R10), Y5 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU (R11), Y6 + VMOVDQU 32(R11), Y5 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Store 1 outputs + VMOVDQU Y0, (R12) + VMOVDQU Y1, 32(R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_8x1_64_loop + VZEROUPPER + +mulAvxTwo_8x1_64_end: + RET + +// func mulAvxTwo_8x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x1Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x1Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ (R12), R12 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X1 + VPBROADCASTB X1, Y1 + +mulAvxTwo_8x1Xor_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y4 + ADDQ $0x20, BX + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU (R12), Y0 + VMOVDQU (CX), Y2 + VMOVDQU 32(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y4 + ADDQ $0x20, SI + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y4 + ADDQ $0x20, DI + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 128(CX), Y2 + VMOVDQU 160(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (R8), Y4 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 192(CX), Y2 + VMOVDQU 224(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R9), Y4 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 256(CX), Y2 + VMOVDQU 288(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R10), Y4 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 320(CX), Y2 + VMOVDQU 352(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R11), Y4 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 384(CX), Y2 + VMOVDQU 416(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (DX), Y4 + ADDQ $0x20, DX + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 448(CX), Y2 + VMOVDQU 480(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Store 1 outputs + VMOVDQU Y0, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_8x1Xor_loop + VZEROUPPER + +mulAvxTwo_8x1Xor_end: + RET + +// func mulAvxTwo_8x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x1_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_8x1_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R12 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_8x1_64Xor_loop: + // Load 1 outputs + VMOVDQU (R12), Y0 + VMOVDQU 32(R12), Y1 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU (R8), Y6 + VMOVDQU 32(R8), Y5 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU (R9), Y6 + VMOVDQU 32(R9), Y5 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU (R10), Y6 + VMOVDQU 32(R10), Y5 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU (R11), Y6 + VMOVDQU 32(R11), Y5 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Store 1 outputs + VMOVDQU Y0, (R12) + VMOVDQU Y1, 32(R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_8x1_64Xor_loop + VZEROUPPER + +mulAvxTwo_8x1_64Xor_end: + RET + +// func mulAvxTwo_8x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x2(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 39 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x2_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R12 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + ADDQ R14, R12 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_8x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y5 + ADDQ $0x20, BX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y0 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y1 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y5 + ADDQ $0x20, SI + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y5 + ADDQ $0x20, DI + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y5 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y5 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 512(CX), Y3 + VMOVDQU 544(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 576(CX), Y3 + VMOVDQU 608(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y5 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 640(CX), Y3 + VMOVDQU 672(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 704(CX), Y3 + VMOVDQU 736(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y5 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 768(CX), Y3 + VMOVDQU 800(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 832(CX), Y3 + VMOVDQU 864(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (DX), Y5 + ADDQ $0x20, DX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 896(CX), Y3 + VMOVDQU 928(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 960(CX), Y3 + VMOVDQU 992(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Store 2 outputs + VMOVDQU Y0, (R13) + ADDQ $0x20, R13 + VMOVDQU Y1, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_8x2_loop + VZEROUPPER + +mulAvxTwo_8x2_end: + RET + +// func mulAvxTwo_8x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x2_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 73 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_8x2_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R12 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + ADDQ R14, R12 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_8x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y0 + VPXOR Y7, Y8, Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y2 + VPXOR Y7, Y8, Y3 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y11 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y9 + VMOVDQU 32(R9), Y11 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y9 + VMOVDQU 32(R10), Y11 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y9 + VMOVDQU 32(R11), Y11 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Store 2 outputs + VMOVDQU Y0, (R13) + VMOVDQU Y1, 32(R13) + ADDQ $0x40, R13 + VMOVDQU Y2, (R12) + VMOVDQU Y3, 32(R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_8x2_64_loop + VZEROUPPER + +mulAvxTwo_8x2_64_end: + RET + +// func mulAvxTwo_8x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x2Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 39 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x2Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R12 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + ADDQ R14, R12 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_8x2Xor_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y5 + ADDQ $0x20, BX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU (R13), Y0 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU (R12), Y1 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y5 + ADDQ $0x20, SI + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y5 + ADDQ $0x20, DI + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y5 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y5 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 512(CX), Y3 + VMOVDQU 544(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 576(CX), Y3 + VMOVDQU 608(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y5 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 640(CX), Y3 + VMOVDQU 672(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 704(CX), Y3 + VMOVDQU 736(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y5 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 768(CX), Y3 + VMOVDQU 800(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 832(CX), Y3 + VMOVDQU 864(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (DX), Y5 + ADDQ $0x20, DX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 896(CX), Y3 + VMOVDQU 928(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 960(CX), Y3 + VMOVDQU 992(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Store 2 outputs + VMOVDQU Y0, (R13) + ADDQ $0x20, R13 + VMOVDQU Y1, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_8x2Xor_loop + VZEROUPPER + +mulAvxTwo_8x2Xor_end: + RET + +// func mulAvxTwo_8x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x2_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 73 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_8x2_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R12 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + ADDQ R14, R12 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_8x2_64Xor_loop: + // Load 2 outputs + VMOVDQU (R13), Y0 + VMOVDQU 32(R13), Y1 + VMOVDQU (R12), Y2 + VMOVDQU 32(R12), Y3 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y11 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y9 + VMOVDQU 32(R9), Y11 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y9 + VMOVDQU 32(R10), Y11 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y9 + VMOVDQU 32(R11), Y11 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Store 2 outputs + VMOVDQU Y0, (R13) + VMOVDQU Y1, 32(R13) + ADDQ $0x40, R13 + VMOVDQU Y2, (R12) + VMOVDQU Y3, 32(R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_8x2_64Xor_loop + VZEROUPPER + +mulAvxTwo_8x2_64Xor_end: + RET + +// func mulAvxTwo_8x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x3(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x3_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R12 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R12 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X3 + VPBROADCASTB X3, Y3 + +mulAvxTwo_8x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y6 + ADDQ $0x20, BX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU (CX), Y4 + VMOVDQU 32(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y0 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y1 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y2 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y6 + ADDQ $0x20, SI + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 192(CX), Y4 + VMOVDQU 224(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 256(CX), Y4 + VMOVDQU 288(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 320(CX), Y4 + VMOVDQU 352(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y6 + ADDQ $0x20, DI + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 384(CX), Y4 + VMOVDQU 416(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 448(CX), Y4 + VMOVDQU 480(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 512(CX), Y4 + VMOVDQU 544(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y6 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 576(CX), Y4 + VMOVDQU 608(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 640(CX), Y4 + VMOVDQU 672(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 704(CX), Y4 + VMOVDQU 736(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y6 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 768(CX), Y4 + VMOVDQU 800(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 832(CX), Y4 + VMOVDQU 864(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 896(CX), Y4 + VMOVDQU 928(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y6 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 960(CX), Y4 + VMOVDQU 992(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1024(CX), Y4 + VMOVDQU 1056(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1088(CX), Y4 + VMOVDQU 1120(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y6 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1152(CX), Y4 + VMOVDQU 1184(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1216(CX), Y4 + VMOVDQU 1248(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1280(CX), Y4 + VMOVDQU 1312(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1344(CX), Y4 + VMOVDQU 1376(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1408(CX), Y4 + VMOVDQU 1440(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1472(CX), Y4 + VMOVDQU 1504(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Store 3 outputs + VMOVDQU Y0, (R13) + ADDQ $0x20, R13 + VMOVDQU Y1, (R14) + ADDQ $0x20, R14 + VMOVDQU Y2, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_8x3_loop + VZEROUPPER + +mulAvxTwo_8x3_end: + RET + +// func mulAvxTwo_8x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x3_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 106 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_8x3_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R12 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R12 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_8x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y0 + VPXOR Y9, Y10, Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y2 + VPXOR Y9, Y10, Y3 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y4 + VPXOR Y9, Y10, Y5 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y11 + VMOVDQU 32(DI), Y13 + ADDQ $0x40, DI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y11 + VMOVDQU 32(R8), Y13 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y11 + VMOVDQU 32(R9), Y13 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y11 + VMOVDQU 32(R10), Y13 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y11 + VMOVDQU 32(R11), Y13 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Store 3 outputs + VMOVDQU Y0, (R13) + VMOVDQU Y1, 32(R13) + ADDQ $0x40, R13 + VMOVDQU Y2, (R14) + VMOVDQU Y3, 32(R14) + ADDQ $0x40, R14 + VMOVDQU Y4, (R12) + VMOVDQU Y5, 32(R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_8x3_64_loop + VZEROUPPER + +mulAvxTwo_8x3_64_end: + RET + +// func mulAvxTwo_8x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x3Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x3Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R12 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R12 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X3 + VPBROADCASTB X3, Y3 + +mulAvxTwo_8x3Xor_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y6 + ADDQ $0x20, BX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU (R13), Y0 + VMOVDQU (CX), Y4 + VMOVDQU 32(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU (R14), Y1 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU (R12), Y2 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y6 + ADDQ $0x20, SI + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 192(CX), Y4 + VMOVDQU 224(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 256(CX), Y4 + VMOVDQU 288(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 320(CX), Y4 + VMOVDQU 352(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y6 + ADDQ $0x20, DI + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 384(CX), Y4 + VMOVDQU 416(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 448(CX), Y4 + VMOVDQU 480(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 512(CX), Y4 + VMOVDQU 544(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y6 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 576(CX), Y4 + VMOVDQU 608(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 640(CX), Y4 + VMOVDQU 672(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 704(CX), Y4 + VMOVDQU 736(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y6 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 768(CX), Y4 + VMOVDQU 800(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 832(CX), Y4 + VMOVDQU 864(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 896(CX), Y4 + VMOVDQU 928(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y6 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 960(CX), Y4 + VMOVDQU 992(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1024(CX), Y4 + VMOVDQU 1056(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1088(CX), Y4 + VMOVDQU 1120(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y6 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1152(CX), Y4 + VMOVDQU 1184(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1216(CX), Y4 + VMOVDQU 1248(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1280(CX), Y4 + VMOVDQU 1312(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1344(CX), Y4 + VMOVDQU 1376(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1408(CX), Y4 + VMOVDQU 1440(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1472(CX), Y4 + VMOVDQU 1504(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Store 3 outputs + VMOVDQU Y0, (R13) + ADDQ $0x20, R13 + VMOVDQU Y1, (R14) + ADDQ $0x20, R14 + VMOVDQU Y2, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_8x3Xor_loop + VZEROUPPER + +mulAvxTwo_8x3Xor_end: + RET + +// func mulAvxTwo_8x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x3_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 106 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_8x3_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R12 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R12 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_8x3_64Xor_loop: + // Load 3 outputs + VMOVDQU (R13), Y0 + VMOVDQU 32(R13), Y1 + VMOVDQU (R14), Y2 + VMOVDQU 32(R14), Y3 + VMOVDQU (R12), Y4 + VMOVDQU 32(R12), Y5 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y11 + VMOVDQU 32(DI), Y13 + ADDQ $0x40, DI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y11 + VMOVDQU 32(R8), Y13 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y11 + VMOVDQU 32(R9), Y13 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y11 + VMOVDQU 32(R10), Y13 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y11 + VMOVDQU 32(R11), Y13 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Store 3 outputs + VMOVDQU Y0, (R13) + VMOVDQU Y1, 32(R13) + ADDQ $0x40, R13 + VMOVDQU Y2, (R14) + VMOVDQU Y3, 32(R14) + ADDQ $0x40, R14 + VMOVDQU Y4, (R12) + VMOVDQU Y5, 32(R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_8x3_64Xor_loop + VZEROUPPER + +mulAvxTwo_8x3_64Xor_end: + RET + +// func mulAvxTwo_8x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x4(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 73 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x4_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_8x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y0 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y1 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y2 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y3 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y7 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y7 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1280(CX), Y5 + VMOVDQU 1312(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1344(CX), Y5 + VMOVDQU 1376(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1408(CX), Y5 + VMOVDQU 1440(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1472(CX), Y5 + VMOVDQU 1504(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y7 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1536(CX), Y5 + VMOVDQU 1568(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1600(CX), Y5 + VMOVDQU 1632(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1664(CX), Y5 + VMOVDQU 1696(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1728(CX), Y5 + VMOVDQU 1760(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1792(CX), Y5 + VMOVDQU 1824(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1856(CX), Y5 + VMOVDQU 1888(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1920(CX), Y5 + VMOVDQU 1952(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1984(CX), Y5 + VMOVDQU 2016(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Store 4 outputs + VMOVDQU Y0, (R13) + ADDQ $0x20, R13 + VMOVDQU Y1, (R14) + ADDQ $0x20, R14 + VMOVDQU Y2, (R15) + ADDQ $0x20, R15 + VMOVDQU Y3, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_8x4_loop + VZEROUPPER + +mulAvxTwo_8x4_end: + RET + +// func mulAvxTwo_8x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x4Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 73 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x4Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_8x4Xor_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (R13), Y0 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU (R14), Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU (R15), Y2 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU (R12), Y3 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y7 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y7 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1280(CX), Y5 + VMOVDQU 1312(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1344(CX), Y5 + VMOVDQU 1376(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1408(CX), Y5 + VMOVDQU 1440(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1472(CX), Y5 + VMOVDQU 1504(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y7 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1536(CX), Y5 + VMOVDQU 1568(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1600(CX), Y5 + VMOVDQU 1632(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1664(CX), Y5 + VMOVDQU 1696(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1728(CX), Y5 + VMOVDQU 1760(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1792(CX), Y5 + VMOVDQU 1824(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1856(CX), Y5 + VMOVDQU 1888(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1920(CX), Y5 + VMOVDQU 1952(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1984(CX), Y5 + VMOVDQU 2016(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Store 4 outputs + VMOVDQU Y0, (R13) + ADDQ $0x20, R13 + VMOVDQU Y1, (R14) + ADDQ $0x20, R14 + VMOVDQU Y2, (R15) + ADDQ $0x20, R15 + VMOVDQU Y3, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_8x4Xor_loop + VZEROUPPER + +mulAvxTwo_8x4Xor_end: + RET + +// func mulAvxTwo_8x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x5(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 90 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x5_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), AX + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X5 + VPBROADCASTB X5, Y5 + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxTwo_8x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y0 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y1 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y2 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y3 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y4 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1280(CX), Y6 + VMOVDQU 1312(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1344(CX), Y6 + VMOVDQU 1376(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1408(CX), Y6 + VMOVDQU 1440(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1472(CX), Y6 + VMOVDQU 1504(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1536(CX), Y6 + VMOVDQU 1568(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R9), Y8 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1600(CX), Y6 + VMOVDQU 1632(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1664(CX), Y6 + VMOVDQU 1696(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1728(CX), Y6 + VMOVDQU 1760(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1792(CX), Y6 + VMOVDQU 1824(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1856(CX), Y6 + VMOVDQU 1888(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R10), Y8 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1920(CX), Y6 + VMOVDQU 1952(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1984(CX), Y6 + VMOVDQU 2016(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 2048(CX), Y6 + VMOVDQU 2080(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 2112(CX), Y6 + VMOVDQU 2144(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 2176(CX), Y6 + VMOVDQU 2208(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (AX), Y8 + ADDQ $0x20, AX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 2240(CX), Y6 + VMOVDQU 2272(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 2304(CX), Y6 + VMOVDQU 2336(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 2368(CX), Y6 + VMOVDQU 2400(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 2432(CX), Y6 + VMOVDQU 2464(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 2496(CX), Y6 + VMOVDQU 2528(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Store 5 outputs + VMOVDQU Y0, (R12) + ADDQ $0x20, R12 + VMOVDQU Y1, (R13) + ADDQ $0x20, R13 + VMOVDQU Y2, (R14) + ADDQ $0x20, R14 + VMOVDQU Y3, (R15) + ADDQ $0x20, R15 + VMOVDQU Y4, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ BP + JNZ mulAvxTwo_8x5_loop + VZEROUPPER + +mulAvxTwo_8x5_end: + RET + +// func mulAvxTwo_8x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x5Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 90 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x5Xor_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), AX + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X5 + VPBROADCASTB X5, Y5 + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxTwo_8x5Xor_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (R12), Y0 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU (R13), Y1 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU (R14), Y2 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU (R15), Y3 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU (R11), Y4 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1280(CX), Y6 + VMOVDQU 1312(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1344(CX), Y6 + VMOVDQU 1376(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1408(CX), Y6 + VMOVDQU 1440(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1472(CX), Y6 + VMOVDQU 1504(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1536(CX), Y6 + VMOVDQU 1568(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R9), Y8 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1600(CX), Y6 + VMOVDQU 1632(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1664(CX), Y6 + VMOVDQU 1696(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1728(CX), Y6 + VMOVDQU 1760(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1792(CX), Y6 + VMOVDQU 1824(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1856(CX), Y6 + VMOVDQU 1888(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R10), Y8 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1920(CX), Y6 + VMOVDQU 1952(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1984(CX), Y6 + VMOVDQU 2016(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 2048(CX), Y6 + VMOVDQU 2080(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 2112(CX), Y6 + VMOVDQU 2144(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 2176(CX), Y6 + VMOVDQU 2208(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (AX), Y8 + ADDQ $0x20, AX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 2240(CX), Y6 + VMOVDQU 2272(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 2304(CX), Y6 + VMOVDQU 2336(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 2368(CX), Y6 + VMOVDQU 2400(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 2432(CX), Y6 + VMOVDQU 2464(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 2496(CX), Y6 + VMOVDQU 2528(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Store 5 outputs + VMOVDQU Y0, (R12) + ADDQ $0x20, R12 + VMOVDQU Y1, (R13) + ADDQ $0x20, R13 + VMOVDQU Y2, (R14) + ADDQ $0x20, R14 + VMOVDQU Y3, (R15) + ADDQ $0x20, R15 + VMOVDQU Y4, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ BP + JNZ mulAvxTwo_8x5Xor_loop + VZEROUPPER + +mulAvxTwo_8x5Xor_end: + RET + +// func mulAvxTwo_8x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x6(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 107 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x6_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_8x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y0 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y1 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y2 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y3 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y4 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y5 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y9 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1728(CX), Y7 + VMOVDQU 1760(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1792(CX), Y7 + VMOVDQU 1824(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1856(CX), Y7 + VMOVDQU 1888(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y9 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1920(CX), Y7 + VMOVDQU 1952(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1984(CX), Y7 + VMOVDQU 2016(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2048(CX), Y7 + VMOVDQU 2080(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2112(CX), Y7 + VMOVDQU 2144(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2176(CX), Y7 + VMOVDQU 2208(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 2240(CX), Y7 + VMOVDQU 2272(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y9 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2304(CX), Y7 + VMOVDQU 2336(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 2368(CX), Y7 + VMOVDQU 2400(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2432(CX), Y7 + VMOVDQU 2464(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2496(CX), Y7 + VMOVDQU 2528(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2560(CX), Y7 + VMOVDQU 2592(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 2624(CX), Y7 + VMOVDQU 2656(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2688(CX), Y7 + VMOVDQU 2720(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 2752(CX), Y7 + VMOVDQU 2784(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2816(CX), Y7 + VMOVDQU 2848(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2880(CX), Y7 + VMOVDQU 2912(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2944(CX), Y7 + VMOVDQU 2976(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 3008(CX), Y7 + VMOVDQU 3040(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Store 6 outputs + MOVQ (R12), R14 + VMOVDQU Y0, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y1, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y2, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y3, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxTwo_8x6_loop + VZEROUPPER + +mulAvxTwo_8x6_end: + RET + +// func mulAvxTwo_8x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x6Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 107 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x6Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_8x6Xor_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y0 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y2 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y3 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y4 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y5 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y9 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1728(CX), Y7 + VMOVDQU 1760(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1792(CX), Y7 + VMOVDQU 1824(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1856(CX), Y7 + VMOVDQU 1888(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y9 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1920(CX), Y7 + VMOVDQU 1952(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1984(CX), Y7 + VMOVDQU 2016(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2048(CX), Y7 + VMOVDQU 2080(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2112(CX), Y7 + VMOVDQU 2144(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2176(CX), Y7 + VMOVDQU 2208(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 2240(CX), Y7 + VMOVDQU 2272(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y9 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2304(CX), Y7 + VMOVDQU 2336(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 2368(CX), Y7 + VMOVDQU 2400(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2432(CX), Y7 + VMOVDQU 2464(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2496(CX), Y7 + VMOVDQU 2528(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2560(CX), Y7 + VMOVDQU 2592(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 2624(CX), Y7 + VMOVDQU 2656(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2688(CX), Y7 + VMOVDQU 2720(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 2752(CX), Y7 + VMOVDQU 2784(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2816(CX), Y7 + VMOVDQU 2848(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2880(CX), Y7 + VMOVDQU 2912(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2944(CX), Y7 + VMOVDQU 2976(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 3008(CX), Y7 + VMOVDQU 3040(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Store 6 outputs + MOVQ (R12), R14 + VMOVDQU Y0, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y1, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y2, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y3, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxTwo_8x6Xor_loop + VZEROUPPER + +mulAvxTwo_8x6Xor_end: + RET + +// func mulAvxTwo_8x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x7(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 124 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x7_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X7 + VPBROADCASTB X7, Y7 + +mulAvxTwo_8x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y0 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y1 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y2 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y3 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y4 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y5 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y6 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1472(CX), Y8 + VMOVDQU 1504(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1728(CX), Y8 + VMOVDQU 1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y10 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1792(CX), Y8 + VMOVDQU 1824(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1856(CX), Y8 + VMOVDQU 1888(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1920(CX), Y8 + VMOVDQU 1952(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1984(CX), Y8 + VMOVDQU 2016(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2048(CX), Y8 + VMOVDQU 2080(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2112(CX), Y8 + VMOVDQU 2144(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2176(CX), Y8 + VMOVDQU 2208(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y10 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2240(CX), Y8 + VMOVDQU 2272(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 2304(CX), Y8 + VMOVDQU 2336(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 2368(CX), Y8 + VMOVDQU 2400(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 2432(CX), Y8 + VMOVDQU 2464(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2496(CX), Y8 + VMOVDQU 2528(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2560(CX), Y8 + VMOVDQU 2592(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2624(CX), Y8 + VMOVDQU 2656(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y10 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2688(CX), Y8 + VMOVDQU 2720(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 2752(CX), Y8 + VMOVDQU 2784(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 2816(CX), Y8 + VMOVDQU 2848(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 2880(CX), Y8 + VMOVDQU 2912(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2944(CX), Y8 + VMOVDQU 2976(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 3008(CX), Y8 + VMOVDQU 3040(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 3072(CX), Y8 + VMOVDQU 3104(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 3136(CX), Y8 + VMOVDQU 3168(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 3200(CX), Y8 + VMOVDQU 3232(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 3264(CX), Y8 + VMOVDQU 3296(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 3328(CX), Y8 + VMOVDQU 3360(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 3392(CX), Y8 + VMOVDQU 3424(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 3456(CX), Y8 + VMOVDQU 3488(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 3520(CX), Y8 + VMOVDQU 3552(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Store 7 outputs + MOVQ (R12), R14 + VMOVDQU Y0, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y1, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y2, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y3, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxTwo_8x7_loop + VZEROUPPER + +mulAvxTwo_8x7_end: + RET + +// func mulAvxTwo_8x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x7Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 124 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x7Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X7 + VPBROADCASTB X7, Y7 + +mulAvxTwo_8x7Xor_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y0 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y1 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y2 + VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 192(CX), Y8 - VMOVDQU 224(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y3 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y4 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y5 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y6 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1472(CX), Y8 + VMOVDQU 1504(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1728(CX), Y8 + VMOVDQU 1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y10 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1792(CX), Y8 + VMOVDQU 1824(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1856(CX), Y8 + VMOVDQU 1888(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1920(CX), Y8 + VMOVDQU 1952(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1984(CX), Y8 + VMOVDQU 2016(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2048(CX), Y8 + VMOVDQU 2080(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2112(CX), Y8 + VMOVDQU 2144(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2176(CX), Y8 + VMOVDQU 2208(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y10 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2240(CX), Y8 + VMOVDQU 2272(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 2304(CX), Y8 + VMOVDQU 2336(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 2368(CX), Y8 + VMOVDQU 2400(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 2432(CX), Y8 + VMOVDQU 2464(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2496(CX), Y8 + VMOVDQU 2528(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2560(CX), Y8 + VMOVDQU 2592(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2624(CX), Y8 + VMOVDQU 2656(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y10 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2688(CX), Y8 + VMOVDQU 2720(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 2752(CX), Y8 + VMOVDQU 2784(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 2816(CX), Y8 + VMOVDQU 2848(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 2880(CX), Y8 + VMOVDQU 2912(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2944(CX), Y8 + VMOVDQU 2976(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 3008(CX), Y8 + VMOVDQU 3040(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 3072(CX), Y8 + VMOVDQU 3104(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 3136(CX), Y8 + VMOVDQU 3168(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 3200(CX), Y8 + VMOVDQU 3232(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 3264(CX), Y8 + VMOVDQU 3296(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 3328(CX), Y8 + VMOVDQU 3360(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 3392(CX), Y8 + VMOVDQU 3424(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 3456(CX), Y8 + VMOVDQU 3488(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 3520(CX), Y8 + VMOVDQU 3552(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Store 7 outputs + MOVQ (R12), R14 + VMOVDQU Y0, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y1, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y2, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y3, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxTwo_8x7Xor_loop + VZEROUPPER + +mulAvxTwo_8x7Xor_end: + RET + +// func mulAvxTwo_8x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x8(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 141 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x8_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X8 + VPBROADCASTB X8, Y8 + +mulAvxTwo_8x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y0 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y1 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y2 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y3 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y4 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y5 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y6 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y7 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y11 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1600(CX), Y9 + VMOVDQU 1632(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1664(CX), Y9 + VMOVDQU 1696(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y11 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2048(CX), Y9 + VMOVDQU 2080(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2112(CX), Y9 + VMOVDQU 2144(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2176(CX), Y9 + VMOVDQU 2208(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2240(CX), Y9 + VMOVDQU 2272(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2304(CX), Y9 + VMOVDQU 2336(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2368(CX), Y9 + VMOVDQU 2400(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2432(CX), Y9 + VMOVDQU 2464(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 2496(CX), Y9 + VMOVDQU 2528(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y11 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2560(CX), Y9 + VMOVDQU 2592(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2624(CX), Y9 + VMOVDQU 2656(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2688(CX), Y9 + VMOVDQU 2720(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2752(CX), Y9 + VMOVDQU 2784(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2816(CX), Y9 + VMOVDQU 2848(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2880(CX), Y9 + VMOVDQU 2912(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2944(CX), Y9 + VMOVDQU 2976(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 3008(CX), Y9 + VMOVDQU 3040(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y11 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3072(CX), Y9 + VMOVDQU 3104(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 3136(CX), Y9 + VMOVDQU 3168(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 3200(CX), Y9 + VMOVDQU 3232(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 3264(CX), Y9 + VMOVDQU 3296(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 3328(CX), Y9 + VMOVDQU 3360(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 3392(CX), Y9 + VMOVDQU 3424(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 3456(CX), Y9 + VMOVDQU 3488(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 3520(CX), Y9 + VMOVDQU 3552(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3584(CX), Y9 + VMOVDQU 3616(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 3648(CX), Y9 + VMOVDQU 3680(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 3712(CX), Y9 + VMOVDQU 3744(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 3776(CX), Y9 + VMOVDQU 3808(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 3840(CX), Y9 + VMOVDQU 3872(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 3904(CX), Y9 + VMOVDQU 3936(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 3968(CX), Y9 + VMOVDQU 4000(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 4032(CX), Y9 + VMOVDQU 4064(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Store 8 outputs + MOVQ (R12), R14 + VMOVDQU Y0, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y1, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y2, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y3, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxTwo_8x8_loop + VZEROUPPER + +mulAvxTwo_8x8_end: + RET + +// func mulAvxTwo_8x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x8Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 141 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x8Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X8 + VPBROADCASTB X8, Y8 + +mulAvxTwo_8x8Xor_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y0 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y1 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y2 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y3 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y4 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y5 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y6 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + MOVQ 168(R12), R14 + VMOVDQU (R14)(R13*1), Y7 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 256(CX), Y8 - VMOVDQU 288(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 320(CX), Y8 - VMOVDQU 352(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 384(CX), Y8 - VMOVDQU 416(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y11 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1600(CX), Y9 + VMOVDQU 1632(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1664(CX), Y9 + VMOVDQU 1696(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y11 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2048(CX), Y9 + VMOVDQU 2080(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2112(CX), Y9 + VMOVDQU 2144(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2176(CX), Y9 + VMOVDQU 2208(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2240(CX), Y9 + VMOVDQU 2272(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2304(CX), Y9 + VMOVDQU 2336(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2368(CX), Y9 + VMOVDQU 2400(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2432(CX), Y9 + VMOVDQU 2464(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 2496(CX), Y9 + VMOVDQU 2528(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y11 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2560(CX), Y9 + VMOVDQU 2592(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2624(CX), Y9 + VMOVDQU 2656(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2688(CX), Y9 + VMOVDQU 2720(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2752(CX), Y9 + VMOVDQU 2784(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2816(CX), Y9 + VMOVDQU 2848(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2880(CX), Y9 + VMOVDQU 2912(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2944(CX), Y9 + VMOVDQU 2976(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 3008(CX), Y9 + VMOVDQU 3040(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y11 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3072(CX), Y9 + VMOVDQU 3104(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 3136(CX), Y9 + VMOVDQU 3168(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 3200(CX), Y9 + VMOVDQU 3232(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 3264(CX), Y9 + VMOVDQU 3296(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 3328(CX), Y9 + VMOVDQU 3360(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 3392(CX), Y9 + VMOVDQU 3424(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 3456(CX), Y9 + VMOVDQU 3488(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 3520(CX), Y9 + VMOVDQU 3552(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3584(CX), Y9 + VMOVDQU 3616(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 3648(CX), Y9 + VMOVDQU 3680(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 3712(CX), Y9 + VMOVDQU 3744(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 3776(CX), Y9 + VMOVDQU 3808(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 3840(CX), Y9 + VMOVDQU 3872(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 3904(CX), Y9 + VMOVDQU 3936(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 3968(CX), Y9 + VMOVDQU 4000(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 4032(CX), Y9 + VMOVDQU 4064(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Store 8 outputs + MOVQ (R12), R14 + VMOVDQU Y0, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y1, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y2, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y3, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxTwo_8x8Xor_loop + VZEROUPPER + +mulAvxTwo_8x8Xor_end: + RET + +// func mulAvxTwo_8x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x9(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 158 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x9_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X9 + VPBROADCASTB X9, Y9 + +mulAvxTwo_8x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y0 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y1 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y2 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y3 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y4 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y5 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y6 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y7 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y8 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1152(CX), Y10 + VMOVDQU 1184(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 1216(CX), Y10 + VMOVDQU 1248(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 1280(CX), Y10 + VMOVDQU 1312(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 1344(CX), Y10 + VMOVDQU 1376(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 1408(CX), Y10 + VMOVDQU 1440(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 1472(CX), Y10 + VMOVDQU 1504(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 1536(CX), Y10 + VMOVDQU 1568(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1600(CX), Y10 + VMOVDQU 1632(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1664(CX), Y10 + VMOVDQU 1696(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y12 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1728(CX), Y10 + VMOVDQU 1760(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 1792(CX), Y10 + VMOVDQU 1824(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 1856(CX), Y10 + VMOVDQU 1888(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 1920(CX), Y10 + VMOVDQU 1952(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 1984(CX), Y10 + VMOVDQU 2016(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 2048(CX), Y10 + VMOVDQU 2080(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 2112(CX), Y10 + VMOVDQU 2144(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 2176(CX), Y10 + VMOVDQU 2208(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 2240(CX), Y10 + VMOVDQU 2272(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y12 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2304(CX), Y10 + VMOVDQU 2336(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 2368(CX), Y10 + VMOVDQU 2400(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 2432(CX), Y10 + VMOVDQU 2464(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 2496(CX), Y10 + VMOVDQU 2528(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 2560(CX), Y10 + VMOVDQU 2592(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 2624(CX), Y10 + VMOVDQU 2656(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 2688(CX), Y10 + VMOVDQU 2720(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 2752(CX), Y10 + VMOVDQU 2784(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 2816(CX), Y10 + VMOVDQU 2848(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y12 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2880(CX), Y10 + VMOVDQU 2912(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 2944(CX), Y10 + VMOVDQU 2976(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 3008(CX), Y10 + VMOVDQU 3040(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 3072(CX), Y10 + VMOVDQU 3104(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 3136(CX), Y10 + VMOVDQU 3168(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 3200(CX), Y10 + VMOVDQU 3232(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 3264(CX), Y10 + VMOVDQU 3296(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 3328(CX), Y10 + VMOVDQU 3360(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 3392(CX), Y10 + VMOVDQU 3424(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y12 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 3456(CX), Y10 + VMOVDQU 3488(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 3520(CX), Y10 + VMOVDQU 3552(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 3584(CX), Y10 + VMOVDQU 3616(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 3648(CX), Y10 + VMOVDQU 3680(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 3712(CX), Y10 + VMOVDQU 3744(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 3776(CX), Y10 + VMOVDQU 3808(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 3840(CX), Y10 + VMOVDQU 3872(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 3904(CX), Y10 + VMOVDQU 3936(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 3968(CX), Y10 + VMOVDQU 4000(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 4032(CX), Y10 + VMOVDQU 4064(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 4096(CX), Y10 + VMOVDQU 4128(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 4160(CX), Y10 + VMOVDQU 4192(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 4224(CX), Y10 + VMOVDQU 4256(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 4288(CX), Y10 + VMOVDQU 4320(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 4352(CX), Y10 + VMOVDQU 4384(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 4416(CX), Y10 + VMOVDQU 4448(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 4480(CX), Y10 + VMOVDQU 4512(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 4544(CX), Y10 + VMOVDQU 4576(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Store 9 outputs + MOVQ (R12), R14 + VMOVDQU Y0, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y1, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y2, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y3, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxTwo_8x9_loop + VZEROUPPER + +mulAvxTwo_8x9_end: + RET + +// func mulAvxTwo_8x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x9Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 158 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x9Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X9 + VPBROADCASTB X9, Y9 + +mulAvxTwo_8x9Xor_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y0 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y1 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y2 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y3 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y4 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y5 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y6 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + MOVQ 168(R12), R14 + VMOVDQU (R14)(R13*1), Y7 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + MOVQ 192(R12), R14 + VMOVDQU (R14)(R13*1), Y8 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1152(CX), Y10 + VMOVDQU 1184(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 1216(CX), Y10 + VMOVDQU 1248(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 1280(CX), Y10 + VMOVDQU 1312(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 1344(CX), Y10 + VMOVDQU 1376(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 1408(CX), Y10 + VMOVDQU 1440(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 1472(CX), Y10 + VMOVDQU 1504(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 1536(CX), Y10 + VMOVDQU 1568(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1600(CX), Y10 + VMOVDQU 1632(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1664(CX), Y10 + VMOVDQU 1696(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y12 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1728(CX), Y10 + VMOVDQU 1760(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 1792(CX), Y10 + VMOVDQU 1824(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 1856(CX), Y10 + VMOVDQU 1888(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 1920(CX), Y10 + VMOVDQU 1952(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 1984(CX), Y10 + VMOVDQU 2016(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 2048(CX), Y10 + VMOVDQU 2080(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 2112(CX), Y10 + VMOVDQU 2144(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 2176(CX), Y10 + VMOVDQU 2208(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 2240(CX), Y10 + VMOVDQU 2272(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y12 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2304(CX), Y10 + VMOVDQU 2336(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 2368(CX), Y10 + VMOVDQU 2400(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 2432(CX), Y10 + VMOVDQU 2464(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 2496(CX), Y10 + VMOVDQU 2528(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 2560(CX), Y10 + VMOVDQU 2592(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 2624(CX), Y10 + VMOVDQU 2656(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 2688(CX), Y10 + VMOVDQU 2720(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 2752(CX), Y10 + VMOVDQU 2784(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 2816(CX), Y10 + VMOVDQU 2848(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y12 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2880(CX), Y10 + VMOVDQU 2912(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 2944(CX), Y10 + VMOVDQU 2976(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 3008(CX), Y10 + VMOVDQU 3040(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 3072(CX), Y10 + VMOVDQU 3104(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 3136(CX), Y10 + VMOVDQU 3168(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 3200(CX), Y10 + VMOVDQU 3232(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 3264(CX), Y10 + VMOVDQU 3296(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 3328(CX), Y10 + VMOVDQU 3360(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 3392(CX), Y10 + VMOVDQU 3424(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y12 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 3456(CX), Y10 + VMOVDQU 3488(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 3520(CX), Y10 + VMOVDQU 3552(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 3584(CX), Y10 + VMOVDQU 3616(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 3648(CX), Y10 + VMOVDQU 3680(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 3712(CX), Y10 + VMOVDQU 3744(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 3776(CX), Y10 + VMOVDQU 3808(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 3840(CX), Y10 + VMOVDQU 3872(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 3904(CX), Y10 + VMOVDQU 3936(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 3968(CX), Y10 + VMOVDQU 4000(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 4032(CX), Y10 + VMOVDQU 4064(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 4096(CX), Y10 + VMOVDQU 4128(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 4160(CX), Y10 + VMOVDQU 4192(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 4224(CX), Y10 + VMOVDQU 4256(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 4288(CX), Y10 + VMOVDQU 4320(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 4352(CX), Y10 + VMOVDQU 4384(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 4416(CX), Y10 + VMOVDQU 4448(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 4480(CX), Y10 + VMOVDQU 4512(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 4544(CX), Y10 + VMOVDQU 4576(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Store 9 outputs + MOVQ (R12), R14 + VMOVDQU Y0, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y1, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y2, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y3, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxTwo_8x9Xor_loop + VZEROUPPER + +mulAvxTwo_8x9Xor_end: + RET + +// func mulAvxTwo_8x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x10(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 175 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x10_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X10 + VPBROADCASTB X10, Y10 + +mulAvxTwo_8x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y13 + ADDQ $0x20, BX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y0 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y1 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y2 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y3 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y4 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y5 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y6 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y7 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y8 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y9 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y13 + ADDQ $0x20, SI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1024(CX), Y11 + VMOVDQU 1056(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 1088(CX), Y11 + VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y13 + ADDQ $0x20, DI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1280(CX), Y11 + VMOVDQU 1312(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 1344(CX), Y11 + VMOVDQU 1376(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 1408(CX), Y11 + VMOVDQU 1440(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 1472(CX), Y11 + VMOVDQU 1504(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 1536(CX), Y11 + VMOVDQU 1568(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 1600(CX), Y11 + VMOVDQU 1632(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1664(CX), Y11 + VMOVDQU 1696(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 1728(CX), Y11 + VMOVDQU 1760(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1792(CX), Y11 + VMOVDQU 1824(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1856(CX), Y11 + VMOVDQU 1888(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y13 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1920(CX), Y11 + VMOVDQU 1952(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 1984(CX), Y11 + VMOVDQU 2016(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 2048(CX), Y11 + VMOVDQU 2080(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 2112(CX), Y11 + VMOVDQU 2144(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 2176(CX), Y11 + VMOVDQU 2208(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 2240(CX), Y11 + VMOVDQU 2272(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 2304(CX), Y11 + VMOVDQU 2336(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 2368(CX), Y11 + VMOVDQU 2400(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 2432(CX), Y11 + VMOVDQU 2464(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 2496(CX), Y11 + VMOVDQU 2528(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y13 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 2560(CX), Y11 + VMOVDQU 2592(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 2624(CX), Y11 + VMOVDQU 2656(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 2688(CX), Y11 + VMOVDQU 2720(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 2752(CX), Y11 + VMOVDQU 2784(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 2816(CX), Y11 + VMOVDQU 2848(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 2880(CX), Y11 + VMOVDQU 2912(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 2944(CX), Y11 + VMOVDQU 2976(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 3008(CX), Y11 + VMOVDQU 3040(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 3072(CX), Y11 + VMOVDQU 3104(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 3136(CX), Y11 + VMOVDQU 3168(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y13 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3200(CX), Y11 + VMOVDQU 3232(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 3264(CX), Y11 + VMOVDQU 3296(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 3328(CX), Y11 + VMOVDQU 3360(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 3392(CX), Y11 + VMOVDQU 3424(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 3456(CX), Y11 + VMOVDQU 3488(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 3520(CX), Y11 + VMOVDQU 3552(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 3584(CX), Y11 + VMOVDQU 3616(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 3648(CX), Y11 + VMOVDQU 3680(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 3712(CX), Y11 + VMOVDQU 3744(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 3776(CX), Y11 + VMOVDQU 3808(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y13 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3840(CX), Y11 + VMOVDQU 3872(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 3904(CX), Y11 + VMOVDQU 3936(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 3968(CX), Y11 + VMOVDQU 4000(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 4032(CX), Y11 + VMOVDQU 4064(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 4096(CX), Y11 + VMOVDQU 4128(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 4160(CX), Y11 + VMOVDQU 4192(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 4224(CX), Y11 + VMOVDQU 4256(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 4288(CX), Y11 + VMOVDQU 4320(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 4352(CX), Y11 + VMOVDQU 4384(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 4416(CX), Y11 + VMOVDQU 4448(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 4480(CX), Y11 + VMOVDQU 4512(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 4544(CX), Y11 + VMOVDQU 4576(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 4608(CX), Y11 + VMOVDQU 4640(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 4672(CX), Y11 + VMOVDQU 4704(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 4736(CX), Y11 + VMOVDQU 4768(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 4800(CX), Y11 + VMOVDQU 4832(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 4864(CX), Y11 + VMOVDQU 4896(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 4928(CX), Y11 + VMOVDQU 4960(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 4992(CX), Y11 + VMOVDQU 5024(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 5056(CX), Y11 + VMOVDQU 5088(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Store 10 outputs + MOVQ (R12), R14 + VMOVDQU Y0, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y1, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y2, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y3, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 216(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxTwo_8x10_loop + VZEROUPPER + +mulAvxTwo_8x10_end: + RET + +// func mulAvxTwo_8x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x10Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 175 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x10Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X10 + VPBROADCASTB X10, Y10 + +mulAvxTwo_8x10Xor_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y13 + ADDQ $0x20, BX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y0 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y1 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y2 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y3 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y4 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y5 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y6 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + MOVQ 168(R12), R14 + VMOVDQU (R14)(R13*1), Y7 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + MOVQ 192(R12), R14 + VMOVDQU (R14)(R13*1), Y8 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + MOVQ 216(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y13 + ADDQ $0x20, SI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1024(CX), Y11 + VMOVDQU 1056(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 1088(CX), Y11 + VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y13 + ADDQ $0x20, DI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1280(CX), Y11 + VMOVDQU 1312(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 1344(CX), Y11 + VMOVDQU 1376(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 1408(CX), Y11 + VMOVDQU 1440(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 1472(CX), Y11 + VMOVDQU 1504(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 1536(CX), Y11 + VMOVDQU 1568(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 1600(CX), Y11 + VMOVDQU 1632(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1664(CX), Y11 + VMOVDQU 1696(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 1728(CX), Y11 + VMOVDQU 1760(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1792(CX), Y11 + VMOVDQU 1824(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1856(CX), Y11 + VMOVDQU 1888(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y13 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1920(CX), Y11 + VMOVDQU 1952(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 1984(CX), Y11 + VMOVDQU 2016(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 2048(CX), Y11 + VMOVDQU 2080(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 2112(CX), Y11 + VMOVDQU 2144(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 2176(CX), Y11 + VMOVDQU 2208(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 2240(CX), Y11 + VMOVDQU 2272(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 2304(CX), Y11 + VMOVDQU 2336(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 2368(CX), Y11 + VMOVDQU 2400(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 2432(CX), Y11 + VMOVDQU 2464(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 2496(CX), Y11 + VMOVDQU 2528(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y13 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 2560(CX), Y11 + VMOVDQU 2592(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 2624(CX), Y11 + VMOVDQU 2656(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 2688(CX), Y11 + VMOVDQU 2720(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 2752(CX), Y11 + VMOVDQU 2784(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 2816(CX), Y11 + VMOVDQU 2848(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 2880(CX), Y11 + VMOVDQU 2912(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 2944(CX), Y11 + VMOVDQU 2976(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 3008(CX), Y11 + VMOVDQU 3040(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 3072(CX), Y11 + VMOVDQU 3104(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 3136(CX), Y11 + VMOVDQU 3168(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y13 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3200(CX), Y11 + VMOVDQU 3232(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 3264(CX), Y11 + VMOVDQU 3296(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 3328(CX), Y11 + VMOVDQU 3360(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 3392(CX), Y11 + VMOVDQU 3424(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 3456(CX), Y11 + VMOVDQU 3488(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 3520(CX), Y11 + VMOVDQU 3552(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 3584(CX), Y11 + VMOVDQU 3616(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 3648(CX), Y11 + VMOVDQU 3680(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 3712(CX), Y11 + VMOVDQU 3744(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 3776(CX), Y11 + VMOVDQU 3808(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y13 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3840(CX), Y11 + VMOVDQU 3872(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 3904(CX), Y11 + VMOVDQU 3936(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 3968(CX), Y11 + VMOVDQU 4000(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 4032(CX), Y11 + VMOVDQU 4064(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 4096(CX), Y11 + VMOVDQU 4128(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 4160(CX), Y11 + VMOVDQU 4192(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 4224(CX), Y11 + VMOVDQU 4256(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 4288(CX), Y11 + VMOVDQU 4320(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 4352(CX), Y11 + VMOVDQU 4384(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 4416(CX), Y11 + VMOVDQU 4448(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 4480(CX), Y11 + VMOVDQU 4512(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 4544(CX), Y11 + VMOVDQU 4576(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 4608(CX), Y11 + VMOVDQU 4640(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 4672(CX), Y11 + VMOVDQU 4704(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 4736(CX), Y11 + VMOVDQU 4768(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 4800(CX), Y11 + VMOVDQU 4832(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 4864(CX), Y11 + VMOVDQU 4896(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 4928(CX), Y11 + VMOVDQU 4960(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 4992(CX), Y11 + VMOVDQU 5024(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 5056(CX), Y11 + VMOVDQU 5088(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Store 10 outputs + MOVQ (R12), R14 + VMOVDQU Y0, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y1, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y2, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y3, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 216(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxTwo_8x10Xor_loop + VZEROUPPER + +mulAvxTwo_8x10Xor_end: + RET + +// func mulAvxTwo_9x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_9x1(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x1_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ (R13), R13 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X1 + VPBROADCASTB X1, Y1 + +mulAvxTwo_9x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y4 + ADDQ $0x20, BX + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU (CX), Y2 + VMOVDQU 32(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y0 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y4 + ADDQ $0x20, SI + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y4 + ADDQ $0x20, DI + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 128(CX), Y2 + VMOVDQU 160(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (R8), Y4 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 192(CX), Y2 + VMOVDQU 224(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R9), Y4 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 256(CX), Y2 + VMOVDQU 288(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R10), Y4 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 320(CX), Y2 + VMOVDQU 352(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R11), Y4 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 384(CX), Y2 + VMOVDQU 416(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (R12), Y4 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 448(CX), Y2 + VMOVDQU 480(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 8 to 1 outputs + VMOVDQU (DX), Y4 + ADDQ $0x20, DX + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 512(CX), Y2 + VMOVDQU 544(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Store 1 outputs + VMOVDQU Y0, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_9x1_loop + VZEROUPPER + +mulAvxTwo_9x1_end: + RET + +// func mulAvxTwo_9x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_9x1_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_9x1_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R13 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_9x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y0 + VPXOR Y5, Y6, Y1 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU (R8), Y6 + VMOVDQU 32(R8), Y5 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU (R9), Y6 + VMOVDQU 32(R9), Y5 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU (R10), Y6 + VMOVDQU 32(R10), Y5 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU (R11), Y6 + VMOVDQU 32(R11), Y5 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU (R12), Y6 + VMOVDQU 32(R12), Y5 + ADDQ $0x40, R12 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 8 to 1 outputs + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 512(CX), Y3 + VMOVDQU 544(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Store 1 outputs + VMOVDQU Y0, (R13) + VMOVDQU Y1, 32(R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_9x1_64_loop + VZEROUPPER + +mulAvxTwo_9x1_64_end: + RET + +// func mulAvxTwo_9x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_9x1Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x1Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ (R13), R13 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X1 + VPBROADCASTB X1, Y1 + +mulAvxTwo_9x1Xor_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y4 + ADDQ $0x20, BX + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU (R13), Y0 + VMOVDQU (CX), Y2 + VMOVDQU 32(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y4 + ADDQ $0x20, SI + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y4 + ADDQ $0x20, DI + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 128(CX), Y2 + VMOVDQU 160(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (R8), Y4 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 192(CX), Y2 + VMOVDQU 224(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R9), Y4 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 256(CX), Y2 + VMOVDQU 288(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R10), Y4 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 320(CX), Y2 + VMOVDQU 352(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R11), Y4 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 384(CX), Y2 + VMOVDQU 416(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (R12), Y4 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 448(CX), Y2 + VMOVDQU 480(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 8 to 1 outputs + VMOVDQU (DX), Y4 + ADDQ $0x20, DX + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 512(CX), Y2 + VMOVDQU 544(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Store 1 outputs + VMOVDQU Y0, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_9x1Xor_loop + VZEROUPPER + +mulAvxTwo_9x1Xor_end: + RET + +// func mulAvxTwo_9x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_9x1_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_9x1_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R13 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_9x1_64Xor_loop: + // Load 1 outputs + VMOVDQU (R13), Y0 + VMOVDQU 32(R13), Y1 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU (R8), Y6 + VMOVDQU 32(R8), Y5 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU (R9), Y6 + VMOVDQU 32(R9), Y5 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU (R10), Y6 + VMOVDQU 32(R10), Y5 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU (R11), Y6 + VMOVDQU 32(R11), Y5 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU (R12), Y6 + VMOVDQU 32(R12), Y5 + ADDQ $0x40, R12 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 8 to 1 outputs + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 512(CX), Y3 + VMOVDQU 544(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Store 1 outputs + VMOVDQU Y0, (R13) + VMOVDQU Y1, 32(R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_9x1_64Xor_loop + VZEROUPPER + +mulAvxTwo_9x1_64Xor_end: + RET + +// func mulAvxTwo_9x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_9x2(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 43 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x2_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R13 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 + ADDQ R15, R13 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_9x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y5 + ADDQ $0x20, BX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y0 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y1 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y5 + ADDQ $0x20, SI + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y5 + ADDQ $0x20, DI + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y5 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y5 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 512(CX), Y3 + VMOVDQU 544(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 576(CX), Y3 + VMOVDQU 608(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y5 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 640(CX), Y3 + VMOVDQU 672(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 704(CX), Y3 + VMOVDQU 736(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y5 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 768(CX), Y3 + VMOVDQU 800(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 832(CX), Y3 + VMOVDQU 864(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y5 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 896(CX), Y3 + VMOVDQU 928(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 960(CX), Y3 + VMOVDQU 992(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 8 to 2 outputs + VMOVDQU (DX), Y5 + ADDQ $0x20, DX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 1024(CX), Y3 + VMOVDQU 1056(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 1088(CX), Y3 + VMOVDQU 1120(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Store 2 outputs + VMOVDQU Y0, (R14) + ADDQ $0x20, R14 + VMOVDQU Y1, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_9x2_loop + VZEROUPPER + +mulAvxTwo_9x2_end: + RET + +// func mulAvxTwo_9x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_9x2_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 81 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_9x2_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R13 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 + ADDQ R15, R13 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_9x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y0 + VPXOR Y7, Y8, Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y2 + VPXOR Y7, Y8, Y3 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y11 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y9 + VMOVDQU 32(R9), Y11 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y9 + VMOVDQU 32(R10), Y11 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y9 + VMOVDQU 32(R11), Y11 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y9 + VMOVDQU 32(R12), Y11 + ADDQ $0x40, R12 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 8 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Store 2 outputs + VMOVDQU Y0, (R14) + VMOVDQU Y1, 32(R14) + ADDQ $0x40, R14 + VMOVDQU Y2, (R13) + VMOVDQU Y3, 32(R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_9x2_64_loop + VZEROUPPER + +mulAvxTwo_9x2_64_end: + RET + +// func mulAvxTwo_9x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_9x2Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 43 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x2Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R13 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 + ADDQ R15, R13 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_9x2Xor_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y5 + ADDQ $0x20, BX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU (R14), Y0 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU (R13), Y1 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y5 + ADDQ $0x20, SI + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y5 + ADDQ $0x20, DI + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y5 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y5 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 512(CX), Y3 + VMOVDQU 544(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 576(CX), Y3 + VMOVDQU 608(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y5 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 640(CX), Y3 + VMOVDQU 672(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 704(CX), Y3 + VMOVDQU 736(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y5 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 768(CX), Y3 + VMOVDQU 800(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 832(CX), Y3 + VMOVDQU 864(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y5 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 896(CX), Y3 + VMOVDQU 928(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 960(CX), Y3 + VMOVDQU 992(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 8 to 2 outputs + VMOVDQU (DX), Y5 + ADDQ $0x20, DX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 1024(CX), Y3 + VMOVDQU 1056(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 1088(CX), Y3 + VMOVDQU 1120(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Store 2 outputs + VMOVDQU Y0, (R14) + ADDQ $0x20, R14 + VMOVDQU Y1, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_9x2Xor_loop + VZEROUPPER + +mulAvxTwo_9x2Xor_end: + RET + +// func mulAvxTwo_9x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_9x2_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 81 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_9x2_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R13 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 + ADDQ R15, R13 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_9x2_64Xor_loop: + // Load 2 outputs + VMOVDQU (R14), Y0 + VMOVDQU 32(R14), Y1 + VMOVDQU (R13), Y2 + VMOVDQU 32(R13), Y3 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y11 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y9 + VMOVDQU 32(R9), Y11 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y9 + VMOVDQU 32(R10), Y11 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y9 + VMOVDQU 32(R11), Y11 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y9 + VMOVDQU 32(R12), Y11 + ADDQ $0x40, R12 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 8 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 - // Load and process 32 bytes from input 1 to 7 outputs - VMOVDQU (SI), Y10 + // Store 2 outputs + VMOVDQU Y0, (R14) + VMOVDQU Y1, 32(R14) + ADDQ $0x40, R14 + VMOVDQU Y2, (R13) + VMOVDQU Y3, 32(R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_9x2_64Xor_loop + VZEROUPPER + +mulAvxTwo_9x2_64Xor_end: + RET + +// func mulAvxTwo_9x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_9x3(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x3_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X3 + VPBROADCASTB X3, Y3 + +mulAvxTwo_9x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y6 + ADDQ $0x20, BX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU (CX), Y4 + VMOVDQU 32(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y0 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y1 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y2 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y6 ADDQ $0x20, SI - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU 448(CX), Y8 - VMOVDQU 480(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 512(CX), Y8 - VMOVDQU 544(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 576(CX), Y8 - VMOVDQU 608(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 640(CX), Y8 - VMOVDQU 672(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 704(CX), Y8 - VMOVDQU 736(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 768(CX), Y8 - VMOVDQU 800(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 832(CX), Y8 - VMOVDQU 864(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 192(CX), Y4 + VMOVDQU 224(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 256(CX), Y4 + VMOVDQU 288(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 320(CX), Y4 + VMOVDQU 352(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 - // Load and process 32 bytes from input 2 to 7 outputs - VMOVDQU (DI), Y10 + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y6 ADDQ $0x20, DI - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU 896(CX), Y8 - VMOVDQU 928(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 960(CX), Y8 - VMOVDQU 992(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 1024(CX), Y8 - VMOVDQU 1056(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 1088(CX), Y8 - VMOVDQU 1120(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 1152(CX), Y8 - VMOVDQU 1184(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 1216(CX), Y8 - VMOVDQU 1248(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 1280(CX), Y8 - VMOVDQU 1312(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 384(CX), Y4 + VMOVDQU 416(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 448(CX), Y4 + VMOVDQU 480(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 512(CX), Y4 + VMOVDQU 544(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 - // Load and process 32 bytes from input 3 to 7 outputs - VMOVDQU (R8), Y10 + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y6 ADDQ $0x20, R8 - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU 1344(CX), Y8 - VMOVDQU 1376(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 1408(CX), Y8 - VMOVDQU 1440(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 1472(CX), Y8 - VMOVDQU 1504(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 1536(CX), Y8 - VMOVDQU 1568(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 1600(CX), Y8 - VMOVDQU 1632(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 1664(CX), Y8 - VMOVDQU 1696(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 1728(CX), Y8 - VMOVDQU 1760(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 576(CX), Y4 + VMOVDQU 608(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 640(CX), Y4 + VMOVDQU 672(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 704(CX), Y4 + VMOVDQU 736(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 - // Load and process 32 bytes from input 4 to 7 outputs - VMOVDQU (R9), Y10 + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y6 ADDQ $0x20, R9 - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU 1792(CX), Y8 - VMOVDQU 1824(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 1856(CX), Y8 - VMOVDQU 1888(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 1920(CX), Y8 - VMOVDQU 1952(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 1984(CX), Y8 - VMOVDQU 2016(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 2048(CX), Y8 - VMOVDQU 2080(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 2112(CX), Y8 - VMOVDQU 2144(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 2176(CX), Y8 - VMOVDQU 2208(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 768(CX), Y4 + VMOVDQU 800(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 832(CX), Y4 + VMOVDQU 864(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 896(CX), Y4 + VMOVDQU 928(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 - // Load and process 32 bytes from input 5 to 7 outputs - VMOVDQU (R10), Y10 + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y6 ADDQ $0x20, R10 - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU 2240(CX), Y8 - VMOVDQU 2272(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 2304(CX), Y8 - VMOVDQU 2336(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 2368(CX), Y8 - VMOVDQU 2400(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 2432(CX), Y8 - VMOVDQU 2464(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 2496(CX), Y8 - VMOVDQU 2528(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 2560(CX), Y8 - VMOVDQU 2592(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 2624(CX), Y8 - VMOVDQU 2656(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 960(CX), Y4 + VMOVDQU 992(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1024(CX), Y4 + VMOVDQU 1056(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1088(CX), Y4 + VMOVDQU 1120(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 - // Load and process 32 bytes from input 6 to 7 outputs - VMOVDQU (DX), Y10 - ADDQ $0x20, DX - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU 2688(CX), Y8 - VMOVDQU 2720(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 2752(CX), Y8 - VMOVDQU 2784(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 2816(CX), Y8 - VMOVDQU 2848(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 2880(CX), Y8 - VMOVDQU 2912(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 2944(CX), Y8 - VMOVDQU 2976(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 3008(CX), Y8 - VMOVDQU 3040(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 3072(CX), Y8 - VMOVDQU 3104(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y6 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1152(CX), Y4 + VMOVDQU 1184(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1216(CX), Y4 + VMOVDQU 1248(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1280(CX), Y4 + VMOVDQU 1312(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 - // Store 7 outputs - MOVQ (R11), R13 - VMOVDQU Y0, (R13)(R12*1) - MOVQ 24(R11), R13 - VMOVDQU Y1, (R13)(R12*1) - MOVQ 48(R11), R13 - VMOVDQU Y2, (R13)(R12*1) - MOVQ 72(R11), R13 - VMOVDQU Y3, (R13)(R12*1) - MOVQ 96(R11), R13 - VMOVDQU Y4, (R13)(R12*1) - MOVQ 120(R11), R13 - VMOVDQU Y5, (R13)(R12*1) - MOVQ 144(R11), R13 - VMOVDQU Y6, (R13)(R12*1) + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (R12), Y6 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1344(CX), Y4 + VMOVDQU 1376(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1408(CX), Y4 + VMOVDQU 1440(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1472(CX), Y4 + VMOVDQU 1504(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 8 to 3 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1536(CX), Y4 + VMOVDQU 1568(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1600(CX), Y4 + VMOVDQU 1632(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1664(CX), Y4 + VMOVDQU 1696(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Store 3 outputs + VMOVDQU Y0, (R14) + ADDQ $0x20, R14 + VMOVDQU Y1, (R15) + ADDQ $0x20, R15 + VMOVDQU Y2, (R13) + ADDQ $0x20, R13 // Prepare for next loop - ADDQ $0x20, R12 DECQ AX - JNZ mulAvxTwo_7x7_loop + JNZ mulAvxTwo_9x3_loop VZEROUPPER -mulAvxTwo_7x7_end: +mulAvxTwo_9x3_end: RET -// func mulAvxTwo_7x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_9x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_7x8(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_9x3_64(SB), $8-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 125 YMM used + // Destination kept in GP registers + // Full registers estimated 118 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX + SHRQ $0x06, AX TESTQ AX, AX - JZ mulAvxTwo_7x8_end + JZ mulAvxTwo_9x3_64_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI @@ -18233,456 +50224,720 @@ TEXT ·mulAvxTwo_7x8(SB), NOSPLIT, $0-88 MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 - MOVQ 144(DX), DX - MOVQ out_base+48(FP), R11 - MOVQ start+72(FP), R12 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP - // Add start offset to input - ADDQ R12, BX - ADDQ R12, SI - ADDQ R12, DI - ADDQ R12, R8 - ADDQ R12, R9 - ADDQ R12, R10 - ADDQ R12, DX - MOVQ $0x0000000f, R13 - MOVQ R13, X8 - VPBROADCASTB X8, Y8 + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 -mulAvxTwo_7x8_loop: - // Clear 8 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X6 + VPBROADCASTB X6, Y6 - // Load and process 32 bytes from input 0 to 8 outputs +mulAvxTwo_9x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU (BX), Y11 - ADDQ $0x20, BX + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU (CX), Y9 - VMOVDQU 32(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 64(CX), Y9 - VMOVDQU 96(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y0 + VPXOR Y9, Y10, Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y2 + VPXOR Y9, Y10, Y3 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y4 + VPXOR Y9, Y10, Y5 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 - VMOVDQU 128(CX), Y9 - VMOVDQU 160(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 192(CX), Y9 - VMOVDQU 224(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 - VMOVDQU 256(CX), Y9 - VMOVDQU 288(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 320(CX), Y9 - VMOVDQU 352(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 - VMOVDQU 384(CX), Y9 - VMOVDQU 416(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 448(CX), Y9 - VMOVDQU 480(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 - // Load and process 32 bytes from input 1 to 8 outputs - VMOVDQU (SI), Y11 - ADDQ $0x20, SI + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y11 + VMOVDQU 32(DI), Y13 + ADDQ $0x40, DI VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU 512(CX), Y9 - VMOVDQU 544(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 576(CX), Y9 - VMOVDQU 608(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 - VMOVDQU 640(CX), Y9 - VMOVDQU 672(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 704(CX), Y9 - VMOVDQU 736(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 - VMOVDQU 768(CX), Y9 - VMOVDQU 800(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 832(CX), Y9 - VMOVDQU 864(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 - VMOVDQU 896(CX), Y9 - VMOVDQU 928(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 960(CX), Y9 - VMOVDQU 992(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 - // Load and process 32 bytes from input 2 to 8 outputs - VMOVDQU (DI), Y11 - ADDQ $0x20, DI + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y11 + VMOVDQU 32(R8), Y13 + ADDQ $0x40, R8 VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU 1024(CX), Y9 - VMOVDQU 1056(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 1088(CX), Y9 - VMOVDQU 1120(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 - VMOVDQU 1152(CX), Y9 - VMOVDQU 1184(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 1216(CX), Y9 - VMOVDQU 1248(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 - VMOVDQU 1280(CX), Y9 - VMOVDQU 1312(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 1344(CX), Y9 - VMOVDQU 1376(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 - VMOVDQU 1408(CX), Y9 - VMOVDQU 1440(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 1472(CX), Y9 - VMOVDQU 1504(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 - // Load and process 32 bytes from input 3 to 8 outputs - VMOVDQU (R8), Y11 - ADDQ $0x20, R8 + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y11 + VMOVDQU 32(R9), Y13 + ADDQ $0x40, R9 VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU 1536(CX), Y9 - VMOVDQU 1568(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 1600(CX), Y9 - VMOVDQU 1632(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 - VMOVDQU 1664(CX), Y9 - VMOVDQU 1696(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 1728(CX), Y9 - VMOVDQU 1760(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 - VMOVDQU 1792(CX), Y9 - VMOVDQU 1824(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 1856(CX), Y9 - VMOVDQU 1888(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 - VMOVDQU 1920(CX), Y9 - VMOVDQU 1952(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y11 + VMOVDQU 32(R10), Y13 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 1984(CX), Y9 - VMOVDQU 2016(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 - // Load and process 32 bytes from input 4 to 8 outputs - VMOVDQU (R9), Y11 - ADDQ $0x20, R9 + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y11 + VMOVDQU 32(R11), Y13 + ADDQ $0x40, R11 VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU 2048(CX), Y9 - VMOVDQU 2080(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 2112(CX), Y9 - VMOVDQU 2144(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 - VMOVDQU 2176(CX), Y9 - VMOVDQU 2208(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 2240(CX), Y9 - VMOVDQU 2272(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 - VMOVDQU 2304(CX), Y9 - VMOVDQU 2336(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 2368(CX), Y9 - VMOVDQU 2400(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 - VMOVDQU 2432(CX), Y9 - VMOVDQU 2464(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 2496(CX), Y9 - VMOVDQU 2528(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 - // Load and process 32 bytes from input 5 to 8 outputs - VMOVDQU (R10), Y11 - ADDQ $0x20, R10 + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU (R12), Y11 + VMOVDQU 32(R12), Y13 + ADDQ $0x40, R12 VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU 2560(CX), Y9 - VMOVDQU 2592(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 2624(CX), Y9 - VMOVDQU 2656(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 - VMOVDQU 2688(CX), Y9 - VMOVDQU 2720(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 2752(CX), Y9 - VMOVDQU 2784(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 - VMOVDQU 2816(CX), Y9 - VMOVDQU 2848(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 2880(CX), Y9 - VMOVDQU 2912(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 - VMOVDQU 2944(CX), Y9 - VMOVDQU 2976(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 3008(CX), Y9 - VMOVDQU 3040(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 - // Load and process 32 bytes from input 6 to 8 outputs + // Load and process 64 bytes from input 8 to 3 outputs VMOVDQU (DX), Y11 - ADDQ $0x20, DX + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU 3072(CX), Y9 - VMOVDQU 3104(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 3136(CX), Y9 - VMOVDQU 3168(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 - VMOVDQU 3200(CX), Y9 - VMOVDQU 3232(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 3264(CX), Y9 - VMOVDQU 3296(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 - VMOVDQU 3328(CX), Y9 - VMOVDQU 3360(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 3392(CX), Y9 - VMOVDQU 3424(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 - VMOVDQU 3456(CX), Y9 - VMOVDQU 3488(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 3520(CX), Y9 - VMOVDQU 3552(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 - // Store 8 outputs - MOVQ (R11), R13 - VMOVDQU Y0, (R13)(R12*1) - MOVQ 24(R11), R13 - VMOVDQU Y1, (R13)(R12*1) - MOVQ 48(R11), R13 - VMOVDQU Y2, (R13)(R12*1) - MOVQ 72(R11), R13 - VMOVDQU Y3, (R13)(R12*1) - MOVQ 96(R11), R13 - VMOVDQU Y4, (R13)(R12*1) - MOVQ 120(R11), R13 - VMOVDQU Y5, (R13)(R12*1) - MOVQ 144(R11), R13 - VMOVDQU Y6, (R13)(R12*1) - MOVQ 168(R11), R13 - VMOVDQU Y7, (R13)(R12*1) + // Store 3 outputs + VMOVDQU Y0, (R14) + VMOVDQU Y1, 32(R14) + ADDQ $0x40, R14 + VMOVDQU Y2, (R15) + VMOVDQU Y3, 32(R15) + ADDQ $0x40, R15 + VMOVDQU Y4, (R13) + VMOVDQU Y5, 32(R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_9x3_64_loop + VZEROUPPER + +mulAvxTwo_9x3_64_end: + RET + +// func mulAvxTwo_9x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_9x3Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x3Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X3 + VPBROADCASTB X3, Y3 + +mulAvxTwo_9x3Xor_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y6 + ADDQ $0x20, BX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU (R14), Y0 + VMOVDQU (CX), Y4 + VMOVDQU 32(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU (R15), Y1 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU (R13), Y2 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y6 + ADDQ $0x20, SI + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 192(CX), Y4 + VMOVDQU 224(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 256(CX), Y4 + VMOVDQU 288(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 320(CX), Y4 + VMOVDQU 352(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y6 + ADDQ $0x20, DI + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 384(CX), Y4 + VMOVDQU 416(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 448(CX), Y4 + VMOVDQU 480(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 512(CX), Y4 + VMOVDQU 544(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y6 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 576(CX), Y4 + VMOVDQU 608(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 640(CX), Y4 + VMOVDQU 672(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 704(CX), Y4 + VMOVDQU 736(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y6 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 768(CX), Y4 + VMOVDQU 800(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 832(CX), Y4 + VMOVDQU 864(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 896(CX), Y4 + VMOVDQU 928(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y6 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 960(CX), Y4 + VMOVDQU 992(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1024(CX), Y4 + VMOVDQU 1056(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1088(CX), Y4 + VMOVDQU 1120(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y6 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1152(CX), Y4 + VMOVDQU 1184(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1216(CX), Y4 + VMOVDQU 1248(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1280(CX), Y4 + VMOVDQU 1312(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (R12), Y6 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1344(CX), Y4 + VMOVDQU 1376(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1408(CX), Y4 + VMOVDQU 1440(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1472(CX), Y4 + VMOVDQU 1504(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 8 to 3 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1536(CX), Y4 + VMOVDQU 1568(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1600(CX), Y4 + VMOVDQU 1632(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1664(CX), Y4 + VMOVDQU 1696(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Store 3 outputs + VMOVDQU Y0, (R14) + ADDQ $0x20, R14 + VMOVDQU Y1, (R15) + ADDQ $0x20, R15 + VMOVDQU Y2, (R13) + ADDQ $0x20, R13 // Prepare for next loop - ADDQ $0x20, R12 DECQ AX - JNZ mulAvxTwo_7x8_loop + JNZ mulAvxTwo_9x3Xor_loop VZEROUPPER -mulAvxTwo_7x8_end: +mulAvxTwo_9x3Xor_end: RET -// func mulAvxTwo_7x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_9x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_7x9(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_9x3_64Xor(SB), $8-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 140 YMM used + // Destination kept in GP registers + // Full registers estimated 118 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX + SHRQ $0x06, AX TESTQ AX, AX - JZ mulAvxTwo_7x9_end + JZ mulAvxTwo_9x3_64Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI @@ -18690,1434 +50945,1527 @@ TEXT ·mulAvxTwo_7x9(SB), NOSPLIT, $0-88 MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 - MOVQ 144(DX), DX - MOVQ out_base+48(FP), R11 - MOVQ start+72(FP), R12 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 // Add start offset to input - ADDQ R12, BX - ADDQ R12, SI - ADDQ R12, DI - ADDQ R12, R8 - ADDQ R12, R9 - ADDQ R12, R10 - ADDQ R12, DX - MOVQ $0x0000000f, R13 - MOVQ R13, X9 - VPBROADCASTB X9, Y9 + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X6 + VPBROADCASTB X6, Y6 -mulAvxTwo_7x9_loop: - // Clear 9 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 +mulAvxTwo_9x3_64Xor_loop: + // Load 3 outputs + VMOVDQU (R14), Y0 + VMOVDQU 32(R14), Y1 + VMOVDQU (R15), Y2 + VMOVDQU 32(R15), Y3 + VMOVDQU (R13), Y4 + VMOVDQU 32(R13), Y5 - // Load and process 32 bytes from input 0 to 9 outputs - VMOVDQU (BX), Y12 - ADDQ $0x20, BX - VPSRLQ $0x04, Y12, Y13 - VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VMOVDQU (CX), Y10 - VMOVDQU 32(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 - VMOVDQU 64(CX), Y10 - VMOVDQU 96(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 - VMOVDQU 128(CX), Y10 - VMOVDQU 160(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 - VMOVDQU 192(CX), Y10 - VMOVDQU 224(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 - VMOVDQU 256(CX), Y10 - VMOVDQU 288(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 - VMOVDQU 320(CX), Y10 - VMOVDQU 352(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 - VMOVDQU 384(CX), Y10 - VMOVDQU 416(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 - VMOVDQU 448(CX), Y10 - VMOVDQU 480(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 - VMOVDQU 512(CX), Y10 - VMOVDQU 544(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 - // Load and process 32 bytes from input 1 to 9 outputs - VMOVDQU (SI), Y12 - ADDQ $0x20, SI - VPSRLQ $0x04, Y12, Y13 - VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VMOVDQU 576(CX), Y10 - VMOVDQU 608(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 - VMOVDQU 640(CX), Y10 - VMOVDQU 672(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 - VMOVDQU 704(CX), Y10 - VMOVDQU 736(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 - VMOVDQU 768(CX), Y10 - VMOVDQU 800(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 - VMOVDQU 832(CX), Y10 - VMOVDQU 864(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 - VMOVDQU 896(CX), Y10 - VMOVDQU 928(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 - VMOVDQU 960(CX), Y10 - VMOVDQU 992(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 - VMOVDQU 1024(CX), Y10 - VMOVDQU 1056(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 - VMOVDQU 1088(CX), Y10 - VMOVDQU 1120(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 - // Load and process 32 bytes from input 2 to 9 outputs - VMOVDQU (DI), Y12 - ADDQ $0x20, DI - VPSRLQ $0x04, Y12, Y13 - VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VMOVDQU 1152(CX), Y10 - VMOVDQU 1184(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 - VMOVDQU 1216(CX), Y10 - VMOVDQU 1248(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 - VMOVDQU 1280(CX), Y10 - VMOVDQU 1312(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 - VMOVDQU 1344(CX), Y10 - VMOVDQU 1376(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 - VMOVDQU 1408(CX), Y10 - VMOVDQU 1440(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 - VMOVDQU 1472(CX), Y10 - VMOVDQU 1504(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 - VMOVDQU 1536(CX), Y10 - VMOVDQU 1568(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 - VMOVDQU 1600(CX), Y10 - VMOVDQU 1632(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 - VMOVDQU 1664(CX), Y10 - VMOVDQU 1696(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y11 + VMOVDQU 32(DI), Y13 + ADDQ $0x40, DI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 - // Load and process 32 bytes from input 3 to 9 outputs - VMOVDQU (R8), Y12 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y12, Y13 - VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VMOVDQU 1728(CX), Y10 - VMOVDQU 1760(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 - VMOVDQU 1792(CX), Y10 - VMOVDQU 1824(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 - VMOVDQU 1856(CX), Y10 - VMOVDQU 1888(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 - VMOVDQU 1920(CX), Y10 - VMOVDQU 1952(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 - VMOVDQU 1984(CX), Y10 - VMOVDQU 2016(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 - VMOVDQU 2048(CX), Y10 - VMOVDQU 2080(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 - VMOVDQU 2112(CX), Y10 - VMOVDQU 2144(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 - VMOVDQU 2176(CX), Y10 - VMOVDQU 2208(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 - VMOVDQU 2240(CX), Y10 - VMOVDQU 2272(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y11 + VMOVDQU 32(R8), Y13 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y11 + VMOVDQU 32(R9), Y13 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 - // Load and process 32 bytes from input 4 to 9 outputs - VMOVDQU (R9), Y12 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y12, Y13 - VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VMOVDQU 2304(CX), Y10 - VMOVDQU 2336(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 - VMOVDQU 2368(CX), Y10 - VMOVDQU 2400(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 - VMOVDQU 2432(CX), Y10 - VMOVDQU 2464(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 - VMOVDQU 2496(CX), Y10 - VMOVDQU 2528(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 - VMOVDQU 2560(CX), Y10 - VMOVDQU 2592(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 - VMOVDQU 2624(CX), Y10 - VMOVDQU 2656(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 - VMOVDQU 2688(CX), Y10 - VMOVDQU 2720(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 - VMOVDQU 2752(CX), Y10 - VMOVDQU 2784(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 - VMOVDQU 2816(CX), Y10 - VMOVDQU 2848(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y11 + VMOVDQU 32(R10), Y13 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 - // Load and process 32 bytes from input 5 to 9 outputs - VMOVDQU (R10), Y12 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y12, Y13 - VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VMOVDQU 2880(CX), Y10 - VMOVDQU 2912(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 - VMOVDQU 2944(CX), Y10 - VMOVDQU 2976(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 - VMOVDQU 3008(CX), Y10 - VMOVDQU 3040(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 - VMOVDQU 3072(CX), Y10 - VMOVDQU 3104(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 - VMOVDQU 3136(CX), Y10 - VMOVDQU 3168(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 - VMOVDQU 3200(CX), Y10 - VMOVDQU 3232(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 - VMOVDQU 3264(CX), Y10 - VMOVDQU 3296(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 - VMOVDQU 3328(CX), Y10 - VMOVDQU 3360(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 - VMOVDQU 3392(CX), Y10 - VMOVDQU 3424(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y11 + VMOVDQU 32(R11), Y13 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 - // Load and process 32 bytes from input 6 to 9 outputs - VMOVDQU (DX), Y12 - ADDQ $0x20, DX - VPSRLQ $0x04, Y12, Y13 - VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VMOVDQU 3456(CX), Y10 - VMOVDQU 3488(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 - VMOVDQU 3520(CX), Y10 - VMOVDQU 3552(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 - VMOVDQU 3584(CX), Y10 - VMOVDQU 3616(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 - VMOVDQU 3648(CX), Y10 - VMOVDQU 3680(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 - VMOVDQU 3712(CX), Y10 - VMOVDQU 3744(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 - VMOVDQU 3776(CX), Y10 - VMOVDQU 3808(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 - VMOVDQU 3840(CX), Y10 - VMOVDQU 3872(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 - VMOVDQU 3904(CX), Y10 - VMOVDQU 3936(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 - VMOVDQU 3968(CX), Y10 - VMOVDQU 4000(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU (R12), Y11 + VMOVDQU 32(R12), Y13 + ADDQ $0x40, R12 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 - // Store 9 outputs - MOVQ (R11), R13 - VMOVDQU Y0, (R13)(R12*1) - MOVQ 24(R11), R13 - VMOVDQU Y1, (R13)(R12*1) - MOVQ 48(R11), R13 - VMOVDQU Y2, (R13)(R12*1) - MOVQ 72(R11), R13 - VMOVDQU Y3, (R13)(R12*1) - MOVQ 96(R11), R13 - VMOVDQU Y4, (R13)(R12*1) - MOVQ 120(R11), R13 - VMOVDQU Y5, (R13)(R12*1) - MOVQ 144(R11), R13 - VMOVDQU Y6, (R13)(R12*1) - MOVQ 168(R11), R13 - VMOVDQU Y7, (R13)(R12*1) - MOVQ 192(R11), R13 - VMOVDQU Y8, (R13)(R12*1) + // Load and process 64 bytes from input 8 to 3 outputs + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Store 3 outputs + VMOVDQU Y0, (R14) + VMOVDQU Y1, 32(R14) + ADDQ $0x40, R14 + VMOVDQU Y2, (R15) + VMOVDQU Y3, 32(R15) + ADDQ $0x40, R15 + VMOVDQU Y4, (R13) + VMOVDQU Y5, 32(R13) + ADDQ $0x40, R13 // Prepare for next loop - ADDQ $0x20, R12 DECQ AX - JNZ mulAvxTwo_7x9_loop + JNZ mulAvxTwo_9x3_64Xor_loop VZEROUPPER -mulAvxTwo_7x9_end: +mulAvxTwo_9x3_64Xor_end: RET -// func mulAvxTwo_7x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_9x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_7x10(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_9x4(SB), NOSPLIT, $8-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 155 YMM used + // Destination kept in GP registers + // Full registers estimated 81 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_7x10_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), DX - MOVQ out_base+48(FP), R11 - MOVQ start+72(FP), R12 - - // Add start offset to input - ADDQ R12, BX - ADDQ R12, SI - ADDQ R12, DI - ADDQ R12, R8 - ADDQ R12, R9 - ADDQ R12, R10 - ADDQ R12, DX - MOVQ $0x0000000f, R13 - MOVQ R13, X10 - VPBROADCASTB X10, Y10 + JZ mulAvxTwo_9x4_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), AX + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP -mulAvxTwo_7x10_loop: - // Clear 10 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 - VPXOR Y9, Y9, Y9 + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 - // Load and process 32 bytes from input 0 to 10 outputs - VMOVDQU (BX), Y13 - ADDQ $0x20, BX - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VMOVDQU (CX), Y11 - VMOVDQU 32(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQU 64(CX), Y11 - VMOVDQU 96(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 - VMOVDQU 128(CX), Y11 - VMOVDQU 160(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 - VMOVDQU 192(CX), Y11 - VMOVDQU 224(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 - VMOVDQU 256(CX), Y11 - VMOVDQU 288(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 - VMOVDQU 320(CX), Y11 - VMOVDQU 352(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 - VMOVDQU 384(CX), Y11 - VMOVDQU 416(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 - VMOVDQU 448(CX), Y11 - VMOVDQU 480(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 - VMOVDQU 512(CX), Y11 - VMOVDQU 544(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 - VMOVDQU 576(CX), Y11 - VMOVDQU 608(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X4 + VPBROADCASTB X4, Y4 + MOVQ n+80(FP), BP + SHRQ $0x05, BP - // Load and process 32 bytes from input 1 to 10 outputs - VMOVDQU (SI), Y13 +mulAvxTwo_9x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y0 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y1 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y2 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y3 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (SI), Y7 ADDQ $0x20, SI - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VMOVDQU 640(CX), Y11 - VMOVDQU 672(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQU 704(CX), Y11 - VMOVDQU 736(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 - VMOVDQU 768(CX), Y11 - VMOVDQU 800(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 - VMOVDQU 832(CX), Y11 - VMOVDQU 864(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 - VMOVDQU 896(CX), Y11 - VMOVDQU 928(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 - VMOVDQU 960(CX), Y11 - VMOVDQU 992(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 - VMOVDQU 1024(CX), Y11 - VMOVDQU 1056(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 - VMOVDQU 1088(CX), Y11 - VMOVDQU 1120(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 - VMOVDQU 1152(CX), Y11 - VMOVDQU 1184(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 - VMOVDQU 1216(CX), Y11 - VMOVDQU 1248(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 - // Load and process 32 bytes from input 2 to 10 outputs - VMOVDQU (DI), Y13 + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DI), Y7 ADDQ $0x20, DI - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VMOVDQU 1280(CX), Y11 - VMOVDQU 1312(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQU 1344(CX), Y11 - VMOVDQU 1376(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 - VMOVDQU 1408(CX), Y11 - VMOVDQU 1440(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 - VMOVDQU 1472(CX), Y11 - VMOVDQU 1504(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 - VMOVDQU 1536(CX), Y11 - VMOVDQU 1568(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 - VMOVDQU 1600(CX), Y11 - VMOVDQU 1632(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 - VMOVDQU 1664(CX), Y11 - VMOVDQU 1696(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 - VMOVDQU 1728(CX), Y11 - VMOVDQU 1760(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 - VMOVDQU 1792(CX), Y11 - VMOVDQU 1824(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 - VMOVDQU 1856(CX), Y11 - VMOVDQU 1888(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 - // Load and process 32 bytes from input 3 to 10 outputs - VMOVDQU (R8), Y13 + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R8), Y7 ADDQ $0x20, R8 - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VMOVDQU 1920(CX), Y11 - VMOVDQU 1952(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQU 1984(CX), Y11 - VMOVDQU 2016(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 - VMOVDQU 2048(CX), Y11 - VMOVDQU 2080(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 - VMOVDQU 2112(CX), Y11 - VMOVDQU 2144(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 - VMOVDQU 2176(CX), Y11 - VMOVDQU 2208(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 - VMOVDQU 2240(CX), Y11 - VMOVDQU 2272(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 - VMOVDQU 2304(CX), Y11 - VMOVDQU 2336(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 - VMOVDQU 2368(CX), Y11 - VMOVDQU 2400(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 - VMOVDQU 2432(CX), Y11 - VMOVDQU 2464(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 - VMOVDQU 2496(CX), Y11 - VMOVDQU 2528(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 - // Load and process 32 bytes from input 4 to 10 outputs - VMOVDQU (R9), Y13 + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R9), Y7 ADDQ $0x20, R9 - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VMOVDQU 2560(CX), Y11 - VMOVDQU 2592(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQU 2624(CX), Y11 - VMOVDQU 2656(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 - VMOVDQU 2688(CX), Y11 - VMOVDQU 2720(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 - VMOVDQU 2752(CX), Y11 - VMOVDQU 2784(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 - VMOVDQU 2816(CX), Y11 - VMOVDQU 2848(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 - VMOVDQU 2880(CX), Y11 - VMOVDQU 2912(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 - VMOVDQU 2944(CX), Y11 - VMOVDQU 2976(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 - VMOVDQU 3008(CX), Y11 - VMOVDQU 3040(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 - VMOVDQU 3072(CX), Y11 - VMOVDQU 3104(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 - VMOVDQU 3136(CX), Y11 - VMOVDQU 3168(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1280(CX), Y5 + VMOVDQU 1312(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1344(CX), Y5 + VMOVDQU 1376(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1408(CX), Y5 + VMOVDQU 1440(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1472(CX), Y5 + VMOVDQU 1504(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 - // Load and process 32 bytes from input 5 to 10 outputs - VMOVDQU (R10), Y13 + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R10), Y7 ADDQ $0x20, R10 - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VMOVDQU 3200(CX), Y11 - VMOVDQU 3232(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQU 3264(CX), Y11 - VMOVDQU 3296(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 - VMOVDQU 3328(CX), Y11 - VMOVDQU 3360(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 - VMOVDQU 3392(CX), Y11 - VMOVDQU 3424(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 - VMOVDQU 3456(CX), Y11 - VMOVDQU 3488(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 - VMOVDQU 3520(CX), Y11 - VMOVDQU 3552(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 - VMOVDQU 3584(CX), Y11 - VMOVDQU 3616(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 - VMOVDQU 3648(CX), Y11 - VMOVDQU 3680(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 - VMOVDQU 3712(CX), Y11 - VMOVDQU 3744(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 - VMOVDQU 3776(CX), Y11 - VMOVDQU 3808(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1536(CX), Y5 + VMOVDQU 1568(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1600(CX), Y5 + VMOVDQU 1632(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1664(CX), Y5 + VMOVDQU 1696(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1728(CX), Y5 + VMOVDQU 1760(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 - // Load and process 32 bytes from input 6 to 10 outputs - VMOVDQU (DX), Y13 - ADDQ $0x20, DX - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VMOVDQU 3840(CX), Y11 - VMOVDQU 3872(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQU 3904(CX), Y11 - VMOVDQU 3936(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 - VMOVDQU 3968(CX), Y11 - VMOVDQU 4000(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 - VMOVDQU 4032(CX), Y11 - VMOVDQU 4064(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 - VMOVDQU 4096(CX), Y11 - VMOVDQU 4128(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 - VMOVDQU 4160(CX), Y11 - VMOVDQU 4192(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 - VMOVDQU 4224(CX), Y11 - VMOVDQU 4256(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 - VMOVDQU 4288(CX), Y11 - VMOVDQU 4320(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 - VMOVDQU 4352(CX), Y11 - VMOVDQU 4384(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 - VMOVDQU 4416(CX), Y11 - VMOVDQU 4448(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R11), Y7 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1792(CX), Y5 + VMOVDQU 1824(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1856(CX), Y5 + VMOVDQU 1888(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1920(CX), Y5 + VMOVDQU 1952(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1984(CX), Y5 + VMOVDQU 2016(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 - // Store 10 outputs - MOVQ (R11), R13 - VMOVDQU Y0, (R13)(R12*1) - MOVQ 24(R11), R13 - VMOVDQU Y1, (R13)(R12*1) - MOVQ 48(R11), R13 - VMOVDQU Y2, (R13)(R12*1) - MOVQ 72(R11), R13 - VMOVDQU Y3, (R13)(R12*1) - MOVQ 96(R11), R13 - VMOVDQU Y4, (R13)(R12*1) - MOVQ 120(R11), R13 - VMOVDQU Y5, (R13)(R12*1) - MOVQ 144(R11), R13 - VMOVDQU Y6, (R13)(R12*1) - MOVQ 168(R11), R13 - VMOVDQU Y7, (R13)(R12*1) - MOVQ 192(R11), R13 - VMOVDQU Y8, (R13)(R12*1) - MOVQ 216(R11), R13 - VMOVDQU Y9, (R13)(R12*1) + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (AX), Y7 + ADDQ $0x20, AX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 2048(CX), Y5 + VMOVDQU 2080(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 2112(CX), Y5 + VMOVDQU 2144(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 2176(CX), Y5 + VMOVDQU 2208(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 2240(CX), Y5 + VMOVDQU 2272(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Store 4 outputs + VMOVDQU Y0, (R13) + ADDQ $0x20, R13 + VMOVDQU Y1, (R14) + ADDQ $0x20, R14 + VMOVDQU Y2, (R15) + ADDQ $0x20, R15 + VMOVDQU Y3, (R12) + ADDQ $0x20, R12 // Prepare for next loop - ADDQ $0x20, R12 - DECQ AX - JNZ mulAvxTwo_7x10_loop + DECQ BP + JNZ mulAvxTwo_9x4_loop VZEROUPPER -mulAvxTwo_7x10_end: +mulAvxTwo_9x4_end: RET -// func mulAvxTwo_8x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_9x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_8x1(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_9x4Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers - // Full registers estimated 20 YMM used + // Full registers estimated 81 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_8x1_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), DX + JZ mulAvxTwo_9x4Xor_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), AX MOVQ out_base+48(FP), R12 - MOVQ (R12), R12 - MOVQ start+72(FP), R13 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP // Add start offset to output - ADDQ R13, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 // Add start offset to input - ADDQ R13, BX - ADDQ R13, SI - ADDQ R13, DI - ADDQ R13, R8 - ADDQ R13, R9 - ADDQ R13, R10 - ADDQ R13, R11 - ADDQ R13, DX - MOVQ $0x0000000f, R13 - MOVQ R13, X1 - VPBROADCASTB X1, Y1 + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X4 + VPBROADCASTB X4, Y4 + MOVQ n+80(FP), BP + SHRQ $0x05, BP -mulAvxTwo_8x1_loop: - // Clear 1 outputs - VPXOR Y0, Y0, Y0 +mulAvxTwo_9x4Xor_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (R13), Y0 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU (R14), Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU (R15), Y2 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU (R12), Y3 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (BX), Y4 + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (BX), Y7 ADDQ $0x20, BX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU (CX), Y2 - VMOVDQU 32(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (SI), Y4 + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (SI), Y7 ADDQ $0x20, SI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (DI), Y4 + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DI), Y7 ADDQ $0x20, DI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 128(CX), Y2 - VMOVDQU 160(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 - // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (R8), Y4 + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R8), Y7 ADDQ $0x20, R8 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 192(CX), Y2 - VMOVDQU 224(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 - // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (R9), Y4 + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R9), Y7 ADDQ $0x20, R9 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 256(CX), Y2 - VMOVDQU 288(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1280(CX), Y5 + VMOVDQU 1312(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1344(CX), Y5 + VMOVDQU 1376(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1408(CX), Y5 + VMOVDQU 1440(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1472(CX), Y5 + VMOVDQU 1504(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 - // Load and process 32 bytes from input 5 to 1 outputs - VMOVDQU (R10), Y4 + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R10), Y7 ADDQ $0x20, R10 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 320(CX), Y2 - VMOVDQU 352(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1536(CX), Y5 + VMOVDQU 1568(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1600(CX), Y5 + VMOVDQU 1632(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1664(CX), Y5 + VMOVDQU 1696(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1728(CX), Y5 + VMOVDQU 1760(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 - // Load and process 32 bytes from input 6 to 1 outputs - VMOVDQU (R11), Y4 + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R11), Y7 ADDQ $0x20, R11 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 384(CX), Y2 - VMOVDQU 416(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1792(CX), Y5 + VMOVDQU 1824(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1856(CX), Y5 + VMOVDQU 1888(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1920(CX), Y5 + VMOVDQU 1952(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1984(CX), Y5 + VMOVDQU 2016(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 - // Load and process 32 bytes from input 7 to 1 outputs - VMOVDQU (DX), Y4 - ADDQ $0x20, DX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 448(CX), Y2 - VMOVDQU 480(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (AX), Y7 + ADDQ $0x20, AX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 2048(CX), Y5 + VMOVDQU 2080(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 2112(CX), Y5 + VMOVDQU 2144(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 2176(CX), Y5 + VMOVDQU 2208(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 2240(CX), Y5 + VMOVDQU 2272(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 - // Store 1 outputs - VMOVDQU Y0, (R12) + // Store 4 outputs + VMOVDQU Y0, (R13) + ADDQ $0x20, R13 + VMOVDQU Y1, (R14) + ADDQ $0x20, R14 + VMOVDQU Y2, (R15) + ADDQ $0x20, R15 + VMOVDQU Y3, (R12) ADDQ $0x20, R12 // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_8x1_loop + DECQ BP + JNZ mulAvxTwo_9x4Xor_loop VZEROUPPER -mulAvxTwo_8x1_end: +mulAvxTwo_9x4Xor_end: RET -// func mulAvxTwo_8x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_9x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_8x1_64(SB), $0-88 +TEXT ·mulAvxTwo_9x5(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack - // Full registers estimated 20 YMM used + // Full registers estimated 100 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX - SHRQ $0x06, AX + SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_8x1_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), R9 - MOVQ 144(AX), R10 - MOVQ 168(AX), AX - MOVQ out_base+48(FP), R11 - MOVQ out_base+48(FP), R11 - MOVQ start+72(FP), R12 + JZ mulAvxTwo_9x5_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 // Add start offset to input - ADDQ R12, DX - ADDQ R12, BX - ADDQ R12, SI - ADDQ R12, DI - ADDQ R12, R8 - ADDQ R12, R9 - ADDQ R12, R10 - ADDQ R12, AX - MOVQ $0x0000000f, R13 - MOVQ R13, X2 - VPBROADCASTB X2, Y2 - MOVQ n+80(FP), R13 - SHRQ $0x06, R13 + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X5 + VPBROADCASTB X5, Y5 -mulAvxTwo_8x1_64_loop: - // Clear 1 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 +mulAvxTwo_9x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y0 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y1 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y2 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y3 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y4 - // Load and process 64 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y6 - VMOVDQU 32(DX), Y5 - ADDQ $0x40, DX - VPSRLQ $0x04, Y6, Y7 - VPSRLQ $0x04, Y5, Y8 - VPAND Y2, Y6, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y7, Y7 - VPAND Y2, Y8, Y8 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y5 - VPSHUFB Y6, Y3, Y3 - VPSHUFB Y8, Y4, Y6 - VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Load and process 64 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y6 - VMOVDQU 32(BX), Y5 - ADDQ $0x40, BX - VPSRLQ $0x04, Y6, Y7 - VPSRLQ $0x04, Y5, Y8 - VPAND Y2, Y6, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y7, Y7 - VPAND Y2, Y8, Y8 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y5 - VPSHUFB Y6, Y3, Y3 - VPSHUFB Y8, Y4, Y6 - VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Load and process 64 bytes from input 2 to 1 outputs - VMOVDQU (SI), Y6 - VMOVDQU 32(SI), Y5 - ADDQ $0x40, SI - VPSRLQ $0x04, Y6, Y7 - VPSRLQ $0x04, Y5, Y8 - VPAND Y2, Y6, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y7, Y7 - VPAND Y2, Y8, Y8 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y5 - VPSHUFB Y6, Y3, Y3 - VPSHUFB Y8, Y4, Y6 - VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Load and process 64 bytes from input 3 to 1 outputs - VMOVDQU (DI), Y6 - VMOVDQU 32(DI), Y5 - ADDQ $0x40, DI - VPSRLQ $0x04, Y6, Y7 - VPSRLQ $0x04, Y5, Y8 - VPAND Y2, Y6, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y7, Y7 - VPAND Y2, Y8, Y8 - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y5 - VPSHUFB Y6, Y3, Y3 - VPSHUFB Y8, Y4, Y6 - VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y8 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1280(CX), Y6 + VMOVDQU 1312(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1344(CX), Y6 + VMOVDQU 1376(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1408(CX), Y6 + VMOVDQU 1440(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1472(CX), Y6 + VMOVDQU 1504(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1536(CX), Y6 + VMOVDQU 1568(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Load and process 64 bytes from input 4 to 1 outputs - VMOVDQU (R8), Y6 - VMOVDQU 32(R8), Y5 - ADDQ $0x40, R8 - VPSRLQ $0x04, Y6, Y7 - VPSRLQ $0x04, Y5, Y8 - VPAND Y2, Y6, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y7, Y7 - VPAND Y2, Y8, Y8 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y5 - VPSHUFB Y6, Y3, Y3 - VPSHUFB Y8, Y4, Y6 - VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y8 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1600(CX), Y6 + VMOVDQU 1632(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1664(CX), Y6 + VMOVDQU 1696(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1728(CX), Y6 + VMOVDQU 1760(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1792(CX), Y6 + VMOVDQU 1824(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1856(CX), Y6 + VMOVDQU 1888(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Load and process 64 bytes from input 5 to 1 outputs - VMOVDQU (R9), Y6 - VMOVDQU 32(R9), Y5 - ADDQ $0x40, R9 - VPSRLQ $0x04, Y6, Y7 - VPSRLQ $0x04, Y5, Y8 - VPAND Y2, Y6, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y7, Y7 - VPAND Y2, Y8, Y8 - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y5 - VPSHUFB Y6, Y3, Y3 - VPSHUFB Y8, Y4, Y6 - VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y8 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1920(CX), Y6 + VMOVDQU 1952(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1984(CX), Y6 + VMOVDQU 2016(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 2048(CX), Y6 + VMOVDQU 2080(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 2112(CX), Y6 + VMOVDQU 2144(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 2176(CX), Y6 + VMOVDQU 2208(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Load and process 64 bytes from input 6 to 1 outputs - VMOVDQU (R10), Y6 - VMOVDQU 32(R10), Y5 - ADDQ $0x40, R10 - VPSRLQ $0x04, Y6, Y7 - VPSRLQ $0x04, Y5, Y8 - VPAND Y2, Y6, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y7, Y7 - VPAND Y2, Y8, Y8 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y5 - VPSHUFB Y6, Y3, Y3 - VPSHUFB Y8, Y4, Y6 - VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y8 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 2240(CX), Y6 + VMOVDQU 2272(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 2304(CX), Y6 + VMOVDQU 2336(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 2368(CX), Y6 + VMOVDQU 2400(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 2432(CX), Y6 + VMOVDQU 2464(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 2496(CX), Y6 + VMOVDQU 2528(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Load and process 64 bytes from input 7 to 1 outputs - VMOVDQU (AX), Y6 - VMOVDQU 32(AX), Y5 - ADDQ $0x40, AX - VPSRLQ $0x04, Y6, Y7 - VPSRLQ $0x04, Y5, Y8 - VPAND Y2, Y6, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y7, Y7 - VPAND Y2, Y8, Y8 - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y5 - VPSHUFB Y6, Y3, Y3 - VPSHUFB Y8, Y4, Y6 - VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 2560(CX), Y6 + VMOVDQU 2592(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 2624(CX), Y6 + VMOVDQU 2656(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 2688(CX), Y6 + VMOVDQU 2720(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 2752(CX), Y6 + VMOVDQU 2784(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 2816(CX), Y6 + VMOVDQU 2848(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Store 1 outputs - MOVQ (R11), R14 - VMOVDQU Y0, (R14)(R12*1) - VMOVDQU Y1, 32(R14)(R12*1) + // Store 5 outputs + MOVQ (R13), R15 + VMOVDQU Y0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y4, (R15)(R14*1) // Prepare for next loop - ADDQ $0x40, R12 - DECQ R13 - JNZ mulAvxTwo_8x1_64_loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxTwo_9x5_loop VZEROUPPER -mulAvxTwo_8x1_64_end: +mulAvxTwo_9x5_end: RET -// func mulAvxTwo_8x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_9x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_8x2(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_9x5Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 39 YMM used + // Destination kept on stack + // Full registers estimated 100 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_8x2_end + JZ mulAvxTwo_9x5Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI @@ -20126,16 +52474,11 @@ TEXT ·mulAvxTwo_8x2(SB), NOSPLIT, $0-88 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 - MOVQ 168(DX), DX - MOVQ out_base+48(FP), R12 - MOVQ (R12), R13 - MOVQ 24(R12), R12 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 MOVQ start+72(FP), R14 - // Add start offset to output - ADDQ R14, R13 - ADDQ R14, R12 - // Add start offset to input ADDQ R14, BX ADDQ R14, SI @@ -20144,504 +52487,831 @@ TEXT ·mulAvxTwo_8x2(SB), NOSPLIT, $0-88 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 + ADDQ R14, R12 ADDQ R14, DX - MOVQ $0x0000000f, R14 - MOVQ R14, X2 - VPBROADCASTB X2, Y2 - -mulAvxTwo_8x2_loop: - // Clear 2 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 + MOVQ $0x0000000f, R15 + MOVQ R15, X5 + VPBROADCASTB X5, Y5 - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 +mulAvxTwo_9x5Xor_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y8 ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y0 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y1 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y2 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y3 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y4 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y8 ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y8 ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y8 ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Load and process 32 bytes from input 4 to 2 outputs - VMOVDQU (R9), Y5 + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y8 ADDQ $0x20, R9 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1280(CX), Y6 + VMOVDQU 1312(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1344(CX), Y6 + VMOVDQU 1376(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1408(CX), Y6 + VMOVDQU 1440(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1472(CX), Y6 + VMOVDQU 1504(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1536(CX), Y6 + VMOVDQU 1568(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Load and process 32 bytes from input 5 to 2 outputs - VMOVDQU (R10), Y5 + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y8 ADDQ $0x20, R10 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 640(CX), Y3 - VMOVDQU 672(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 - VMOVDQU 704(CX), Y3 - VMOVDQU 736(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1600(CX), Y6 + VMOVDQU 1632(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1664(CX), Y6 + VMOVDQU 1696(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1728(CX), Y6 + VMOVDQU 1760(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1792(CX), Y6 + VMOVDQU 1824(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1856(CX), Y6 + VMOVDQU 1888(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y8 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1920(CX), Y6 + VMOVDQU 1952(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1984(CX), Y6 + VMOVDQU 2016(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 2048(CX), Y6 + VMOVDQU 2080(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 2112(CX), Y6 + VMOVDQU 2144(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 2176(CX), Y6 + VMOVDQU 2208(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Load and process 32 bytes from input 6 to 2 outputs - VMOVDQU (R11), Y5 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 768(CX), Y3 - VMOVDQU 800(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 - VMOVDQU 832(CX), Y3 - VMOVDQU 864(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y8 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 2240(CX), Y6 + VMOVDQU 2272(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 2304(CX), Y6 + VMOVDQU 2336(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 2368(CX), Y6 + VMOVDQU 2400(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 2432(CX), Y6 + VMOVDQU 2464(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 2496(CX), Y6 + VMOVDQU 2528(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Load and process 32 bytes from input 7 to 2 outputs - VMOVDQU (DX), Y5 + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (DX), Y8 ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 896(CX), Y3 - VMOVDQU 928(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 - VMOVDQU 960(CX), Y3 - VMOVDQU 992(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 2560(CX), Y6 + VMOVDQU 2592(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 2624(CX), Y6 + VMOVDQU 2656(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 2688(CX), Y6 + VMOVDQU 2720(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 2752(CX), Y6 + VMOVDQU 2784(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 2816(CX), Y6 + VMOVDQU 2848(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Store 2 outputs - VMOVDQU Y0, (R13) - ADDQ $0x20, R13 - VMOVDQU Y1, (R12) - ADDQ $0x20, R12 + // Store 5 outputs + MOVQ (R13), R15 + VMOVDQU Y0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y4, (R15)(R14*1) // Prepare for next loop + ADDQ $0x20, R14 DECQ AX - JNZ mulAvxTwo_8x2_loop + JNZ mulAvxTwo_9x5Xor_loop VZEROUPPER -mulAvxTwo_8x2_end: +mulAvxTwo_9x5Xor_end: RET -// func mulAvxTwo_8x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_9x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_8x2_64(SB), $0-88 +TEXT ·mulAvxTwo_9x6(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack - // Full registers estimated 39 YMM used + // Full registers estimated 119 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX - SHRQ $0x06, AX + SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_8x2_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), R9 - MOVQ 144(AX), R10 - MOVQ 168(AX), AX - MOVQ out_base+48(FP), R11 - MOVQ out_base+48(FP), R11 - MOVQ start+72(FP), R12 + JZ mulAvxTwo_9x6_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 // Add start offset to input - ADDQ R12, DX - ADDQ R12, BX - ADDQ R12, SI - ADDQ R12, DI - ADDQ R12, R8 - ADDQ R12, R9 - ADDQ R12, R10 - ADDQ R12, AX - MOVQ $0x0000000f, R13 - MOVQ R13, X4 - VPBROADCASTB X4, Y4 - MOVQ n+80(FP), R13 - SHRQ $0x06, R13 + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X6 + VPBROADCASTB X6, Y6 -mulAvxTwo_8x2_64_loop: - // Clear 2 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 +mulAvxTwo_9x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y0 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y1 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y2 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y3 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y4 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y5 - // Load and process 64 bytes from input 0 to 2 outputs - VMOVDQU (DX), Y9 - VMOVDQU 32(DX), Y11 - ADDQ $0x40, DX + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI VPSRLQ $0x04, Y9, Y10 - VPSRLQ $0x04, Y11, Y12 - VPAND Y4, Y9, Y9 - VPAND Y4, Y11, Y11 - VPAND Y4, Y10, Y10 - VPAND Y4, Y12, Y12 - VMOVDQU (CX), Y5 - VMOVDQU 32(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 - VMOVDQU 64(CX), Y5 - VMOVDQU 96(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 - // Load and process 64 bytes from input 1 to 2 outputs - VMOVDQU (BX), Y9 - VMOVDQU 32(BX), Y11 - ADDQ $0x40, BX + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI VPSRLQ $0x04, Y9, Y10 - VPSRLQ $0x04, Y11, Y12 - VPAND Y4, Y9, Y9 - VPAND Y4, Y11, Y11 - VPAND Y4, Y10, Y10 - VPAND Y4, Y12, Y12 - VMOVDQU 128(CX), Y5 - VMOVDQU 160(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 - VMOVDQU 192(CX), Y5 - VMOVDQU 224(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 - // Load and process 64 bytes from input 2 to 2 outputs - VMOVDQU (SI), Y9 - VMOVDQU 32(SI), Y11 - ADDQ $0x40, SI + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 VPSRLQ $0x04, Y9, Y10 - VPSRLQ $0x04, Y11, Y12 - VPAND Y4, Y9, Y9 - VPAND Y4, Y11, Y11 - VPAND Y4, Y10, Y10 - VPAND Y4, Y12, Y12 - VMOVDQU 256(CX), Y5 - VMOVDQU 288(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 - VMOVDQU 320(CX), Y5 - VMOVDQU 352(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 - // Load and process 64 bytes from input 3 to 2 outputs - VMOVDQU (DI), Y9 - VMOVDQU 32(DI), Y11 - ADDQ $0x40, DI + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y9 + ADDQ $0x20, R9 VPSRLQ $0x04, Y9, Y10 - VPSRLQ $0x04, Y11, Y12 - VPAND Y4, Y9, Y9 - VPAND Y4, Y11, Y11 - VPAND Y4, Y10, Y10 - VPAND Y4, Y12, Y12 - VMOVDQU 384(CX), Y5 - VMOVDQU 416(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 - VMOVDQU 448(CX), Y5 - VMOVDQU 480(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1728(CX), Y7 + VMOVDQU 1760(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 + VMOVDQU 1792(CX), Y7 + VMOVDQU 1824(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1856(CX), Y7 + VMOVDQU 1888(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 - // Load and process 64 bytes from input 4 to 2 outputs - VMOVDQU (R8), Y9 - VMOVDQU 32(R8), Y11 - ADDQ $0x40, R8 + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y9 + ADDQ $0x20, R10 VPSRLQ $0x04, Y9, Y10 - VPSRLQ $0x04, Y11, Y12 - VPAND Y4, Y9, Y9 - VPAND Y4, Y11, Y11 - VPAND Y4, Y10, Y10 - VPAND Y4, Y12, Y12 - VMOVDQU 512(CX), Y5 - VMOVDQU 544(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1920(CX), Y7 + VMOVDQU 1952(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1984(CX), Y7 + VMOVDQU 2016(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 - VMOVDQU 576(CX), Y5 - VMOVDQU 608(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 + VMOVDQU 2048(CX), Y7 + VMOVDQU 2080(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2112(CX), Y7 + VMOVDQU 2144(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 + VMOVDQU 2176(CX), Y7 + VMOVDQU 2208(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 2240(CX), Y7 + VMOVDQU 2272(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 - // Load and process 64 bytes from input 5 to 2 outputs - VMOVDQU (R9), Y9 - VMOVDQU 32(R9), Y11 - ADDQ $0x40, R9 + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y9 + ADDQ $0x20, R11 VPSRLQ $0x04, Y9, Y10 - VPSRLQ $0x04, Y11, Y12 - VPAND Y4, Y9, Y9 - VPAND Y4, Y11, Y11 - VPAND Y4, Y10, Y10 - VPAND Y4, Y12, Y12 - VMOVDQU 640(CX), Y5 - VMOVDQU 672(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2304(CX), Y7 + VMOVDQU 2336(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 2368(CX), Y7 + VMOVDQU 2400(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 - VMOVDQU 704(CX), Y5 - VMOVDQU 736(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 + VMOVDQU 2432(CX), Y7 + VMOVDQU 2464(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2496(CX), Y7 + VMOVDQU 2528(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 + VMOVDQU 2560(CX), Y7 + VMOVDQU 2592(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 2624(CX), Y7 + VMOVDQU 2656(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 - // Load and process 64 bytes from input 6 to 2 outputs - VMOVDQU (R10), Y9 - VMOVDQU 32(R10), Y11 - ADDQ $0x40, R10 + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y9 + ADDQ $0x20, R12 VPSRLQ $0x04, Y9, Y10 - VPSRLQ $0x04, Y11, Y12 - VPAND Y4, Y9, Y9 - VPAND Y4, Y11, Y11 - VPAND Y4, Y10, Y10 - VPAND Y4, Y12, Y12 - VMOVDQU 768(CX), Y5 - VMOVDQU 800(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2688(CX), Y7 + VMOVDQU 2720(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 2752(CX), Y7 + VMOVDQU 2784(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 - VMOVDQU 832(CX), Y5 - VMOVDQU 864(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 + VMOVDQU 2816(CX), Y7 + VMOVDQU 2848(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2880(CX), Y7 + VMOVDQU 2912(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 + VMOVDQU 2944(CX), Y7 + VMOVDQU 2976(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 3008(CX), Y7 + VMOVDQU 3040(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 - // Load and process 64 bytes from input 7 to 2 outputs - VMOVDQU (AX), Y9 - VMOVDQU 32(AX), Y11 - ADDQ $0x40, AX + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX VPSRLQ $0x04, Y9, Y10 - VPSRLQ $0x04, Y11, Y12 - VPAND Y4, Y9, Y9 - VPAND Y4, Y11, Y11 - VPAND Y4, Y10, Y10 - VPAND Y4, Y12, Y12 - VMOVDQU 896(CX), Y5 - VMOVDQU 928(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 3072(CX), Y7 + VMOVDQU 3104(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 3136(CX), Y7 + VMOVDQU 3168(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 - VMOVDQU 960(CX), Y5 - VMOVDQU 992(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 + VMOVDQU 3200(CX), Y7 + VMOVDQU 3232(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 3264(CX), Y7 + VMOVDQU 3296(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 + VMOVDQU 3328(CX), Y7 + VMOVDQU 3360(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 3392(CX), Y7 + VMOVDQU 3424(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 - // Store 2 outputs - MOVQ (R11), R14 - VMOVDQU Y0, (R14)(R12*1) - VMOVDQU Y1, 32(R14)(R12*1) - MOVQ 24(R11), R14 - VMOVDQU Y2, (R14)(R12*1) - VMOVDQU Y3, 32(R14)(R12*1) + // Store 6 outputs + MOVQ (R13), R15 + VMOVDQU Y0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y5, (R15)(R14*1) // Prepare for next loop - ADDQ $0x40, R12 - DECQ R13 - JNZ mulAvxTwo_8x2_64_loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxTwo_9x6_loop VZEROUPPER -mulAvxTwo_8x2_64_end: +mulAvxTwo_9x6_end: RET -// func mulAvxTwo_8x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_9x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_8x3(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_9x6Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 56 YMM used + // Destination kept on stack + // Full registers estimated 119 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_8x3_end + JZ mulAvxTwo_9x6Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI @@ -20650,660 +53320,459 @@ TEXT ·mulAvxTwo_8x3(SB), NOSPLIT, $0-88 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 - MOVQ 168(DX), DX - MOVQ out_base+48(FP), R12 - MOVQ (R12), R13 - MOVQ 24(R12), R14 - MOVQ 48(R12), R12 - MOVQ start+72(FP), R15 - - // Add start offset to output - ADDQ R15, R13 - ADDQ R15, R14 - ADDQ R15, R12 - - // Add start offset to input - ADDQ R15, BX - ADDQ R15, SI - ADDQ R15, DI - ADDQ R15, R8 - ADDQ R15, R9 - ADDQ R15, R10 - ADDQ R15, R11 - ADDQ R15, DX - MOVQ $0x0000000f, R15 - MOVQ R15, X3 - VPBROADCASTB X3, Y3 - -mulAvxTwo_8x3_loop: - // Clear 3 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 - - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 - - // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 - - // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (R8), Y6 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 - - // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (R9), Y6 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 - - // Load and process 32 bytes from input 5 to 3 outputs - VMOVDQU (R10), Y6 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 960(CX), Y4 - VMOVDQU 992(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 - VMOVDQU 1024(CX), Y4 - VMOVDQU 1056(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 - VMOVDQU 1088(CX), Y4 - VMOVDQU 1120(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 - - // Load and process 32 bytes from input 6 to 3 outputs - VMOVDQU (R11), Y6 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1152(CX), Y4 - VMOVDQU 1184(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 - VMOVDQU 1216(CX), Y4 - VMOVDQU 1248(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 - VMOVDQU 1280(CX), Y4 - VMOVDQU 1312(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 - - // Load and process 32 bytes from input 7 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1344(CX), Y4 - VMOVDQU 1376(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 - VMOVDQU 1408(CX), Y4 - VMOVDQU 1440(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 - VMOVDQU 1472(CX), Y4 - VMOVDQU 1504(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 - - // Store 3 outputs - VMOVDQU Y0, (R13) - ADDQ $0x20, R13 - VMOVDQU Y1, (R14) - ADDQ $0x20, R14 - VMOVDQU Y2, (R12) - ADDQ $0x20, R12 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_8x3_loop - VZEROUPPER - -mulAvxTwo_8x3_end: - RET - -// func mulAvxTwo_8x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_8x3_64(SB), $0-88 - // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 56 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x06, AX - TESTQ AX, AX - JZ mulAvxTwo_8x3_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), R9 - MOVQ 144(AX), R10 - MOVQ 168(AX), AX - MOVQ out_base+48(FP), R11 - MOVQ out_base+48(FP), R11 - MOVQ start+72(FP), R12 - - // Add start offset to input - ADDQ R12, DX - ADDQ R12, BX - ADDQ R12, SI - ADDQ R12, DI - ADDQ R12, R8 - ADDQ R12, R9 - ADDQ R12, R10 - ADDQ R12, AX - MOVQ $0x0000000f, R13 - MOVQ R13, X6 - VPBROADCASTB X6, Y6 - MOVQ n+80(FP), R13 - SHRQ $0x06, R13 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 -mulAvxTwo_8x3_64_loop: - // Clear 3 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X6 + VPBROADCASTB X6, Y6 - // Load and process 64 bytes from input 0 to 3 outputs - VMOVDQU (DX), Y11 - VMOVDQU 32(DX), Y13 - ADDQ $0x40, DX - VPSRLQ $0x04, Y11, Y12 - VPSRLQ $0x04, Y13, Y14 - VPAND Y6, Y11, Y11 - VPAND Y6, Y13, Y13 - VPAND Y6, Y12, Y12 - VPAND Y6, Y14, Y14 +mulAvxTwo_9x6Xor_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y0 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + VPXOR Y7, Y1, Y1 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 - - // Load and process 64 bytes from input 1 to 3 outputs - VMOVDQU (BX), Y11 - VMOVDQU 32(BX), Y13 - ADDQ $0x40, BX - VPSRLQ $0x04, Y11, Y12 - VPSRLQ $0x04, Y13, Y14 - VPAND Y6, Y11, Y11 - VPAND Y6, Y13, Y13 - VPAND Y6, Y12, Y12 - VPAND Y6, Y14, Y14 + VPXOR Y7, Y2, Y2 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + VPXOR Y7, Y3, Y3 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + VPXOR Y7, Y4, Y4 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 + VPXOR Y7, Y5, Y5 - // Load and process 64 bytes from input 2 to 3 outputs - VMOVDQU (SI), Y11 - VMOVDQU 32(SI), Y13 - ADDQ $0x40, SI - VPSRLQ $0x04, Y11, Y12 - VPSRLQ $0x04, Y13, Y14 - VPAND Y6, Y11, Y11 - VPAND Y6, Y13, Y13 - VPAND Y6, Y12, Y12 - VPAND Y6, Y14, Y14 + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + VPXOR Y7, Y1, Y1 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 - - // Load and process 64 bytes from input 3 to 3 outputs - VMOVDQU (DI), Y11 - VMOVDQU 32(DI), Y13 - ADDQ $0x40, DI - VPSRLQ $0x04, Y11, Y12 - VPSRLQ $0x04, Y13, Y14 - VPAND Y6, Y11, Y11 - VPAND Y6, Y13, Y13 - VPAND Y6, Y12, Y12 - VPAND Y6, Y14, Y14 + VPXOR Y7, Y2, Y2 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + VPXOR Y7, Y3, Y3 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + VPXOR Y7, Y4, Y4 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 + VPXOR Y7, Y5, Y5 - // Load and process 64 bytes from input 4 to 3 outputs - VMOVDQU (R8), Y11 - VMOVDQU 32(R8), Y13 - ADDQ $0x40, R8 - VPSRLQ $0x04, Y11, Y12 - VPSRLQ $0x04, Y13, Y14 - VPAND Y6, Y11, Y11 - VPAND Y6, Y13, Y13 - VPAND Y6, Y12, Y12 - VPAND Y6, Y14, Y14 + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 VMOVDQU 768(CX), Y7 VMOVDQU 800(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + VPXOR Y7, Y1, Y1 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 - - // Load and process 64 bytes from input 5 to 3 outputs - VMOVDQU (R9), Y11 - VMOVDQU 32(R9), Y13 - ADDQ $0x40, R9 - VPSRLQ $0x04, Y11, Y12 - VPSRLQ $0x04, Y13, Y14 - VPAND Y6, Y11, Y11 - VPAND Y6, Y13, Y13 - VPAND Y6, Y12, Y12 - VPAND Y6, Y14, Y14 + VPXOR Y7, Y2, Y2 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + VPXOR Y7, Y3, Y3 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + VPXOR Y7, Y4, Y4 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 + VPXOR Y7, Y5, Y5 - // Load and process 64 bytes from input 6 to 3 outputs - VMOVDQU (R10), Y11 - VMOVDQU 32(R10), Y13 - ADDQ $0x40, R10 - VPSRLQ $0x04, Y11, Y12 - VPSRLQ $0x04, Y13, Y14 - VPAND Y6, Y11, Y11 - VPAND Y6, Y13, Y13 - VPAND Y6, Y12, Y12 - VPAND Y6, Y14, Y14 + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 VMOVDQU 1152(CX), Y7 VMOVDQU 1184(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + VPXOR Y7, Y1, Y1 VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 - - // Load and process 64 bytes from input 7 to 3 outputs - VMOVDQU (AX), Y11 - VMOVDQU 32(AX), Y13 - ADDQ $0x40, AX - VPSRLQ $0x04, Y11, Y12 - VPSRLQ $0x04, Y13, Y14 - VPAND Y6, Y11, Y11 - VPAND Y6, Y13, Y13 - VPAND Y6, Y12, Y12 - VPAND Y6, Y14, Y14 + VPXOR Y7, Y2, Y2 VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y9 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1728(CX), Y7 + VMOVDQU 1760(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1792(CX), Y7 + VMOVDQU 1824(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1856(CX), Y7 + VMOVDQU 1888(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y9 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1920(CX), Y7 + VMOVDQU 1952(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1984(CX), Y7 + VMOVDQU 2016(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2048(CX), Y7 + VMOVDQU 2080(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2112(CX), Y7 + VMOVDQU 2144(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2176(CX), Y7 + VMOVDQU 2208(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 2240(CX), Y7 + VMOVDQU 2272(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y9 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2304(CX), Y7 + VMOVDQU 2336(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 2368(CX), Y7 + VMOVDQU 2400(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2432(CX), Y7 + VMOVDQU 2464(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2496(CX), Y7 + VMOVDQU 2528(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2560(CX), Y7 + VMOVDQU 2592(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 2624(CX), Y7 + VMOVDQU 2656(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y9 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2688(CX), Y7 + VMOVDQU 2720(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 2752(CX), Y7 + VMOVDQU 2784(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2816(CX), Y7 + VMOVDQU 2848(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2880(CX), Y7 + VMOVDQU 2912(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2944(CX), Y7 + VMOVDQU 2976(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 3008(CX), Y7 + VMOVDQU 3040(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 3072(CX), Y7 + VMOVDQU 3104(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 - VMOVDQU 1408(CX), Y7 - VMOVDQU 1440(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VMOVDQU 3136(CX), Y7 + VMOVDQU 3168(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 3200(CX), Y7 + VMOVDQU 3232(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 - VMOVDQU 1472(CX), Y7 - VMOVDQU 1504(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VMOVDQU 3264(CX), Y7 + VMOVDQU 3296(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 3328(CX), Y7 + VMOVDQU 3360(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 + VMOVDQU 3392(CX), Y7 + VMOVDQU 3424(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 - // Store 3 outputs - MOVQ (R11), R14 - VMOVDQU Y0, (R14)(R12*1) - VMOVDQU Y1, 32(R14)(R12*1) - MOVQ 24(R11), R14 - VMOVDQU Y2, (R14)(R12*1) - VMOVDQU Y3, 32(R14)(R12*1) - MOVQ 48(R11), R14 - VMOVDQU Y4, (R14)(R12*1) - VMOVDQU Y5, 32(R14)(R12*1) - - // Prepare for next loop - ADDQ $0x40, R12 - DECQ R13 - JNZ mulAvxTwo_8x3_64_loop + // Store 6 outputs + MOVQ (R13), R15 + VMOVDQU Y0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxTwo_9x6Xor_loop VZEROUPPER -mulAvxTwo_8x3_64_end: +mulAvxTwo_9x6Xor_end: RET -// func mulAvxTwo_8x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_9x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_8x4(SB), NOSPLIT, $8-88 +TEXT ·mulAvxTwo_9x7(SB), NOSPLIT, $0-88 // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 73 YMM used + // Destination kept on stack + // Full registers estimated 138 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_8x4_end + JZ mulAvxTwo_9x7_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI @@ -21312,691 +53781,1015 @@ TEXT ·mulAvxTwo_8x4(SB), NOSPLIT, $8-88 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 - MOVQ 168(DX), DX - MOVQ out_base+48(FP), R12 - MOVQ (R12), R13 - MOVQ 24(R12), R14 - MOVQ 48(R12), R15 - MOVQ 72(R12), R12 - MOVQ start+72(FP), BP - - // Add start offset to output - ADDQ BP, R13 - ADDQ BP, R14 - ADDQ BP, R15 - ADDQ BP, R12 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 // Add start offset to input - ADDQ BP, BX - ADDQ BP, SI - ADDQ BP, DI - ADDQ BP, R8 - ADDQ BP, R9 - ADDQ BP, R10 - ADDQ BP, R11 - ADDQ BP, DX - MOVQ $0x0000000f, BP - MOVQ BP, X4 - VPBROADCASTB X4, Y4 - -mulAvxTwo_8x4_loop: - // Clear 4 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X7 + VPBROADCASTB X7, Y7 - // Load and process 32 bytes from input 0 to 4 outputs - VMOVDQU (BX), Y7 +mulAvxTwo_9x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y10 ADDQ $0x20, BX - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU (CX), Y5 - VMOVDQU 32(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 - VMOVDQU 64(CX), Y5 - VMOVDQU 96(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 128(CX), Y5 - VMOVDQU 160(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 - VMOVDQU 192(CX), Y5 - VMOVDQU 224(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y0 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y1 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y2 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y3 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y4 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y5 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y6 - // Load and process 32 bytes from input 1 to 4 outputs - VMOVDQU (SI), Y7 + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y10 ADDQ $0x20, SI - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU 256(CX), Y5 - VMOVDQU 288(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 - VMOVDQU 320(CX), Y5 - VMOVDQU 352(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 384(CX), Y5 - VMOVDQU 416(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 - VMOVDQU 448(CX), Y5 - VMOVDQU 480(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Load and process 32 bytes from input 2 to 4 outputs - VMOVDQU (DI), Y7 + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y10 ADDQ $0x20, DI - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU 512(CX), Y5 - VMOVDQU 544(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 - VMOVDQU 576(CX), Y5 - VMOVDQU 608(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 640(CX), Y5 - VMOVDQU 672(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 - VMOVDQU 704(CX), Y5 - VMOVDQU 736(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Load and process 32 bytes from input 3 to 4 outputs - VMOVDQU (R8), Y7 + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y10 ADDQ $0x20, R8 - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU 768(CX), Y5 - VMOVDQU 800(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 - VMOVDQU 832(CX), Y5 - VMOVDQU 864(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 896(CX), Y5 - VMOVDQU 928(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 - VMOVDQU 960(CX), Y5 - VMOVDQU 992(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1472(CX), Y8 + VMOVDQU 1504(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1728(CX), Y8 + VMOVDQU 1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Load and process 32 bytes from input 4 to 4 outputs - VMOVDQU (R9), Y7 + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y10 ADDQ $0x20, R9 - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU 1024(CX), Y5 - VMOVDQU 1056(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 - VMOVDQU 1088(CX), Y5 - VMOVDQU 1120(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 1152(CX), Y5 - VMOVDQU 1184(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 - VMOVDQU 1216(CX), Y5 - VMOVDQU 1248(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1792(CX), Y8 + VMOVDQU 1824(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1856(CX), Y8 + VMOVDQU 1888(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1920(CX), Y8 + VMOVDQU 1952(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1984(CX), Y8 + VMOVDQU 2016(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2048(CX), Y8 + VMOVDQU 2080(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2112(CX), Y8 + VMOVDQU 2144(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2176(CX), Y8 + VMOVDQU 2208(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Load and process 32 bytes from input 5 to 4 outputs - VMOVDQU (R10), Y7 + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y10 ADDQ $0x20, R10 - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU 1280(CX), Y5 - VMOVDQU 1312(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 - VMOVDQU 1344(CX), Y5 - VMOVDQU 1376(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 1408(CX), Y5 - VMOVDQU 1440(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 - VMOVDQU 1472(CX), Y5 - VMOVDQU 1504(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2240(CX), Y8 + VMOVDQU 2272(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 2304(CX), Y8 + VMOVDQU 2336(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 2368(CX), Y8 + VMOVDQU 2400(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 2432(CX), Y8 + VMOVDQU 2464(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2496(CX), Y8 + VMOVDQU 2528(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2560(CX), Y8 + VMOVDQU 2592(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2624(CX), Y8 + VMOVDQU 2656(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Load and process 32 bytes from input 6 to 4 outputs - VMOVDQU (R11), Y7 + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y10 ADDQ $0x20, R11 - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU 1536(CX), Y5 - VMOVDQU 1568(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 - VMOVDQU 1600(CX), Y5 - VMOVDQU 1632(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 1664(CX), Y5 - VMOVDQU 1696(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 - VMOVDQU 1728(CX), Y5 - VMOVDQU 1760(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2688(CX), Y8 + VMOVDQU 2720(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 2752(CX), Y8 + VMOVDQU 2784(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 2816(CX), Y8 + VMOVDQU 2848(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 2880(CX), Y8 + VMOVDQU 2912(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2944(CX), Y8 + VMOVDQU 2976(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 3008(CX), Y8 + VMOVDQU 3040(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 3072(CX), Y8 + VMOVDQU 3104(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Load and process 32 bytes from input 7 to 4 outputs - VMOVDQU (DX), Y7 + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y10 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 3136(CX), Y8 + VMOVDQU 3168(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 3200(CX), Y8 + VMOVDQU 3232(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 3264(CX), Y8 + VMOVDQU 3296(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 3328(CX), Y8 + VMOVDQU 3360(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 3392(CX), Y8 + VMOVDQU 3424(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 3456(CX), Y8 + VMOVDQU 3488(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 3520(CX), Y8 + VMOVDQU 3552(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (DX), Y10 ADDQ $0x20, DX - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU 1792(CX), Y5 - VMOVDQU 1824(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 - VMOVDQU 1856(CX), Y5 - VMOVDQU 1888(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 1920(CX), Y5 - VMOVDQU 1952(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 - VMOVDQU 1984(CX), Y5 - VMOVDQU 2016(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 3584(CX), Y8 + VMOVDQU 3616(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 3648(CX), Y8 + VMOVDQU 3680(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 3712(CX), Y8 + VMOVDQU 3744(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 3776(CX), Y8 + VMOVDQU 3808(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 3840(CX), Y8 + VMOVDQU 3872(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 3904(CX), Y8 + VMOVDQU 3936(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 3968(CX), Y8 + VMOVDQU 4000(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Store 4 outputs - VMOVDQU Y0, (R13) - ADDQ $0x20, R13 - VMOVDQU Y1, (R14) - ADDQ $0x20, R14 - VMOVDQU Y2, (R15) - ADDQ $0x20, R15 - VMOVDQU Y3, (R12) - ADDQ $0x20, R12 + // Store 7 outputs + MOVQ (R13), R15 + VMOVDQU Y0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y6, (R15)(R14*1) // Prepare for next loop + ADDQ $0x20, R14 DECQ AX - JNZ mulAvxTwo_8x4_loop + JNZ mulAvxTwo_9x7_loop VZEROUPPER -mulAvxTwo_8x4_end: +mulAvxTwo_9x7_end: RET -// func mulAvxTwo_8x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_9x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_8x5(SB), NOSPLIT, $8-88 +TEXT ·mulAvxTwo_9x7Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 90 YMM used + // Destination kept on stack + // Full registers estimated 138 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_8x5_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), R9 - MOVQ 144(AX), R10 - MOVQ 168(AX), AX - MOVQ out_base+48(FP), R11 - MOVQ (R11), R12 - MOVQ 24(R11), R13 - MOVQ 48(R11), R14 - MOVQ 72(R11), R15 - MOVQ 96(R11), R11 - MOVQ start+72(FP), BP - - // Add start offset to output - ADDQ BP, R12 - ADDQ BP, R13 - ADDQ BP, R14 - ADDQ BP, R15 - ADDQ BP, R11 + JZ mulAvxTwo_9x7Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 // Add start offset to input - ADDQ BP, DX - ADDQ BP, BX - ADDQ BP, SI - ADDQ BP, DI - ADDQ BP, R8 - ADDQ BP, R9 - ADDQ BP, R10 - ADDQ BP, AX - MOVQ $0x0000000f, BP - MOVQ BP, X5 - VPBROADCASTB X5, Y5 - MOVQ n+80(FP), BP - SHRQ $0x05, BP - -mulAvxTwo_8x5_loop: - // Clear 5 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - - // Load and process 32 bytes from input 0 to 5 outputs - VMOVDQU (DX), Y8 - ADDQ $0x20, DX - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU (CX), Y6 - VMOVDQU 32(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 64(CX), Y6 - VMOVDQU 96(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 128(CX), Y6 - VMOVDQU 160(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 192(CX), Y6 - VMOVDQU 224(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 256(CX), Y6 - VMOVDQU 288(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X7 + VPBROADCASTB X7, Y7 - // Load and process 32 bytes from input 1 to 5 outputs - VMOVDQU (BX), Y8 +mulAvxTwo_9x7Xor_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y10 ADDQ $0x20, BX - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU 320(CX), Y6 - VMOVDQU 352(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 384(CX), Y6 - VMOVDQU 416(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 448(CX), Y6 - VMOVDQU 480(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 512(CX), Y6 - VMOVDQU 544(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 576(CX), Y6 - VMOVDQU 608(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 - - // Load and process 32 bytes from input 2 to 5 outputs - VMOVDQU (SI), Y8 - ADDQ $0x20, SI - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU 640(CX), Y6 - VMOVDQU 672(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 704(CX), Y6 - VMOVDQU 736(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 768(CX), Y6 - VMOVDQU 800(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 832(CX), Y6 - VMOVDQU 864(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 896(CX), Y6 - VMOVDQU 928(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y0 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y1 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y2 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y3 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y4 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y5 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y6 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Load and process 32 bytes from input 3 to 5 outputs - VMOVDQU (DI), Y8 + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y10 ADDQ $0x20, DI - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU 960(CX), Y6 - VMOVDQU 992(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 1024(CX), Y6 - VMOVDQU 1056(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 1088(CX), Y6 - VMOVDQU 1120(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 1152(CX), Y6 - VMOVDQU 1184(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 1216(CX), Y6 - VMOVDQU 1248(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Load and process 32 bytes from input 4 to 5 outputs - VMOVDQU (R8), Y8 + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y10 ADDQ $0x20, R8 - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU 1280(CX), Y6 - VMOVDQU 1312(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 1344(CX), Y6 - VMOVDQU 1376(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 1408(CX), Y6 - VMOVDQU 1440(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 1472(CX), Y6 - VMOVDQU 1504(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 1536(CX), Y6 - VMOVDQU 1568(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1472(CX), Y8 + VMOVDQU 1504(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1728(CX), Y8 + VMOVDQU 1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Load and process 32 bytes from input 5 to 5 outputs - VMOVDQU (R9), Y8 + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y10 ADDQ $0x20, R9 - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU 1600(CX), Y6 - VMOVDQU 1632(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 1664(CX), Y6 - VMOVDQU 1696(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 1728(CX), Y6 - VMOVDQU 1760(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 1792(CX), Y6 - VMOVDQU 1824(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 1856(CX), Y6 - VMOVDQU 1888(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1792(CX), Y8 + VMOVDQU 1824(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1856(CX), Y8 + VMOVDQU 1888(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1920(CX), Y8 + VMOVDQU 1952(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1984(CX), Y8 + VMOVDQU 2016(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2048(CX), Y8 + VMOVDQU 2080(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2112(CX), Y8 + VMOVDQU 2144(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2176(CX), Y8 + VMOVDQU 2208(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Load and process 32 bytes from input 6 to 5 outputs - VMOVDQU (R10), Y8 + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y10 ADDQ $0x20, R10 - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU 1920(CX), Y6 - VMOVDQU 1952(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 1984(CX), Y6 - VMOVDQU 2016(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 2048(CX), Y6 - VMOVDQU 2080(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 2112(CX), Y6 - VMOVDQU 2144(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 2176(CX), Y6 - VMOVDQU 2208(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2240(CX), Y8 + VMOVDQU 2272(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 2304(CX), Y8 + VMOVDQU 2336(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 2368(CX), Y8 + VMOVDQU 2400(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 2432(CX), Y8 + VMOVDQU 2464(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2496(CX), Y8 + VMOVDQU 2528(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2560(CX), Y8 + VMOVDQU 2592(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2624(CX), Y8 + VMOVDQU 2656(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Load and process 32 bytes from input 7 to 5 outputs - VMOVDQU (AX), Y8 - ADDQ $0x20, AX - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU 2240(CX), Y6 - VMOVDQU 2272(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 2304(CX), Y6 - VMOVDQU 2336(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 2368(CX), Y6 - VMOVDQU 2400(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 2432(CX), Y6 - VMOVDQU 2464(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 2496(CX), Y6 - VMOVDQU 2528(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y10 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2688(CX), Y8 + VMOVDQU 2720(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 2752(CX), Y8 + VMOVDQU 2784(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 2816(CX), Y8 + VMOVDQU 2848(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 2880(CX), Y8 + VMOVDQU 2912(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2944(CX), Y8 + VMOVDQU 2976(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 3008(CX), Y8 + VMOVDQU 3040(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 3072(CX), Y8 + VMOVDQU 3104(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y10 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 3136(CX), Y8 + VMOVDQU 3168(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 3200(CX), Y8 + VMOVDQU 3232(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 3264(CX), Y8 + VMOVDQU 3296(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 3328(CX), Y8 + VMOVDQU 3360(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 3392(CX), Y8 + VMOVDQU 3424(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 3456(CX), Y8 + VMOVDQU 3488(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 3520(CX), Y8 + VMOVDQU 3552(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Store 5 outputs - VMOVDQU Y0, (R12) - ADDQ $0x20, R12 - VMOVDQU Y1, (R13) - ADDQ $0x20, R13 - VMOVDQU Y2, (R14) - ADDQ $0x20, R14 - VMOVDQU Y3, (R15) - ADDQ $0x20, R15 - VMOVDQU Y4, (R11) - ADDQ $0x20, R11 + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 3584(CX), Y8 + VMOVDQU 3616(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 3648(CX), Y8 + VMOVDQU 3680(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 3712(CX), Y8 + VMOVDQU 3744(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 3776(CX), Y8 + VMOVDQU 3808(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 3840(CX), Y8 + VMOVDQU 3872(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 3904(CX), Y8 + VMOVDQU 3936(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 3968(CX), Y8 + VMOVDQU 4000(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Store 7 outputs + MOVQ (R13), R15 + VMOVDQU Y0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y6, (R15)(R14*1) // Prepare for next loop - DECQ BP - JNZ mulAvxTwo_8x5_loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxTwo_9x7Xor_loop VZEROUPPER -mulAvxTwo_8x5_end: +mulAvxTwo_9x7Xor_end: RET -// func mulAvxTwo_8x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_9x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_8x6(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_9x8(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack - // Full registers estimated 107 YMM used + // Full registers estimated 157 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_8x6_end + JZ mulAvxTwo_9x8_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI @@ -22005,410 +54798,551 @@ TEXT ·mulAvxTwo_8x6(SB), NOSPLIT, $0-88 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 - MOVQ 168(DX), DX - MOVQ out_base+48(FP), R12 - MOVQ start+72(FP), R13 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 // Add start offset to input - ADDQ R13, BX - ADDQ R13, SI - ADDQ R13, DI - ADDQ R13, R8 - ADDQ R13, R9 - ADDQ R13, R10 - ADDQ R13, R11 - ADDQ R13, DX - MOVQ $0x0000000f, R14 - MOVQ R14, X6 - VPBROADCASTB X6, Y6 - -mulAvxTwo_8x6_loop: - // Clear 6 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X8 + VPBROADCASTB X8, Y8 - // Load and process 32 bytes from input 0 to 6 outputs - VMOVDQU (BX), Y9 +mulAvxTwo_9x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y11 ADDQ $0x20, BX - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 - VMOVDQU (CX), Y7 - VMOVDQU 32(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 - VMOVDQU 64(CX), Y7 - VMOVDQU 96(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 - VMOVDQU 128(CX), Y7 - VMOVDQU 160(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 - VMOVDQU 192(CX), Y7 - VMOVDQU 224(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 - VMOVDQU 256(CX), Y7 - VMOVDQU 288(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 - VMOVDQU 320(CX), Y7 - VMOVDQU 352(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y0 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y1 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y2 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y3 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y4 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y5 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y6 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y7 - // Load and process 32 bytes from input 1 to 6 outputs - VMOVDQU (SI), Y9 + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y11 ADDQ $0x20, SI - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 - VMOVDQU 384(CX), Y7 - VMOVDQU 416(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 - VMOVDQU 448(CX), Y7 - VMOVDQU 480(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 - VMOVDQU 512(CX), Y7 - VMOVDQU 544(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 - VMOVDQU 576(CX), Y7 - VMOVDQU 608(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 - VMOVDQU 640(CX), Y7 - VMOVDQU 672(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 - VMOVDQU 704(CX), Y7 - VMOVDQU 736(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 - // Load and process 32 bytes from input 2 to 6 outputs - VMOVDQU (DI), Y9 + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y11 ADDQ $0x20, DI - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 - VMOVDQU 768(CX), Y7 - VMOVDQU 800(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 - VMOVDQU 832(CX), Y7 - VMOVDQU 864(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 - VMOVDQU 896(CX), Y7 - VMOVDQU 928(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 - VMOVDQU 960(CX), Y7 - VMOVDQU 992(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 - VMOVDQU 1024(CX), Y7 - VMOVDQU 1056(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 - VMOVDQU 1088(CX), Y7 - VMOVDQU 1120(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 - // Load and process 32 bytes from input 3 to 6 outputs - VMOVDQU (R8), Y9 + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y11 ADDQ $0x20, R8 - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 - VMOVDQU 1152(CX), Y7 - VMOVDQU 1184(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 - VMOVDQU 1216(CX), Y7 - VMOVDQU 1248(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 - VMOVDQU 1280(CX), Y7 - VMOVDQU 1312(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 - VMOVDQU 1344(CX), Y7 - VMOVDQU 1376(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 - VMOVDQU 1408(CX), Y7 - VMOVDQU 1440(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 - VMOVDQU 1472(CX), Y7 - VMOVDQU 1504(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 - - // Load and process 32 bytes from input 4 to 6 outputs - VMOVDQU (R9), Y9 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 - VMOVDQU 1536(CX), Y7 - VMOVDQU 1568(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 - VMOVDQU 1600(CX), Y7 - VMOVDQU 1632(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 - VMOVDQU 1664(CX), Y7 - VMOVDQU 1696(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 - VMOVDQU 1728(CX), Y7 - VMOVDQU 1760(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 - VMOVDQU 1792(CX), Y7 - VMOVDQU 1824(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 - VMOVDQU 1856(CX), Y7 - VMOVDQU 1888(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1600(CX), Y9 + VMOVDQU 1632(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1664(CX), Y9 + VMOVDQU 1696(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 - // Load and process 32 bytes from input 5 to 6 outputs - VMOVDQU (R10), Y9 + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y11 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2048(CX), Y9 + VMOVDQU 2080(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2112(CX), Y9 + VMOVDQU 2144(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2176(CX), Y9 + VMOVDQU 2208(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2240(CX), Y9 + VMOVDQU 2272(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2304(CX), Y9 + VMOVDQU 2336(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2368(CX), Y9 + VMOVDQU 2400(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2432(CX), Y9 + VMOVDQU 2464(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 2496(CX), Y9 + VMOVDQU 2528(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y11 ADDQ $0x20, R10 - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 - VMOVDQU 1920(CX), Y7 - VMOVDQU 1952(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 - VMOVDQU 1984(CX), Y7 - VMOVDQU 2016(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 - VMOVDQU 2048(CX), Y7 - VMOVDQU 2080(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 - VMOVDQU 2112(CX), Y7 - VMOVDQU 2144(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 - VMOVDQU 2176(CX), Y7 - VMOVDQU 2208(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 - VMOVDQU 2240(CX), Y7 - VMOVDQU 2272(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2560(CX), Y9 + VMOVDQU 2592(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2624(CX), Y9 + VMOVDQU 2656(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2688(CX), Y9 + VMOVDQU 2720(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2752(CX), Y9 + VMOVDQU 2784(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2816(CX), Y9 + VMOVDQU 2848(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2880(CX), Y9 + VMOVDQU 2912(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2944(CX), Y9 + VMOVDQU 2976(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 3008(CX), Y9 + VMOVDQU 3040(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 - // Load and process 32 bytes from input 6 to 6 outputs - VMOVDQU (R11), Y9 + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y11 ADDQ $0x20, R11 - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 - VMOVDQU 2304(CX), Y7 - VMOVDQU 2336(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 - VMOVDQU 2368(CX), Y7 - VMOVDQU 2400(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 - VMOVDQU 2432(CX), Y7 - VMOVDQU 2464(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 - VMOVDQU 2496(CX), Y7 - VMOVDQU 2528(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 - VMOVDQU 2560(CX), Y7 - VMOVDQU 2592(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 - VMOVDQU 2624(CX), Y7 - VMOVDQU 2656(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3072(CX), Y9 + VMOVDQU 3104(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 3136(CX), Y9 + VMOVDQU 3168(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 3200(CX), Y9 + VMOVDQU 3232(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 3264(CX), Y9 + VMOVDQU 3296(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 3328(CX), Y9 + VMOVDQU 3360(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 3392(CX), Y9 + VMOVDQU 3424(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 3456(CX), Y9 + VMOVDQU 3488(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 3520(CX), Y9 + VMOVDQU 3552(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 - // Load and process 32 bytes from input 7 to 6 outputs - VMOVDQU (DX), Y9 + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y11 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3584(CX), Y9 + VMOVDQU 3616(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 3648(CX), Y9 + VMOVDQU 3680(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 3712(CX), Y9 + VMOVDQU 3744(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 3776(CX), Y9 + VMOVDQU 3808(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 3840(CX), Y9 + VMOVDQU 3872(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 3904(CX), Y9 + VMOVDQU 3936(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 3968(CX), Y9 + VMOVDQU 4000(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 4032(CX), Y9 + VMOVDQU 4064(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (DX), Y11 ADDQ $0x20, DX - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 - VMOVDQU 2688(CX), Y7 - VMOVDQU 2720(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 - VMOVDQU 2752(CX), Y7 - VMOVDQU 2784(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 - VMOVDQU 2816(CX), Y7 - VMOVDQU 2848(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 - VMOVDQU 2880(CX), Y7 - VMOVDQU 2912(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 - VMOVDQU 2944(CX), Y7 - VMOVDQU 2976(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 - VMOVDQU 3008(CX), Y7 - VMOVDQU 3040(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 4096(CX), Y9 + VMOVDQU 4128(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 4160(CX), Y9 + VMOVDQU 4192(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 4224(CX), Y9 + VMOVDQU 4256(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 4288(CX), Y9 + VMOVDQU 4320(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 4352(CX), Y9 + VMOVDQU 4384(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 4416(CX), Y9 + VMOVDQU 4448(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 4480(CX), Y9 + VMOVDQU 4512(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 4544(CX), Y9 + VMOVDQU 4576(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 - // Store 6 outputs - MOVQ (R12), R14 - VMOVDQU Y0, (R14)(R13*1) - MOVQ 24(R12), R14 - VMOVDQU Y1, (R14)(R13*1) - MOVQ 48(R12), R14 - VMOVDQU Y2, (R14)(R13*1) - MOVQ 72(R12), R14 - VMOVDQU Y3, (R14)(R13*1) - MOVQ 96(R12), R14 - VMOVDQU Y4, (R14)(R13*1) - MOVQ 120(R12), R14 - VMOVDQU Y5, (R14)(R13*1) + // Store 8 outputs + MOVQ (R13), R15 + VMOVDQU Y0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y7, (R15)(R14*1) // Prepare for next loop - ADDQ $0x20, R13 + ADDQ $0x20, R14 DECQ AX - JNZ mulAvxTwo_8x6_loop + JNZ mulAvxTwo_9x8_loop VZEROUPPER -mulAvxTwo_8x6_end: +mulAvxTwo_9x8_end: RET -// func mulAvxTwo_8x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_9x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_8x7(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_9x8Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack - // Full registers estimated 124 YMM used + // Full registers estimated 157 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_8x7_end + JZ mulAvxTwo_9x8Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI @@ -22417,461 +55351,1183 @@ TEXT ·mulAvxTwo_8x7(SB), NOSPLIT, $0-88 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 - MOVQ 168(DX), DX - MOVQ out_base+48(FP), R12 - MOVQ start+72(FP), R13 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 // Add start offset to input - ADDQ R13, BX - ADDQ R13, SI - ADDQ R13, DI - ADDQ R13, R8 - ADDQ R13, R9 - ADDQ R13, R10 - ADDQ R13, R11 - ADDQ R13, DX - MOVQ $0x0000000f, R14 - MOVQ R14, X7 - VPBROADCASTB X7, Y7 - -mulAvxTwo_8x7_loop: - // Clear 7 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X8 + VPBROADCASTB X8, Y8 - // Load and process 32 bytes from input 0 to 7 outputs - VMOVDQU (BX), Y10 +mulAvxTwo_9x8Xor_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y11 ADDQ $0x20, BX - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU (CX), Y8 - VMOVDQU 32(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y0 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 64(CX), Y8 - VMOVDQU 96(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y1 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 128(CX), Y8 - VMOVDQU 160(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y2 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 192(CX), Y8 - VMOVDQU 224(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y3 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 256(CX), Y8 - VMOVDQU 288(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y4 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 320(CX), Y8 - VMOVDQU 352(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y5 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 384(CX), Y8 - VMOVDQU 416(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y6 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + MOVQ 168(R13), R15 + VMOVDQU (R15)(R14*1), Y7 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 - // Load and process 32 bytes from input 1 to 7 outputs - VMOVDQU (SI), Y10 + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y11 ADDQ $0x20, SI - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU 448(CX), Y8 - VMOVDQU 480(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 512(CX), Y8 - VMOVDQU 544(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 576(CX), Y8 - VMOVDQU 608(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 640(CX), Y8 - VMOVDQU 672(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 704(CX), Y8 - VMOVDQU 736(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 768(CX), Y8 - VMOVDQU 800(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 832(CX), Y8 - VMOVDQU 864(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 - // Load and process 32 bytes from input 2 to 7 outputs - VMOVDQU (DI), Y10 + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y11 ADDQ $0x20, DI - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU 896(CX), Y8 - VMOVDQU 928(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 960(CX), Y8 - VMOVDQU 992(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 1024(CX), Y8 - VMOVDQU 1056(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 1088(CX), Y8 - VMOVDQU 1120(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 1152(CX), Y8 - VMOVDQU 1184(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 1216(CX), Y8 - VMOVDQU 1248(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 1280(CX), Y8 - VMOVDQU 1312(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 - // Load and process 32 bytes from input 3 to 7 outputs - VMOVDQU (R8), Y10 + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y11 ADDQ $0x20, R8 - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU 1344(CX), Y8 - VMOVDQU 1376(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 1408(CX), Y8 - VMOVDQU 1440(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1600(CX), Y9 + VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 1472(CX), Y8 - VMOVDQU 1504(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1664(CX), Y9 + VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 1536(CX), Y8 - VMOVDQU 1568(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 1600(CX), Y8 - VMOVDQU 1632(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 1664(CX), Y8 - VMOVDQU 1696(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 1728(CX), Y8 - VMOVDQU 1760(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 - // Load and process 32 bytes from input 4 to 7 outputs - VMOVDQU (R9), Y10 + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y11 ADDQ $0x20, R9 - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU 1792(CX), Y8 - VMOVDQU 1824(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2048(CX), Y9 + VMOVDQU 2080(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2112(CX), Y9 + VMOVDQU 2144(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2176(CX), Y9 + VMOVDQU 2208(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2240(CX), Y9 + VMOVDQU 2272(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2304(CX), Y9 + VMOVDQU 2336(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2368(CX), Y9 + VMOVDQU 2400(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2432(CX), Y9 + VMOVDQU 2464(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 2496(CX), Y9 + VMOVDQU 2528(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y11 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2560(CX), Y9 + VMOVDQU 2592(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2624(CX), Y9 + VMOVDQU 2656(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2688(CX), Y9 + VMOVDQU 2720(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2752(CX), Y9 + VMOVDQU 2784(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2816(CX), Y9 + VMOVDQU 2848(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2880(CX), Y9 + VMOVDQU 2912(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2944(CX), Y9 + VMOVDQU 2976(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 3008(CX), Y9 + VMOVDQU 3040(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y11 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3072(CX), Y9 + VMOVDQU 3104(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 3136(CX), Y9 + VMOVDQU 3168(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 3200(CX), Y9 + VMOVDQU 3232(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 3264(CX), Y9 + VMOVDQU 3296(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 3328(CX), Y9 + VMOVDQU 3360(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 3392(CX), Y9 + VMOVDQU 3424(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 3456(CX), Y9 + VMOVDQU 3488(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 3520(CX), Y9 + VMOVDQU 3552(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y11 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3584(CX), Y9 + VMOVDQU 3616(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 1856(CX), Y8 - VMOVDQU 1888(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 3648(CX), Y9 + VMOVDQU 3680(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 1920(CX), Y8 - VMOVDQU 1952(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 3712(CX), Y9 + VMOVDQU 3744(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 1984(CX), Y8 - VMOVDQU 2016(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 3776(CX), Y9 + VMOVDQU 3808(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 2048(CX), Y8 - VMOVDQU 2080(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 3840(CX), Y9 + VMOVDQU 3872(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 2112(CX), Y8 - VMOVDQU 2144(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 3904(CX), Y9 + VMOVDQU 3936(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 2176(CX), Y8 - VMOVDQU 2208(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 3968(CX), Y9 + VMOVDQU 4000(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 4032(CX), Y9 + VMOVDQU 4064(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 - // Load and process 32 bytes from input 5 to 7 outputs - VMOVDQU (R10), Y10 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU 2240(CX), Y8 - VMOVDQU 2272(CX), Y9 - VPSHUFB Y10, Y8, Y8 + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 4096(CX), Y9 + VMOVDQU 4128(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 2304(CX), Y8 - VMOVDQU 2336(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 4160(CX), Y9 + VMOVDQU 4192(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 2368(CX), Y8 - VMOVDQU 2400(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 4224(CX), Y9 + VMOVDQU 4256(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 2432(CX), Y8 - VMOVDQU 2464(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 4288(CX), Y9 + VMOVDQU 4320(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 2496(CX), Y8 - VMOVDQU 2528(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 4352(CX), Y9 + VMOVDQU 4384(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 2560(CX), Y8 - VMOVDQU 2592(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 4416(CX), Y9 + VMOVDQU 4448(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 2624(CX), Y8 - VMOVDQU 2656(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 4480(CX), Y9 + VMOVDQU 4512(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 4544(CX), Y9 + VMOVDQU 4576(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Store 8 outputs + MOVQ (R13), R15 + VMOVDQU Y0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxTwo_9x8Xor_loop + VZEROUPPER + +mulAvxTwo_9x8Xor_end: + RET + +// func mulAvxTwo_9x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_9x9(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 176 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x9_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X9 + VPBROADCASTB X9, Y9 + +mulAvxTwo_9x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y0 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y1 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y2 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y3 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y4 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y5 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y6 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y7 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y8 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1152(CX), Y10 + VMOVDQU 1184(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 1216(CX), Y10 + VMOVDQU 1248(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 1280(CX), Y10 + VMOVDQU 1312(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 1344(CX), Y10 + VMOVDQU 1376(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 1408(CX), Y10 + VMOVDQU 1440(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 1472(CX), Y10 + VMOVDQU 1504(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 1536(CX), Y10 + VMOVDQU 1568(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1600(CX), Y10 + VMOVDQU 1632(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1664(CX), Y10 + VMOVDQU 1696(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y12 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1728(CX), Y10 + VMOVDQU 1760(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 1792(CX), Y10 + VMOVDQU 1824(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 1856(CX), Y10 + VMOVDQU 1888(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 1920(CX), Y10 + VMOVDQU 1952(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 1984(CX), Y10 + VMOVDQU 2016(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 2048(CX), Y10 + VMOVDQU 2080(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 2112(CX), Y10 + VMOVDQU 2144(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 2176(CX), Y10 + VMOVDQU 2208(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 2240(CX), Y10 + VMOVDQU 2272(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y12 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2304(CX), Y10 + VMOVDQU 2336(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 2368(CX), Y10 + VMOVDQU 2400(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 2432(CX), Y10 + VMOVDQU 2464(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 2496(CX), Y10 + VMOVDQU 2528(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 2560(CX), Y10 + VMOVDQU 2592(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 2624(CX), Y10 + VMOVDQU 2656(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 2688(CX), Y10 + VMOVDQU 2720(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 2752(CX), Y10 + VMOVDQU 2784(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 2816(CX), Y10 + VMOVDQU 2848(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 - // Load and process 32 bytes from input 6 to 7 outputs - VMOVDQU (R11), Y10 + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y12 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2880(CX), Y10 + VMOVDQU 2912(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 2944(CX), Y10 + VMOVDQU 2976(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 3008(CX), Y10 + VMOVDQU 3040(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 3072(CX), Y10 + VMOVDQU 3104(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 3136(CX), Y10 + VMOVDQU 3168(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 3200(CX), Y10 + VMOVDQU 3232(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 3264(CX), Y10 + VMOVDQU 3296(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 3328(CX), Y10 + VMOVDQU 3360(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 3392(CX), Y10 + VMOVDQU 3424(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y12 ADDQ $0x20, R11 - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU 2688(CX), Y8 - VMOVDQU 2720(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 2752(CX), Y8 - VMOVDQU 2784(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 2816(CX), Y8 - VMOVDQU 2848(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 2880(CX), Y8 - VMOVDQU 2912(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 2944(CX), Y8 - VMOVDQU 2976(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 3008(CX), Y8 - VMOVDQU 3040(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 3072(CX), Y8 - VMOVDQU 3104(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 3456(CX), Y10 + VMOVDQU 3488(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 3520(CX), Y10 + VMOVDQU 3552(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 3584(CX), Y10 + VMOVDQU 3616(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 3648(CX), Y10 + VMOVDQU 3680(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 3712(CX), Y10 + VMOVDQU 3744(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 3776(CX), Y10 + VMOVDQU 3808(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 3840(CX), Y10 + VMOVDQU 3872(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 3904(CX), Y10 + VMOVDQU 3936(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 3968(CX), Y10 + VMOVDQU 4000(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 - // Load and process 32 bytes from input 7 to 7 outputs - VMOVDQU (DX), Y10 + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y12 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 4032(CX), Y10 + VMOVDQU 4064(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 4096(CX), Y10 + VMOVDQU 4128(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 4160(CX), Y10 + VMOVDQU 4192(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 4224(CX), Y10 + VMOVDQU 4256(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 4288(CX), Y10 + VMOVDQU 4320(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 4352(CX), Y10 + VMOVDQU 4384(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 4416(CX), Y10 + VMOVDQU 4448(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 4480(CX), Y10 + VMOVDQU 4512(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 4544(CX), Y10 + VMOVDQU 4576(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (DX), Y12 ADDQ $0x20, DX - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU 3136(CX), Y8 - VMOVDQU 3168(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 3200(CX), Y8 - VMOVDQU 3232(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 3264(CX), Y8 - VMOVDQU 3296(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 3328(CX), Y8 - VMOVDQU 3360(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 3392(CX), Y8 - VMOVDQU 3424(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 3456(CX), Y8 - VMOVDQU 3488(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 3520(CX), Y8 - VMOVDQU 3552(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 4608(CX), Y10 + VMOVDQU 4640(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 4672(CX), Y10 + VMOVDQU 4704(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 4736(CX), Y10 + VMOVDQU 4768(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 4800(CX), Y10 + VMOVDQU 4832(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 4864(CX), Y10 + VMOVDQU 4896(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 4928(CX), Y10 + VMOVDQU 4960(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 4992(CX), Y10 + VMOVDQU 5024(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 5056(CX), Y10 + VMOVDQU 5088(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 5120(CX), Y10 + VMOVDQU 5152(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 - // Store 7 outputs - MOVQ (R12), R14 - VMOVDQU Y0, (R14)(R13*1) - MOVQ 24(R12), R14 - VMOVDQU Y1, (R14)(R13*1) - MOVQ 48(R12), R14 - VMOVDQU Y2, (R14)(R13*1) - MOVQ 72(R12), R14 - VMOVDQU Y3, (R14)(R13*1) - MOVQ 96(R12), R14 - VMOVDQU Y4, (R14)(R13*1) - MOVQ 120(R12), R14 - VMOVDQU Y5, (R14)(R13*1) - MOVQ 144(R12), R14 - VMOVDQU Y6, (R14)(R13*1) + // Store 9 outputs + MOVQ (R13), R15 + VMOVDQU Y0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y8, (R15)(R14*1) // Prepare for next loop - ADDQ $0x20, R13 + ADDQ $0x20, R14 DECQ AX - JNZ mulAvxTwo_8x7_loop + JNZ mulAvxTwo_9x9_loop VZEROUPPER -mulAvxTwo_8x7_end: +mulAvxTwo_9x9_end: RET -// func mulAvxTwo_8x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_9x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_8x8(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_9x9Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack - // Full registers estimated 141 YMM used + // Full registers estimated 176 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_8x8_end + JZ mulAvxTwo_9x9Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI @@ -22880,1077 +56536,1296 @@ TEXT ·mulAvxTwo_8x8(SB), NOSPLIT, $0-88 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 - MOVQ 168(DX), DX - MOVQ out_base+48(FP), R12 - MOVQ start+72(FP), R13 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 // Add start offset to input - ADDQ R13, BX - ADDQ R13, SI - ADDQ R13, DI - ADDQ R13, R8 - ADDQ R13, R9 - ADDQ R13, R10 - ADDQ R13, R11 - ADDQ R13, DX - MOVQ $0x0000000f, R14 - MOVQ R14, X8 - VPBROADCASTB X8, Y8 - -mulAvxTwo_8x8_loop: - // Clear 8 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X9 + VPBROADCASTB X9, Y9 - // Load and process 32 bytes from input 0 to 8 outputs - VMOVDQU (BX), Y11 +mulAvxTwo_9x9Xor_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y12 ADDQ $0x20, BX - VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU (CX), Y9 - VMOVDQU 32(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y0 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 64(CX), Y9 - VMOVDQU 96(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y1 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 - VMOVDQU 128(CX), Y9 - VMOVDQU 160(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y2 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 192(CX), Y9 - VMOVDQU 224(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y3 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 - VMOVDQU 256(CX), Y9 - VMOVDQU 288(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y4 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 320(CX), Y9 - VMOVDQU 352(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y5 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 - VMOVDQU 384(CX), Y9 - VMOVDQU 416(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y6 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 448(CX), Y9 - VMOVDQU 480(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + MOVQ 168(R13), R15 + VMOVDQU (R15)(R14*1), Y7 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + MOVQ 192(R13), R15 + VMOVDQU (R15)(R14*1), Y8 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 - // Load and process 32 bytes from input 1 to 8 outputs - VMOVDQU (SI), Y11 + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y12 ADDQ $0x20, SI - VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU 512(CX), Y9 - VMOVDQU 544(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 576(CX), Y9 - VMOVDQU 608(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 - VMOVDQU 640(CX), Y9 - VMOVDQU 672(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 704(CX), Y9 - VMOVDQU 736(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 - VMOVDQU 768(CX), Y9 - VMOVDQU 800(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 832(CX), Y9 - VMOVDQU 864(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 - VMOVDQU 896(CX), Y9 - VMOVDQU 928(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 960(CX), Y9 - VMOVDQU 992(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 - // Load and process 32 bytes from input 2 to 8 outputs - VMOVDQU (DI), Y11 + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y12 ADDQ $0x20, DI - VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU 1024(CX), Y9 - VMOVDQU 1056(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1152(CX), Y10 + VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 1088(CX), Y9 - VMOVDQU 1120(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 1216(CX), Y10 + VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 - VMOVDQU 1152(CX), Y9 - VMOVDQU 1184(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 1280(CX), Y10 + VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 1216(CX), Y9 - VMOVDQU 1248(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 1344(CX), Y10 + VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 - VMOVDQU 1280(CX), Y9 - VMOVDQU 1312(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 1408(CX), Y10 + VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 1344(CX), Y9 - VMOVDQU 1376(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 1472(CX), Y10 + VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 - VMOVDQU 1408(CX), Y9 - VMOVDQU 1440(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 1536(CX), Y10 + VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 1472(CX), Y9 - VMOVDQU 1504(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1600(CX), Y10 + VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1664(CX), Y10 + VMOVDQU 1696(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 - // Load and process 32 bytes from input 3 to 8 outputs - VMOVDQU (R8), Y11 + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y12 ADDQ $0x20, R8 - VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU 1536(CX), Y9 - VMOVDQU 1568(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1728(CX), Y10 + VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 1600(CX), Y9 - VMOVDQU 1632(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 1792(CX), Y10 + VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 - VMOVDQU 1664(CX), Y9 - VMOVDQU 1696(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 1856(CX), Y10 + VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 1728(CX), Y9 - VMOVDQU 1760(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 1920(CX), Y10 + VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 - VMOVDQU 1792(CX), Y9 - VMOVDQU 1824(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 1984(CX), Y10 + VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 1856(CX), Y9 - VMOVDQU 1888(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 2048(CX), Y10 + VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 - VMOVDQU 1920(CX), Y9 - VMOVDQU 1952(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 2112(CX), Y10 + VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 1984(CX), Y9 - VMOVDQU 2016(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 2176(CX), Y10 + VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 2240(CX), Y10 + VMOVDQU 2272(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 - // Load and process 32 bytes from input 4 to 8 outputs - VMOVDQU (R9), Y11 + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y12 ADDQ $0x20, R9 - VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU 2048(CX), Y9 - VMOVDQU 2080(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2304(CX), Y10 + VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 2112(CX), Y9 - VMOVDQU 2144(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 2368(CX), Y10 + VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 - VMOVDQU 2176(CX), Y9 - VMOVDQU 2208(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 2432(CX), Y10 + VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 2240(CX), Y9 - VMOVDQU 2272(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 2496(CX), Y10 + VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 - VMOVDQU 2304(CX), Y9 - VMOVDQU 2336(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 2560(CX), Y10 + VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 2368(CX), Y9 - VMOVDQU 2400(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 2624(CX), Y10 + VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 - VMOVDQU 2432(CX), Y9 - VMOVDQU 2464(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 2688(CX), Y10 + VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 2496(CX), Y9 - VMOVDQU 2528(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 2752(CX), Y10 + VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 2816(CX), Y10 + VMOVDQU 2848(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 - // Load and process 32 bytes from input 5 to 8 outputs - VMOVDQU (R10), Y11 + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y12 ADDQ $0x20, R10 - VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU 2560(CX), Y9 - VMOVDQU 2592(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 2624(CX), Y9 - VMOVDQU 2656(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 - VMOVDQU 2688(CX), Y9 - VMOVDQU 2720(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 2752(CX), Y9 - VMOVDQU 2784(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 - VMOVDQU 2816(CX), Y9 - VMOVDQU 2848(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 2880(CX), Y9 - VMOVDQU 2912(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 - VMOVDQU 2944(CX), Y9 - VMOVDQU 2976(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 3008(CX), Y9 - VMOVDQU 3040(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2880(CX), Y10 + VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 - - // Load and process 32 bytes from input 6 to 8 outputs - VMOVDQU (R11), Y11 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU 3072(CX), Y9 - VMOVDQU 3104(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 2944(CX), Y10 + VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 3136(CX), Y9 - VMOVDQU 3168(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 3008(CX), Y10 + VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 - VMOVDQU 3200(CX), Y9 - VMOVDQU 3232(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 3072(CX), Y10 + VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 3264(CX), Y9 - VMOVDQU 3296(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 3136(CX), Y10 + VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 - VMOVDQU 3328(CX), Y9 - VMOVDQU 3360(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 3200(CX), Y10 + VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 3392(CX), Y9 - VMOVDQU 3424(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 3264(CX), Y10 + VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 - VMOVDQU 3456(CX), Y9 - VMOVDQU 3488(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 3328(CX), Y10 + VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 3520(CX), Y9 - VMOVDQU 3552(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 3392(CX), Y10 + VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 - // Load and process 32 bytes from input 7 to 8 outputs - VMOVDQU (DX), Y11 - ADDQ $0x20, DX - VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU 3584(CX), Y9 - VMOVDQU 3616(CX), Y10 - VPSHUFB Y11, Y9, Y9 + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y12 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 3456(CX), Y10 + VMOVDQU 3488(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 3648(CX), Y9 - VMOVDQU 3680(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 3520(CX), Y10 + VMOVDQU 3552(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 - VMOVDQU 3712(CX), Y9 - VMOVDQU 3744(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 3584(CX), Y10 + VMOVDQU 3616(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 3776(CX), Y9 - VMOVDQU 3808(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 3648(CX), Y10 + VMOVDQU 3680(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 - VMOVDQU 3840(CX), Y9 - VMOVDQU 3872(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 3712(CX), Y10 + VMOVDQU 3744(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 3904(CX), Y9 - VMOVDQU 3936(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 3776(CX), Y10 + VMOVDQU 3808(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 - VMOVDQU 3968(CX), Y9 - VMOVDQU 4000(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 3840(CX), Y10 + VMOVDQU 3872(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 4032(CX), Y9 - VMOVDQU 4064(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 3904(CX), Y10 + VMOVDQU 3936(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 - - // Store 8 outputs - MOVQ (R12), R14 - VMOVDQU Y0, (R14)(R13*1) - MOVQ 24(R12), R14 - VMOVDQU Y1, (R14)(R13*1) - MOVQ 48(R12), R14 - VMOVDQU Y2, (R14)(R13*1) - MOVQ 72(R12), R14 - VMOVDQU Y3, (R14)(R13*1) - MOVQ 96(R12), R14 - VMOVDQU Y4, (R14)(R13*1) - MOVQ 120(R12), R14 - VMOVDQU Y5, (R14)(R13*1) - MOVQ 144(R12), R14 - VMOVDQU Y6, (R14)(R13*1) - MOVQ 168(R12), R14 - VMOVDQU Y7, (R14)(R13*1) - - // Prepare for next loop - ADDQ $0x20, R13 - DECQ AX - JNZ mulAvxTwo_8x8_loop - VZEROUPPER - -mulAvxTwo_8x8_end: - RET - -// func mulAvxTwo_8x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_8x9(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 158 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_8x9_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), DX - MOVQ out_base+48(FP), R12 - MOVQ start+72(FP), R13 - - // Add start offset to input - ADDQ R13, BX - ADDQ R13, SI - ADDQ R13, DI - ADDQ R13, R8 - ADDQ R13, R9 - ADDQ R13, R10 - ADDQ R13, R11 - ADDQ R13, DX - MOVQ $0x0000000f, R14 - MOVQ R14, X9 - VPBROADCASTB X9, Y9 - -mulAvxTwo_8x9_loop: - // Clear 9 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 3968(CX), Y10 + VMOVDQU 4000(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 - // Load and process 32 bytes from input 0 to 9 outputs - VMOVDQU (BX), Y12 - ADDQ $0x20, BX + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y12 + ADDQ $0x20, R12 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 - VMOVDQU (CX), Y10 - VMOVDQU 32(CX), Y11 + VMOVDQU 4032(CX), Y10 + VMOVDQU 4064(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 - VMOVDQU 64(CX), Y10 - VMOVDQU 96(CX), Y11 + VMOVDQU 4096(CX), Y10 + VMOVDQU 4128(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 - VMOVDQU 128(CX), Y10 - VMOVDQU 160(CX), Y11 + VMOVDQU 4160(CX), Y10 + VMOVDQU 4192(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 - VMOVDQU 192(CX), Y10 - VMOVDQU 224(CX), Y11 + VMOVDQU 4224(CX), Y10 + VMOVDQU 4256(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 - VMOVDQU 256(CX), Y10 - VMOVDQU 288(CX), Y11 + VMOVDQU 4288(CX), Y10 + VMOVDQU 4320(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 - VMOVDQU 320(CX), Y10 - VMOVDQU 352(CX), Y11 + VMOVDQU 4352(CX), Y10 + VMOVDQU 4384(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 - VMOVDQU 384(CX), Y10 - VMOVDQU 416(CX), Y11 + VMOVDQU 4416(CX), Y10 + VMOVDQU 4448(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 - VMOVDQU 448(CX), Y10 - VMOVDQU 480(CX), Y11 + VMOVDQU 4480(CX), Y10 + VMOVDQU 4512(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 - VMOVDQU 512(CX), Y10 - VMOVDQU 544(CX), Y11 + VMOVDQU 4544(CX), Y10 + VMOVDQU 4576(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 - // Load and process 32 bytes from input 1 to 9 outputs - VMOVDQU (SI), Y12 - ADDQ $0x20, SI + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 - VMOVDQU 576(CX), Y10 - VMOVDQU 608(CX), Y11 + VMOVDQU 4608(CX), Y10 + VMOVDQU 4640(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 - VMOVDQU 640(CX), Y10 - VMOVDQU 672(CX), Y11 + VMOVDQU 4672(CX), Y10 + VMOVDQU 4704(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 - VMOVDQU 704(CX), Y10 - VMOVDQU 736(CX), Y11 + VMOVDQU 4736(CX), Y10 + VMOVDQU 4768(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 - VMOVDQU 768(CX), Y10 - VMOVDQU 800(CX), Y11 + VMOVDQU 4800(CX), Y10 + VMOVDQU 4832(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 - VMOVDQU 832(CX), Y10 - VMOVDQU 864(CX), Y11 + VMOVDQU 4864(CX), Y10 + VMOVDQU 4896(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 - VMOVDQU 896(CX), Y10 - VMOVDQU 928(CX), Y11 + VMOVDQU 4928(CX), Y10 + VMOVDQU 4960(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 - VMOVDQU 960(CX), Y10 - VMOVDQU 992(CX), Y11 + VMOVDQU 4992(CX), Y10 + VMOVDQU 5024(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 - VMOVDQU 1024(CX), Y10 - VMOVDQU 1056(CX), Y11 + VMOVDQU 5056(CX), Y10 + VMOVDQU 5088(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 5120(CX), Y10 + VMOVDQU 5152(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 - VMOVDQU 1088(CX), Y10 - VMOVDQU 1120(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Store 9 outputs + MOVQ (R13), R15 + VMOVDQU Y0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxTwo_9x9Xor_loop + VZEROUPPER + +mulAvxTwo_9x9Xor_end: + RET + +// func mulAvxTwo_9x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_9x10(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 195 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x10_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X10 + VPBROADCASTB X10, Y10 + +mulAvxTwo_9x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y13 + ADDQ $0x20, BX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y0 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y1 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y2 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y3 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y4 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y5 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y6 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y7 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y8 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y9 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y13 + ADDQ $0x20, SI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1024(CX), Y11 + VMOVDQU 1056(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 1088(CX), Y11 + VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y13 + ADDQ $0x20, DI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1280(CX), Y11 + VMOVDQU 1312(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 1344(CX), Y11 + VMOVDQU 1376(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 1408(CX), Y11 + VMOVDQU 1440(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 1472(CX), Y11 + VMOVDQU 1504(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 1536(CX), Y11 + VMOVDQU 1568(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 1600(CX), Y11 + VMOVDQU 1632(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1664(CX), Y11 + VMOVDQU 1696(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 1728(CX), Y11 + VMOVDQU 1760(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1792(CX), Y11 + VMOVDQU 1824(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1856(CX), Y11 + VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 - // Load and process 32 bytes from input 2 to 9 outputs - VMOVDQU (DI), Y12 - ADDQ $0x20, DI - VPSRLQ $0x04, Y12, Y13 - VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VMOVDQU 1152(CX), Y10 - VMOVDQU 1184(CX), Y11 - VPSHUFB Y12, Y10, Y10 + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y13 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1920(CX), Y11 + VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 - VMOVDQU 1216(CX), Y10 - VMOVDQU 1248(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 1984(CX), Y11 + VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 - VMOVDQU 1280(CX), Y10 - VMOVDQU 1312(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 2048(CX), Y11 + VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 - VMOVDQU 1344(CX), Y10 - VMOVDQU 1376(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 2112(CX), Y11 + VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 - VMOVDQU 1408(CX), Y10 - VMOVDQU 1440(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 2176(CX), Y11 + VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 - VMOVDQU 1472(CX), Y10 - VMOVDQU 1504(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 2240(CX), Y11 + VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 - VMOVDQU 1536(CX), Y10 - VMOVDQU 1568(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 2304(CX), Y11 + VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 - VMOVDQU 1600(CX), Y10 - VMOVDQU 1632(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 2368(CX), Y11 + VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 - VMOVDQU 1664(CX), Y10 - VMOVDQU 1696(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 2432(CX), Y11 + VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 2496(CX), Y11 + VMOVDQU 2528(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 - // Load and process 32 bytes from input 3 to 9 outputs - VMOVDQU (R8), Y12 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y12, Y13 - VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VMOVDQU 1728(CX), Y10 - VMOVDQU 1760(CX), Y11 - VPSHUFB Y12, Y10, Y10 + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y13 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 2560(CX), Y11 + VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 - VMOVDQU 1792(CX), Y10 - VMOVDQU 1824(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 2624(CX), Y11 + VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 - VMOVDQU 1856(CX), Y10 - VMOVDQU 1888(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 2688(CX), Y11 + VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 - VMOVDQU 1920(CX), Y10 - VMOVDQU 1952(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 2752(CX), Y11 + VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 - VMOVDQU 1984(CX), Y10 - VMOVDQU 2016(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 2816(CX), Y11 + VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 - VMOVDQU 2048(CX), Y10 - VMOVDQU 2080(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 2880(CX), Y11 + VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 - VMOVDQU 2112(CX), Y10 - VMOVDQU 2144(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 2944(CX), Y11 + VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 - VMOVDQU 2176(CX), Y10 - VMOVDQU 2208(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 3008(CX), Y11 + VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 - VMOVDQU 2240(CX), Y10 - VMOVDQU 2272(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 3072(CX), Y11 + VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 3136(CX), Y11 + VMOVDQU 3168(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 - // Load and process 32 bytes from input 4 to 9 outputs - VMOVDQU (R9), Y12 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y12, Y13 - VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VMOVDQU 2304(CX), Y10 - VMOVDQU 2336(CX), Y11 - VPSHUFB Y12, Y10, Y10 + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y13 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3200(CX), Y11 + VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 - VMOVDQU 2368(CX), Y10 - VMOVDQU 2400(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 3264(CX), Y11 + VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 - VMOVDQU 2432(CX), Y10 - VMOVDQU 2464(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 3328(CX), Y11 + VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 - VMOVDQU 2496(CX), Y10 - VMOVDQU 2528(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 3392(CX), Y11 + VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 - VMOVDQU 2560(CX), Y10 - VMOVDQU 2592(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 3456(CX), Y11 + VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 - VMOVDQU 2624(CX), Y10 - VMOVDQU 2656(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 3520(CX), Y11 + VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 - VMOVDQU 2688(CX), Y10 - VMOVDQU 2720(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 3584(CX), Y11 + VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 - VMOVDQU 2752(CX), Y10 - VMOVDQU 2784(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 3648(CX), Y11 + VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 - VMOVDQU 2816(CX), Y10 - VMOVDQU 2848(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 3712(CX), Y11 + VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 3776(CX), Y11 + VMOVDQU 3808(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 - // Load and process 32 bytes from input 5 to 9 outputs - VMOVDQU (R10), Y12 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y12, Y13 - VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VMOVDQU 2880(CX), Y10 - VMOVDQU 2912(CX), Y11 - VPSHUFB Y12, Y10, Y10 + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y13 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3840(CX), Y11 + VMOVDQU 3872(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 - VMOVDQU 2944(CX), Y10 - VMOVDQU 2976(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 3904(CX), Y11 + VMOVDQU 3936(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 - VMOVDQU 3008(CX), Y10 - VMOVDQU 3040(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 3968(CX), Y11 + VMOVDQU 4000(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 - VMOVDQU 3072(CX), Y10 - VMOVDQU 3104(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 4032(CX), Y11 + VMOVDQU 4064(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 - VMOVDQU 3136(CX), Y10 - VMOVDQU 3168(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 4096(CX), Y11 + VMOVDQU 4128(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 - VMOVDQU 3200(CX), Y10 - VMOVDQU 3232(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 4160(CX), Y11 + VMOVDQU 4192(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 - VMOVDQU 3264(CX), Y10 - VMOVDQU 3296(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 4224(CX), Y11 + VMOVDQU 4256(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 - VMOVDQU 3328(CX), Y10 - VMOVDQU 3360(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 4288(CX), Y11 + VMOVDQU 4320(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 - VMOVDQU 3392(CX), Y10 - VMOVDQU 3424(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 4352(CX), Y11 + VMOVDQU 4384(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 4416(CX), Y11 + VMOVDQU 4448(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 - // Load and process 32 bytes from input 6 to 9 outputs - VMOVDQU (R11), Y12 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y12, Y13 - VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VMOVDQU 3456(CX), Y10 - VMOVDQU 3488(CX), Y11 - VPSHUFB Y12, Y10, Y10 + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y13 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 4480(CX), Y11 + VMOVDQU 4512(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 - VMOVDQU 3520(CX), Y10 - VMOVDQU 3552(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 4544(CX), Y11 + VMOVDQU 4576(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 - VMOVDQU 3584(CX), Y10 - VMOVDQU 3616(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 4608(CX), Y11 + VMOVDQU 4640(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 - VMOVDQU 3648(CX), Y10 - VMOVDQU 3680(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 4672(CX), Y11 + VMOVDQU 4704(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 - VMOVDQU 3712(CX), Y10 - VMOVDQU 3744(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 4736(CX), Y11 + VMOVDQU 4768(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 4800(CX), Y11 + VMOVDQU 4832(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 - VMOVDQU 3776(CX), Y10 - VMOVDQU 3808(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 4864(CX), Y11 + VMOVDQU 4896(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 - VMOVDQU 3840(CX), Y10 - VMOVDQU 3872(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 4928(CX), Y11 + VMOVDQU 4960(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 - VMOVDQU 3904(CX), Y10 - VMOVDQU 3936(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 4992(CX), Y11 + VMOVDQU 5024(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 - VMOVDQU 3968(CX), Y10 - VMOVDQU 4000(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 5056(CX), Y11 + VMOVDQU 5088(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 - // Load and process 32 bytes from input 7 to 9 outputs - VMOVDQU (DX), Y12 + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (DX), Y13 ADDQ $0x20, DX - VPSRLQ $0x04, Y12, Y13 - VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VMOVDQU 4032(CX), Y10 - VMOVDQU 4064(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 5120(CX), Y11 + VMOVDQU 5152(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 - VMOVDQU 4096(CX), Y10 - VMOVDQU 4128(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 5184(CX), Y11 + VMOVDQU 5216(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 - VMOVDQU 4160(CX), Y10 - VMOVDQU 4192(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 5248(CX), Y11 + VMOVDQU 5280(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 - VMOVDQU 4224(CX), Y10 - VMOVDQU 4256(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 5312(CX), Y11 + VMOVDQU 5344(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 - VMOVDQU 4288(CX), Y10 - VMOVDQU 4320(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 5376(CX), Y11 + VMOVDQU 5408(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 - VMOVDQU 4352(CX), Y10 - VMOVDQU 4384(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 5440(CX), Y11 + VMOVDQU 5472(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 - VMOVDQU 4416(CX), Y10 - VMOVDQU 4448(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 5504(CX), Y11 + VMOVDQU 5536(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 - VMOVDQU 4480(CX), Y10 - VMOVDQU 4512(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 5568(CX), Y11 + VMOVDQU 5600(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 - VMOVDQU 4544(CX), Y10 - VMOVDQU 4576(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 5632(CX), Y11 + VMOVDQU 5664(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 5696(CX), Y11 + VMOVDQU 5728(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 - // Store 9 outputs - MOVQ (R12), R14 - VMOVDQU Y0, (R14)(R13*1) - MOVQ 24(R12), R14 - VMOVDQU Y1, (R14)(R13*1) - MOVQ 48(R12), R14 - VMOVDQU Y2, (R14)(R13*1) - MOVQ 72(R12), R14 - VMOVDQU Y3, (R14)(R13*1) - MOVQ 96(R12), R14 - VMOVDQU Y4, (R14)(R13*1) - MOVQ 120(R12), R14 - VMOVDQU Y5, (R14)(R13*1) - MOVQ 144(R12), R14 - VMOVDQU Y6, (R14)(R13*1) - MOVQ 168(R12), R14 - VMOVDQU Y7, (R14)(R13*1) - MOVQ 192(R12), R14 - VMOVDQU Y8, (R14)(R13*1) + // Store 10 outputs + MOVQ (R13), R15 + VMOVDQU Y0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 216(R13), R15 + VMOVDQU Y9, (R15)(R14*1) // Prepare for next loop - ADDQ $0x20, R13 + ADDQ $0x20, R14 DECQ AX - JNZ mulAvxTwo_8x9_loop + JNZ mulAvxTwo_9x10_loop VZEROUPPER -mulAvxTwo_8x9_end: +mulAvxTwo_9x10_end: RET -// func mulAvxTwo_8x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_9x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_8x10(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_9x10Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack - // Full registers estimated 175 YMM used + // Full registers estimated 195 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_8x10_end + JZ mulAvxTwo_9x10Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI @@ -23959,96 +57834,106 @@ TEXT ·mulAvxTwo_8x10(SB), NOSPLIT, $0-88 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 - MOVQ 168(DX), DX - MOVQ out_base+48(FP), R12 - MOVQ start+72(FP), R13 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 // Add start offset to input - ADDQ R13, BX - ADDQ R13, SI - ADDQ R13, DI - ADDQ R13, R8 - ADDQ R13, R9 - ADDQ R13, R10 - ADDQ R13, R11 - ADDQ R13, DX - MOVQ $0x0000000f, R14 - MOVQ R14, X10 + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X10 VPBROADCASTB X10, Y10 -mulAvxTwo_8x10_loop: - // Clear 10 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 - VPXOR Y9, Y9, Y9 - +mulAvxTwo_9x10Xor_loop: // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y13 ADDQ $0x20, BX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y0 VMOVDQU (CX), Y11 VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 + MOVQ 168(R13), R15 + VMOVDQU (R15)(R14*1), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 + MOVQ 192(R13), R15 + VMOVDQU (R15)(R14*1), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 + MOVQ 216(R13), R15 + VMOVDQU (R15)(R14*1), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 @@ -24104,469 +57989,989 @@ mulAvxTwo_8x10_loop: VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 - VMOVDQU 1088(CX), Y11 - VMOVDQU 1120(CX), Y12 + VMOVDQU 1088(CX), Y11 + VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y13 + ADDQ $0x20, DI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1280(CX), Y11 + VMOVDQU 1312(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 1344(CX), Y11 + VMOVDQU 1376(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 1408(CX), Y11 + VMOVDQU 1440(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 1472(CX), Y11 + VMOVDQU 1504(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 1536(CX), Y11 + VMOVDQU 1568(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 1600(CX), Y11 + VMOVDQU 1632(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1664(CX), Y11 + VMOVDQU 1696(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 1728(CX), Y11 + VMOVDQU 1760(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1792(CX), Y11 + VMOVDQU 1824(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1856(CX), Y11 + VMOVDQU 1888(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y13 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1920(CX), Y11 + VMOVDQU 1952(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 1984(CX), Y11 + VMOVDQU 2016(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 2048(CX), Y11 + VMOVDQU 2080(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 2112(CX), Y11 + VMOVDQU 2144(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 2176(CX), Y11 + VMOVDQU 2208(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 2240(CX), Y11 + VMOVDQU 2272(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 2304(CX), Y11 + VMOVDQU 2336(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 2368(CX), Y11 + VMOVDQU 2400(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 2432(CX), Y11 + VMOVDQU 2464(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 2496(CX), Y11 + VMOVDQU 2528(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y13 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 2560(CX), Y11 + VMOVDQU 2592(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 2624(CX), Y11 + VMOVDQU 2656(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 2688(CX), Y11 + VMOVDQU 2720(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 2752(CX), Y11 + VMOVDQU 2784(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 2816(CX), Y11 + VMOVDQU 2848(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 2880(CX), Y11 + VMOVDQU 2912(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 2944(CX), Y11 + VMOVDQU 2976(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 3008(CX), Y11 + VMOVDQU 3040(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 3072(CX), Y11 + VMOVDQU 3104(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 3136(CX), Y11 + VMOVDQU 3168(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y13 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3200(CX), Y11 + VMOVDQU 3232(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 3264(CX), Y11 + VMOVDQU 3296(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 3328(CX), Y11 + VMOVDQU 3360(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 3392(CX), Y11 + VMOVDQU 3424(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 3456(CX), Y11 + VMOVDQU 3488(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 3520(CX), Y11 + VMOVDQU 3552(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 3584(CX), Y11 + VMOVDQU 3616(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 3648(CX), Y11 + VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 - VMOVDQU 1152(CX), Y11 - VMOVDQU 1184(CX), Y12 + VMOVDQU 3712(CX), Y11 + VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 - VMOVDQU 1216(CX), Y11 - VMOVDQU 1248(CX), Y12 + VMOVDQU 3776(CX), Y11 + VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 - // Load and process 32 bytes from input 2 to 10 outputs - VMOVDQU (DI), Y13 - ADDQ $0x20, DI + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y13 + ADDQ $0x20, R11 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 - VMOVDQU 1280(CX), Y11 - VMOVDQU 1312(CX), Y12 + VMOVDQU 3840(CX), Y11 + VMOVDQU 3872(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 - VMOVDQU 1344(CX), Y11 - VMOVDQU 1376(CX), Y12 + VMOVDQU 3904(CX), Y11 + VMOVDQU 3936(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 - VMOVDQU 1408(CX), Y11 - VMOVDQU 1440(CX), Y12 + VMOVDQU 3968(CX), Y11 + VMOVDQU 4000(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 - VMOVDQU 1472(CX), Y11 - VMOVDQU 1504(CX), Y12 + VMOVDQU 4032(CX), Y11 + VMOVDQU 4064(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 - VMOVDQU 1536(CX), Y11 - VMOVDQU 1568(CX), Y12 + VMOVDQU 4096(CX), Y11 + VMOVDQU 4128(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 - VMOVDQU 1600(CX), Y11 - VMOVDQU 1632(CX), Y12 + VMOVDQU 4160(CX), Y11 + VMOVDQU 4192(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 - VMOVDQU 1664(CX), Y11 - VMOVDQU 1696(CX), Y12 + VMOVDQU 4224(CX), Y11 + VMOVDQU 4256(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 - VMOVDQU 1728(CX), Y11 - VMOVDQU 1760(CX), Y12 + VMOVDQU 4288(CX), Y11 + VMOVDQU 4320(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 - VMOVDQU 1792(CX), Y11 - VMOVDQU 1824(CX), Y12 + VMOVDQU 4352(CX), Y11 + VMOVDQU 4384(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 - VMOVDQU 1856(CX), Y11 - VMOVDQU 1888(CX), Y12 + VMOVDQU 4416(CX), Y11 + VMOVDQU 4448(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 - // Load and process 32 bytes from input 3 to 10 outputs - VMOVDQU (R8), Y13 - ADDQ $0x20, R8 + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y13 + ADDQ $0x20, R12 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 - VMOVDQU 1920(CX), Y11 - VMOVDQU 1952(CX), Y12 + VMOVDQU 4480(CX), Y11 + VMOVDQU 4512(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 - VMOVDQU 1984(CX), Y11 - VMOVDQU 2016(CX), Y12 + VMOVDQU 4544(CX), Y11 + VMOVDQU 4576(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 - VMOVDQU 2048(CX), Y11 - VMOVDQU 2080(CX), Y12 + VMOVDQU 4608(CX), Y11 + VMOVDQU 4640(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 - VMOVDQU 2112(CX), Y11 - VMOVDQU 2144(CX), Y12 + VMOVDQU 4672(CX), Y11 + VMOVDQU 4704(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 - VMOVDQU 2176(CX), Y11 - VMOVDQU 2208(CX), Y12 + VMOVDQU 4736(CX), Y11 + VMOVDQU 4768(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 - VMOVDQU 2240(CX), Y11 - VMOVDQU 2272(CX), Y12 + VMOVDQU 4800(CX), Y11 + VMOVDQU 4832(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 - VMOVDQU 2304(CX), Y11 - VMOVDQU 2336(CX), Y12 + VMOVDQU 4864(CX), Y11 + VMOVDQU 4896(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 - VMOVDQU 2368(CX), Y11 - VMOVDQU 2400(CX), Y12 + VMOVDQU 4928(CX), Y11 + VMOVDQU 4960(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 - VMOVDQU 2432(CX), Y11 - VMOVDQU 2464(CX), Y12 + VMOVDQU 4992(CX), Y11 + VMOVDQU 5024(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 - VMOVDQU 2496(CX), Y11 - VMOVDQU 2528(CX), Y12 + VMOVDQU 5056(CX), Y11 + VMOVDQU 5088(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 - // Load and process 32 bytes from input 4 to 10 outputs - VMOVDQU (R9), Y13 - ADDQ $0x20, R9 + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 - VMOVDQU 2560(CX), Y11 - VMOVDQU 2592(CX), Y12 + VMOVDQU 5120(CX), Y11 + VMOVDQU 5152(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 - VMOVDQU 2624(CX), Y11 - VMOVDQU 2656(CX), Y12 + VMOVDQU 5184(CX), Y11 + VMOVDQU 5216(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 - VMOVDQU 2688(CX), Y11 - VMOVDQU 2720(CX), Y12 + VMOVDQU 5248(CX), Y11 + VMOVDQU 5280(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 - VMOVDQU 2752(CX), Y11 - VMOVDQU 2784(CX), Y12 + VMOVDQU 5312(CX), Y11 + VMOVDQU 5344(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 - VMOVDQU 2816(CX), Y11 - VMOVDQU 2848(CX), Y12 + VMOVDQU 5376(CX), Y11 + VMOVDQU 5408(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 - VMOVDQU 2880(CX), Y11 - VMOVDQU 2912(CX), Y12 + VMOVDQU 5440(CX), Y11 + VMOVDQU 5472(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 - VMOVDQU 2944(CX), Y11 - VMOVDQU 2976(CX), Y12 + VMOVDQU 5504(CX), Y11 + VMOVDQU 5536(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 - VMOVDQU 3008(CX), Y11 - VMOVDQU 3040(CX), Y12 + VMOVDQU 5568(CX), Y11 + VMOVDQU 5600(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 - VMOVDQU 3072(CX), Y11 - VMOVDQU 3104(CX), Y12 + VMOVDQU 5632(CX), Y11 + VMOVDQU 5664(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 - VMOVDQU 3136(CX), Y11 - VMOVDQU 3168(CX), Y12 + VMOVDQU 5696(CX), Y11 + VMOVDQU 5728(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 - // Load and process 32 bytes from input 5 to 10 outputs - VMOVDQU (R10), Y13 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VMOVDQU 3200(CX), Y11 - VMOVDQU 3232(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQU 3264(CX), Y11 - VMOVDQU 3296(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 - VMOVDQU 3328(CX), Y11 - VMOVDQU 3360(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 - VMOVDQU 3392(CX), Y11 - VMOVDQU 3424(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 - VMOVDQU 3456(CX), Y11 - VMOVDQU 3488(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 - VMOVDQU 3520(CX), Y11 - VMOVDQU 3552(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 - VMOVDQU 3584(CX), Y11 - VMOVDQU 3616(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 - VMOVDQU 3648(CX), Y11 - VMOVDQU 3680(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 - VMOVDQU 3712(CX), Y11 - VMOVDQU 3744(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 - VMOVDQU 3776(CX), Y11 - VMOVDQU 3808(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 + // Store 10 outputs + MOVQ (R13), R15 + VMOVDQU Y0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 216(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxTwo_9x10Xor_loop + VZEROUPPER + +mulAvxTwo_9x10Xor_end: + RET + +// func mulAvxTwo_10x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_10x1(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 24 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_10x1_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ (R14), R14 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X1 + VPBROADCASTB X1, Y1 + +mulAvxTwo_10x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y4 + ADDQ $0x20, BX + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU (CX), Y2 + VMOVDQU 32(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y0 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y4 + ADDQ $0x20, SI + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y4 + ADDQ $0x20, DI + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 128(CX), Y2 + VMOVDQU 160(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (R8), Y4 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 192(CX), Y2 + VMOVDQU 224(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R9), Y4 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 256(CX), Y2 + VMOVDQU 288(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R10), Y4 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 320(CX), Y2 + VMOVDQU 352(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R11), Y4 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 384(CX), Y2 + VMOVDQU 416(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (R12), Y4 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 448(CX), Y2 + VMOVDQU 480(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 8 to 1 outputs + VMOVDQU (R13), Y4 + ADDQ $0x20, R13 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 512(CX), Y2 + VMOVDQU 544(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 9 to 1 outputs + VMOVDQU (DX), Y4 + ADDQ $0x20, DX + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 576(CX), Y2 + VMOVDQU 608(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Store 1 outputs + VMOVDQU Y0, (R14) + ADDQ $0x20, R14 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_10x1_loop + VZEROUPPER + +mulAvxTwo_10x1_end: + RET + +// func mulAvxTwo_10x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_10x1_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 46 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_10x1_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ (R14), R14 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_10x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y0 + VPXOR Y5, Y6, Y1 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU (R8), Y6 + VMOVDQU 32(R8), Y5 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU (R9), Y6 + VMOVDQU 32(R9), Y5 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU (R10), Y6 + VMOVDQU 32(R10), Y5 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU (R11), Y6 + VMOVDQU 32(R11), Y5 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 - // Load and process 32 bytes from input 6 to 10 outputs - VMOVDQU (R11), Y13 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VMOVDQU 3840(CX), Y11 - VMOVDQU 3872(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQU 3904(CX), Y11 - VMOVDQU 3936(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 - VMOVDQU 3968(CX), Y11 - VMOVDQU 4000(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 - VMOVDQU 4032(CX), Y11 - VMOVDQU 4064(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 - VMOVDQU 4096(CX), Y11 - VMOVDQU 4128(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 - VMOVDQU 4160(CX), Y11 - VMOVDQU 4192(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 - VMOVDQU 4224(CX), Y11 - VMOVDQU 4256(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 - VMOVDQU 4288(CX), Y11 - VMOVDQU 4320(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 - VMOVDQU 4352(CX), Y11 - VMOVDQU 4384(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 - VMOVDQU 4416(CX), Y11 - VMOVDQU 4448(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU (R12), Y6 + VMOVDQU 32(R12), Y5 + ADDQ $0x40, R12 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 - // Load and process 32 bytes from input 7 to 10 outputs - VMOVDQU (DX), Y13 - ADDQ $0x20, DX - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VMOVDQU 4480(CX), Y11 - VMOVDQU 4512(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQU 4544(CX), Y11 - VMOVDQU 4576(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 - VMOVDQU 4608(CX), Y11 - VMOVDQU 4640(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 - VMOVDQU 4672(CX), Y11 - VMOVDQU 4704(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 - VMOVDQU 4736(CX), Y11 - VMOVDQU 4768(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 - VMOVDQU 4800(CX), Y11 - VMOVDQU 4832(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 - VMOVDQU 4864(CX), Y11 - VMOVDQU 4896(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 - VMOVDQU 4928(CX), Y11 - VMOVDQU 4960(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 - VMOVDQU 4992(CX), Y11 - VMOVDQU 5024(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 - VMOVDQU 5056(CX), Y11 - VMOVDQU 5088(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 + // Load and process 64 bytes from input 8 to 1 outputs + VMOVDQU (R13), Y6 + VMOVDQU 32(R13), Y5 + ADDQ $0x40, R13 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 512(CX), Y3 + VMOVDQU 544(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 - // Store 10 outputs - MOVQ (R12), R14 - VMOVDQU Y0, (R14)(R13*1) - MOVQ 24(R12), R14 - VMOVDQU Y1, (R14)(R13*1) - MOVQ 48(R12), R14 - VMOVDQU Y2, (R14)(R13*1) - MOVQ 72(R12), R14 - VMOVDQU Y3, (R14)(R13*1) - MOVQ 96(R12), R14 - VMOVDQU Y4, (R14)(R13*1) - MOVQ 120(R12), R14 - VMOVDQU Y5, (R14)(R13*1) - MOVQ 144(R12), R14 - VMOVDQU Y6, (R14)(R13*1) - MOVQ 168(R12), R14 - VMOVDQU Y7, (R14)(R13*1) - MOVQ 192(R12), R14 - VMOVDQU Y8, (R14)(R13*1) - MOVQ 216(R12), R14 - VMOVDQU Y9, (R14)(R13*1) + // Load and process 64 bytes from input 9 to 1 outputs + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 576(CX), Y3 + VMOVDQU 608(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Store 1 outputs + VMOVDQU Y0, (R14) + VMOVDQU Y1, 32(R14) + ADDQ $0x40, R14 // Prepare for next loop - ADDQ $0x20, R13 DECQ AX - JNZ mulAvxTwo_8x10_loop + JNZ mulAvxTwo_10x1_64_loop VZEROUPPER -mulAvxTwo_8x10_end: +mulAvxTwo_10x1_64_end: RET -// func mulAvxTwo_9x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_10x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_9x1(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_10x1Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers - // Full registers estimated 22 YMM used + // Full registers estimated 24 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_9x1_end + JZ mulAvxTwo_10x1Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI @@ -24576,38 +58981,38 @@ TEXT ·mulAvxTwo_9x1(SB), NOSPLIT, $0-88 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 - MOVQ 192(DX), DX - MOVQ out_base+48(FP), R13 - MOVQ (R13), R13 - MOVQ start+72(FP), R14 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ (R14), R14 + MOVQ start+72(FP), R15 // Add start offset to output - ADDQ R14, R13 + ADDQ R15, R14 // Add start offset to input - ADDQ R14, BX - ADDQ R14, SI - ADDQ R14, DI - ADDQ R14, R8 - ADDQ R14, R9 - ADDQ R14, R10 - ADDQ R14, R11 - ADDQ R14, R12 - ADDQ R14, DX - MOVQ $0x0000000f, R14 - MOVQ R14, X1 + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X1 VPBROADCASTB X1, Y1 -mulAvxTwo_9x1_loop: - // Clear 1 outputs - VPXOR Y0, Y0, Y0 - +mulAvxTwo_10x1Xor_loop: // Load and process 32 bytes from input 0 to 1 outputs VMOVDQU (BX), Y4 ADDQ $0x20, BX VPSRLQ $0x04, Y4, Y5 VPAND Y1, Y4, Y4 VPAND Y1, Y5, Y5 + VMOVDQU (R14), Y0 VMOVDQU (CX), Y2 VMOVDQU 32(CX), Y3 VPSHUFB Y4, Y2, Y2 @@ -24707,8 +59112,8 @@ mulAvxTwo_9x1_loop: VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 8 to 1 outputs - VMOVDQU (DX), Y4 - ADDQ $0x20, DX + VMOVDQU (R13), Y4 + ADDQ $0x20, R13 VPSRLQ $0x04, Y4, Y5 VPAND Y1, Y4, Y4 VPAND Y1, Y5, Y5 @@ -24719,68 +59124,85 @@ mulAvxTwo_9x1_loop: VPXOR Y2, Y3, Y2 VPXOR Y2, Y0, Y0 + // Load and process 32 bytes from input 9 to 1 outputs + VMOVDQU (DX), Y4 + ADDQ $0x20, DX + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 576(CX), Y2 + VMOVDQU 608(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + // Store 1 outputs - VMOVDQU Y0, (R13) - ADDQ $0x20, R13 + VMOVDQU Y0, (R14) + ADDQ $0x20, R14 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_9x1_loop + JNZ mulAvxTwo_10x1Xor_loop VZEROUPPER -mulAvxTwo_9x1_end: +mulAvxTwo_10x1Xor_end: RET -// func mulAvxTwo_9x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_10x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_9x1_64(SB), $0-88 +TEXT ·mulAvxTwo_10x1_64Xor(SB), $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 22 YMM used + // Destination kept in GP registers + // Full registers estimated 46 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX - JZ mulAvxTwo_9x1_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), R9 - MOVQ 144(AX), R10 - MOVQ 168(AX), R11 - MOVQ 192(AX), AX - MOVQ out_base+48(FP), R12 - MOVQ out_base+48(FP), R12 - MOVQ start+72(FP), R13 + JZ mulAvxTwo_10x1_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ (R14), R14 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 // Add start offset to input - ADDQ R13, DX - ADDQ R13, BX - ADDQ R13, SI - ADDQ R13, DI - ADDQ R13, R8 - ADDQ R13, R9 - ADDQ R13, R10 - ADDQ R13, R11 - ADDQ R13, AX - MOVQ $0x0000000f, R14 - MOVQ R14, X2 + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X2 VPBROADCASTB X2, Y2 - MOVQ n+80(FP), R14 - SHRQ $0x06, R14 -mulAvxTwo_9x1_64_loop: - // Clear 1 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 +mulAvxTwo_10x1_64Xor_loop: + // Load 1 outputs + VMOVDQU (R14), Y0 + VMOVDQU 32(R14), Y1 // Load and process 64 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y6 - VMOVDQU 32(DX), Y5 - ADDQ $0x40, DX + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -24799,9 +59221,9 @@ mulAvxTwo_9x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y6 - VMOVDQU 32(BX), Y5 - ADDQ $0x40, BX + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -24820,9 +59242,9 @@ mulAvxTwo_9x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 2 to 1 outputs - VMOVDQU (SI), Y6 - VMOVDQU 32(SI), Y5 - ADDQ $0x40, SI + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -24841,9 +59263,9 @@ mulAvxTwo_9x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 3 to 1 outputs - VMOVDQU (DI), Y6 - VMOVDQU 32(DI), Y5 - ADDQ $0x40, DI + VMOVDQU (R8), Y6 + VMOVDQU 32(R8), Y5 + ADDQ $0x40, R8 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -24862,9 +59284,9 @@ mulAvxTwo_9x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 4 to 1 outputs - VMOVDQU (R8), Y6 - VMOVDQU 32(R8), Y5 - ADDQ $0x40, R8 + VMOVDQU (R9), Y6 + VMOVDQU 32(R9), Y5 + ADDQ $0x40, R9 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -24883,9 +59305,9 @@ mulAvxTwo_9x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 5 to 1 outputs - VMOVDQU (R9), Y6 - VMOVDQU 32(R9), Y5 - ADDQ $0x40, R9 + VMOVDQU (R10), Y6 + VMOVDQU 32(R10), Y5 + ADDQ $0x40, R10 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -24904,9 +59326,9 @@ mulAvxTwo_9x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 6 to 1 outputs - VMOVDQU (R10), Y6 - VMOVDQU 32(R10), Y5 - ADDQ $0x40, R10 + VMOVDQU (R11), Y6 + VMOVDQU 32(R11), Y5 + ADDQ $0x40, R11 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -24925,9 +59347,9 @@ mulAvxTwo_9x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 7 to 1 outputs - VMOVDQU (R11), Y6 - VMOVDQU 32(R11), Y5 - ADDQ $0x40, R11 + VMOVDQU (R12), Y6 + VMOVDQU 32(R12), Y5 + ADDQ $0x40, R12 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -24945,18 +59367,39 @@ mulAvxTwo_9x1_64_loop: VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 - // Load and process 64 bytes from input 8 to 1 outputs - VMOVDQU (AX), Y6 - VMOVDQU 32(AX), Y5 - ADDQ $0x40, AX + // Load and process 64 bytes from input 8 to 1 outputs + VMOVDQU (R13), Y6 + VMOVDQU 32(R13), Y5 + ADDQ $0x40, R13 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 512(CX), Y3 + VMOVDQU 544(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 9 to 1 outputs + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 + VMOVDQU 576(CX), Y3 + VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 @@ -24967,30 +59410,29 @@ mulAvxTwo_9x1_64_loop: VPXOR Y5, Y1, Y1 // Store 1 outputs - MOVQ (R12), R15 - VMOVDQU Y0, (R15)(R13*1) - VMOVDQU Y1, 32(R15)(R13*1) + VMOVDQU Y0, (R14) + VMOVDQU Y1, 32(R14) + ADDQ $0x40, R14 // Prepare for next loop - ADDQ $0x40, R13 - DECQ R14 - JNZ mulAvxTwo_9x1_64_loop + DECQ AX + JNZ mulAvxTwo_10x1_64Xor_loop VZEROUPPER -mulAvxTwo_9x1_64_end: +mulAvxTwo_10x1_64Xor_end: RET -// func mulAvxTwo_9x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_10x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_9x2(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_10x2(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers - // Full registers estimated 43 YMM used + // Full registers estimated 47 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_9x2_end + JZ mulAvxTwo_10x2_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI @@ -25000,35 +59442,33 @@ TEXT ·mulAvxTwo_9x2(SB), NOSPLIT, $0-88 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 - MOVQ 192(DX), DX - MOVQ out_base+48(FP), R13 - MOVQ (R13), R14 - MOVQ 24(R13), R13 - MOVQ start+72(FP), R15 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ (R14), R15 + MOVQ 24(R14), R14 + MOVQ start+72(FP), BP // Add start offset to output - ADDQ R15, R14 - ADDQ R15, R13 + ADDQ BP, R15 + ADDQ BP, R14 // Add start offset to input - ADDQ R15, BX - ADDQ R15, SI - ADDQ R15, DI - ADDQ R15, R8 - ADDQ R15, R9 - ADDQ R15, R10 - ADDQ R15, R11 - ADDQ R15, R12 - ADDQ R15, DX - MOVQ $0x0000000f, R15 - MOVQ R15, X2 + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X2 VPBROADCASTB X2, Y2 -mulAvxTwo_9x2_loop: - // Clear 2 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - +mulAvxTwo_10x2_loop: // Load and process 32 bytes from input 0 to 2 outputs VMOVDQU (BX), Y5 ADDQ $0x20, BX @@ -25039,14 +59479,12 @@ mulAvxTwo_9x2_loop: VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + VPXOR Y3, Y4, Y0 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 + VPXOR Y3, Y4, Y1 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 @@ -25182,8 +59620,8 @@ mulAvxTwo_9x2_loop: VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 8 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX + VMOVDQU (R13), Y5 + ADDQ $0x20, R13 VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 @@ -25200,72 +59638,91 @@ mulAvxTwo_9x2_loop: VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 + // Load and process 32 bytes from input 9 to 2 outputs + VMOVDQU (DX), Y5 + ADDQ $0x20, DX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 1152(CX), Y3 + VMOVDQU 1184(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 1216(CX), Y3 + VMOVDQU 1248(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + // Store 2 outputs - VMOVDQU Y0, (R14) + VMOVDQU Y0, (R15) + ADDQ $0x20, R15 + VMOVDQU Y1, (R14) ADDQ $0x20, R14 - VMOVDQU Y1, (R13) - ADDQ $0x20, R13 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_9x2_loop + JNZ mulAvxTwo_10x2_loop VZEROUPPER -mulAvxTwo_9x2_end: +mulAvxTwo_10x2_end: RET -// func mulAvxTwo_9x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_10x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_9x2_64(SB), $0-88 +TEXT ·mulAvxTwo_10x2_64(SB), $8-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 43 YMM used + // Destination kept in GP registers + // Full registers estimated 89 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX - JZ mulAvxTwo_9x2_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), R9 - MOVQ 144(AX), R10 - MOVQ 168(AX), R11 - MOVQ 192(AX), AX - MOVQ out_base+48(FP), R12 - MOVQ out_base+48(FP), R12 - MOVQ start+72(FP), R13 + JZ mulAvxTwo_10x2_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ (R14), R15 + MOVQ 24(R14), R14 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R15 + ADDQ BP, R14 // Add start offset to input - ADDQ R13, DX - ADDQ R13, BX - ADDQ R13, SI - ADDQ R13, DI - ADDQ R13, R8 - ADDQ R13, R9 - ADDQ R13, R10 - ADDQ R13, R11 - ADDQ R13, AX - MOVQ $0x0000000f, R14 - MOVQ R14, X4 + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X4 VPBROADCASTB X4, Y4 - MOVQ n+80(FP), R14 - SHRQ $0x06, R14 - -mulAvxTwo_9x2_64_loop: - // Clear 2 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 +mulAvxTwo_10x2_64_loop: // Load and process 64 bytes from input 0 to 2 outputs - VMOVDQU (DX), Y9 - VMOVDQU 32(DX), Y11 - ADDQ $0x40, DX + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -25278,25 +59735,21 @@ mulAvxTwo_9x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + VPXOR Y5, Y6, Y0 + VPXOR Y7, Y8, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 + VPXOR Y5, Y6, Y2 + VPXOR Y7, Y8, Y3 // Load and process 64 bytes from input 1 to 2 outputs - VMOVDQU (BX), Y9 - VMOVDQU 32(BX), Y11 - ADDQ $0x40, BX + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -25325,9 +59778,9 @@ mulAvxTwo_9x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 2 to 2 outputs - VMOVDQU (SI), Y9 - VMOVDQU 32(SI), Y11 - ADDQ $0x40, SI + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -25356,9 +59809,9 @@ mulAvxTwo_9x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 3 to 2 outputs - VMOVDQU (DI), Y9 - VMOVDQU 32(DI), Y11 - ADDQ $0x40, DI + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y11 + ADDQ $0x40, R8 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -25387,9 +59840,9 @@ mulAvxTwo_9x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 4 to 2 outputs - VMOVDQU (R8), Y9 - VMOVDQU 32(R8), Y11 - ADDQ $0x40, R8 + VMOVDQU (R9), Y9 + VMOVDQU 32(R9), Y11 + ADDQ $0x40, R9 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -25418,9 +59871,9 @@ mulAvxTwo_9x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 5 to 2 outputs - VMOVDQU (R9), Y9 - VMOVDQU 32(R9), Y11 - ADDQ $0x40, R9 + VMOVDQU (R10), Y9 + VMOVDQU 32(R10), Y11 + ADDQ $0x40, R10 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -25449,9 +59902,9 @@ mulAvxTwo_9x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 6 to 2 outputs - VMOVDQU (R10), Y9 - VMOVDQU 32(R10), Y11 - ADDQ $0x40, R10 + VMOVDQU (R11), Y9 + VMOVDQU 32(R11), Y11 + ADDQ $0x40, R11 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -25480,9 +59933,9 @@ mulAvxTwo_9x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 7 to 2 outputs - VMOVDQU (R11), Y9 - VMOVDQU 32(R11), Y11 - ADDQ $0x40, R11 + VMOVDQU (R12), Y9 + VMOVDQU 32(R12), Y11 + ADDQ $0x40, R12 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -25511,9 +59964,9 @@ mulAvxTwo_9x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 8 to 2 outputs - VMOVDQU (AX), Y9 - VMOVDQU 32(AX), Y11 - ADDQ $0x40, AX + VMOVDQU (R13), Y9 + VMOVDQU 32(R13), Y11 + ADDQ $0x40, R13 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -25541,791 +59994,339 @@ mulAvxTwo_9x2_64_loop: VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 - // Store 2 outputs - MOVQ (R12), R15 - VMOVDQU Y0, (R15)(R13*1) - VMOVDQU Y1, 32(R15)(R13*1) - MOVQ 24(R12), R15 - VMOVDQU Y2, (R15)(R13*1) - VMOVDQU Y3, 32(R15)(R13*1) - - // Prepare for next loop - ADDQ $0x40, R13 - DECQ R14 - JNZ mulAvxTwo_9x2_64_loop - VZEROUPPER - -mulAvxTwo_9x2_64_end: - RET - -// func mulAvxTwo_9x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_9x3(SB), NOSPLIT, $8-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 62 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_9x3_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), R12 - MOVQ 192(DX), DX - MOVQ out_base+48(FP), R13 - MOVQ (R13), R14 - MOVQ 24(R13), R15 - MOVQ 48(R13), R13 - MOVQ start+72(FP), BP - - // Add start offset to output - ADDQ BP, R14 - ADDQ BP, R15 - ADDQ BP, R13 - - // Add start offset to input - ADDQ BP, BX - ADDQ BP, SI - ADDQ BP, DI - ADDQ BP, R8 - ADDQ BP, R9 - ADDQ BP, R10 - ADDQ BP, R11 - ADDQ BP, R12 - ADDQ BP, DX - MOVQ $0x0000000f, BP - MOVQ BP, X3 - VPBROADCASTB X3, Y3 - -mulAvxTwo_9x3_loop: - // Clear 3 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 - - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 - - // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 - - // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (R8), Y6 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 - - // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (R9), Y6 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 - - // Load and process 32 bytes from input 5 to 3 outputs - VMOVDQU (R10), Y6 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 960(CX), Y4 - VMOVDQU 992(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 - VMOVDQU 1024(CX), Y4 - VMOVDQU 1056(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 - VMOVDQU 1088(CX), Y4 - VMOVDQU 1120(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 - - // Load and process 32 bytes from input 6 to 3 outputs - VMOVDQU (R11), Y6 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1152(CX), Y4 - VMOVDQU 1184(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 - VMOVDQU 1216(CX), Y4 - VMOVDQU 1248(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 - VMOVDQU 1280(CX), Y4 - VMOVDQU 1312(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 - - // Load and process 32 bytes from input 7 to 3 outputs - VMOVDQU (R12), Y6 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1344(CX), Y4 - VMOVDQU 1376(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 - VMOVDQU 1408(CX), Y4 - VMOVDQU 1440(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 - VMOVDQU 1472(CX), Y4 - VMOVDQU 1504(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 - - // Load and process 32 bytes from input 8 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1536(CX), Y4 - VMOVDQU 1568(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 - VMOVDQU 1600(CX), Y4 - VMOVDQU 1632(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 - VMOVDQU 1664(CX), Y4 - VMOVDQU 1696(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 + // Load and process 64 bytes from input 9 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 - // Store 3 outputs - VMOVDQU Y0, (R14) - ADDQ $0x20, R14 - VMOVDQU Y1, (R15) - ADDQ $0x20, R15 - VMOVDQU Y2, (R13) - ADDQ $0x20, R13 + // Store 2 outputs + VMOVDQU Y0, (R15) + VMOVDQU Y1, 32(R15) + ADDQ $0x40, R15 + VMOVDQU Y2, (R14) + VMOVDQU Y3, 32(R14) + ADDQ $0x40, R14 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_9x3_loop + JNZ mulAvxTwo_10x2_64_loop VZEROUPPER -mulAvxTwo_9x3_end: +mulAvxTwo_10x2_64_end: RET -// func mulAvxTwo_9x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_10x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_9x3_64(SB), $0-88 +TEXT ·mulAvxTwo_10x2Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 62 YMM used + // Destination kept in GP registers + // Full registers estimated 47 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX - SHRQ $0x06, AX + SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_9x3_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), R9 - MOVQ 144(AX), R10 - MOVQ 168(AX), R11 - MOVQ 192(AX), AX - MOVQ out_base+48(FP), R12 - MOVQ out_base+48(FP), R12 - MOVQ start+72(FP), R13 + JZ mulAvxTwo_10x2Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ (R14), R15 + MOVQ 24(R14), R14 + MOVQ start+72(FP), BP - // Add start offset to input - ADDQ R13, DX - ADDQ R13, BX - ADDQ R13, SI - ADDQ R13, DI - ADDQ R13, R8 - ADDQ R13, R9 - ADDQ R13, R10 - ADDQ R13, R11 - ADDQ R13, AX - MOVQ $0x0000000f, R14 - MOVQ R14, X6 - VPBROADCASTB X6, Y6 - MOVQ n+80(FP), R14 - SHRQ $0x06, R14 + // Add start offset to output + ADDQ BP, R15 + ADDQ BP, R14 -mulAvxTwo_9x3_64_loop: - // Clear 3 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X2 + VPBROADCASTB X2, Y2 - // Load and process 64 bytes from input 0 to 3 outputs - VMOVDQU (DX), Y11 - VMOVDQU 32(DX), Y13 - ADDQ $0x40, DX - VPSRLQ $0x04, Y11, Y12 - VPSRLQ $0x04, Y13, Y14 - VPAND Y6, Y11, Y11 - VPAND Y6, Y13, Y13 - VPAND Y6, Y12, Y12 - VPAND Y6, Y14, Y14 - VMOVDQU (CX), Y7 - VMOVDQU 32(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 - VMOVDQU 64(CX), Y7 - VMOVDQU 96(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 - VMOVDQU 128(CX), Y7 - VMOVDQU 160(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +mulAvxTwo_10x2Xor_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y5 + ADDQ $0x20, BX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU (R15), Y0 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU (R14), Y1 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 - // Load and process 64 bytes from input 1 to 3 outputs - VMOVDQU (BX), Y11 - VMOVDQU 32(BX), Y13 - ADDQ $0x40, BX - VPSRLQ $0x04, Y11, Y12 - VPSRLQ $0x04, Y13, Y14 - VPAND Y6, Y11, Y11 - VPAND Y6, Y13, Y13 - VPAND Y6, Y12, Y12 - VPAND Y6, Y14, Y14 - VMOVDQU 192(CX), Y7 - VMOVDQU 224(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 - VMOVDQU 256(CX), Y7 - VMOVDQU 288(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 - VMOVDQU 320(CX), Y7 - VMOVDQU 352(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y5 + ADDQ $0x20, SI + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 - // Load and process 64 bytes from input 2 to 3 outputs - VMOVDQU (SI), Y11 - VMOVDQU 32(SI), Y13 - ADDQ $0x40, SI - VPSRLQ $0x04, Y11, Y12 - VPSRLQ $0x04, Y13, Y14 - VPAND Y6, Y11, Y11 - VPAND Y6, Y13, Y13 - VPAND Y6, Y12, Y12 - VPAND Y6, Y14, Y14 - VMOVDQU 384(CX), Y7 - VMOVDQU 416(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 - VMOVDQU 448(CX), Y7 - VMOVDQU 480(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 - VMOVDQU 512(CX), Y7 - VMOVDQU 544(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y5 + ADDQ $0x20, DI + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 - // Load and process 64 bytes from input 3 to 3 outputs - VMOVDQU (DI), Y11 - VMOVDQU 32(DI), Y13 - ADDQ $0x40, DI - VPSRLQ $0x04, Y11, Y12 - VPSRLQ $0x04, Y13, Y14 - VPAND Y6, Y11, Y11 - VPAND Y6, Y13, Y13 - VPAND Y6, Y12, Y12 - VPAND Y6, Y14, Y14 - VMOVDQU 576(CX), Y7 - VMOVDQU 608(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 - VMOVDQU 640(CX), Y7 - VMOVDQU 672(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 - VMOVDQU 704(CX), Y7 - VMOVDQU 736(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y5 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 - // Load and process 64 bytes from input 4 to 3 outputs - VMOVDQU (R8), Y11 - VMOVDQU 32(R8), Y13 - ADDQ $0x40, R8 - VPSRLQ $0x04, Y11, Y12 - VPSRLQ $0x04, Y13, Y14 - VPAND Y6, Y11, Y11 - VPAND Y6, Y13, Y13 - VPAND Y6, Y12, Y12 - VPAND Y6, Y14, Y14 - VMOVDQU 768(CX), Y7 - VMOVDQU 800(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 - VMOVDQU 832(CX), Y7 - VMOVDQU 864(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 - VMOVDQU 896(CX), Y7 - VMOVDQU 928(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y5 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 512(CX), Y3 + VMOVDQU 544(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 576(CX), Y3 + VMOVDQU 608(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 - // Load and process 64 bytes from input 5 to 3 outputs - VMOVDQU (R9), Y11 - VMOVDQU 32(R9), Y13 - ADDQ $0x40, R9 - VPSRLQ $0x04, Y11, Y12 - VPSRLQ $0x04, Y13, Y14 - VPAND Y6, Y11, Y11 - VPAND Y6, Y13, Y13 - VPAND Y6, Y12, Y12 - VPAND Y6, Y14, Y14 - VMOVDQU 960(CX), Y7 - VMOVDQU 992(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 - VMOVDQU 1024(CX), Y7 - VMOVDQU 1056(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 - VMOVDQU 1088(CX), Y7 - VMOVDQU 1120(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y5 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 640(CX), Y3 + VMOVDQU 672(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 704(CX), Y3 + VMOVDQU 736(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 - // Load and process 64 bytes from input 6 to 3 outputs - VMOVDQU (R10), Y11 - VMOVDQU 32(R10), Y13 - ADDQ $0x40, R10 - VPSRLQ $0x04, Y11, Y12 - VPSRLQ $0x04, Y13, Y14 - VPAND Y6, Y11, Y11 - VPAND Y6, Y13, Y13 - VPAND Y6, Y12, Y12 - VPAND Y6, Y14, Y14 - VMOVDQU 1152(CX), Y7 - VMOVDQU 1184(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 - VMOVDQU 1216(CX), Y7 - VMOVDQU 1248(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 - VMOVDQU 1280(CX), Y7 - VMOVDQU 1312(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y5 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 768(CX), Y3 + VMOVDQU 800(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 832(CX), Y3 + VMOVDQU 864(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 - // Load and process 64 bytes from input 7 to 3 outputs - VMOVDQU (R11), Y11 - VMOVDQU 32(R11), Y13 - ADDQ $0x40, R11 - VPSRLQ $0x04, Y11, Y12 - VPSRLQ $0x04, Y13, Y14 - VPAND Y6, Y11, Y11 - VPAND Y6, Y13, Y13 - VPAND Y6, Y12, Y12 - VPAND Y6, Y14, Y14 - VMOVDQU 1344(CX), Y7 - VMOVDQU 1376(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 - VMOVDQU 1408(CX), Y7 - VMOVDQU 1440(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 - VMOVDQU 1472(CX), Y7 - VMOVDQU 1504(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y5 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 896(CX), Y3 + VMOVDQU 928(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 960(CX), Y3 + VMOVDQU 992(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 - // Load and process 64 bytes from input 8 to 3 outputs - VMOVDQU (AX), Y11 - VMOVDQU 32(AX), Y13 - ADDQ $0x40, AX - VPSRLQ $0x04, Y11, Y12 - VPSRLQ $0x04, Y13, Y14 - VPAND Y6, Y11, Y11 - VPAND Y6, Y13, Y13 - VPAND Y6, Y12, Y12 - VPAND Y6, Y14, Y14 - VMOVDQU 1536(CX), Y7 - VMOVDQU 1568(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 - VMOVDQU 1600(CX), Y7 - VMOVDQU 1632(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 - VMOVDQU 1664(CX), Y7 - VMOVDQU 1696(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 + // Load and process 32 bytes from input 8 to 2 outputs + VMOVDQU (R13), Y5 + ADDQ $0x20, R13 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 1024(CX), Y3 + VMOVDQU 1056(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 1088(CX), Y3 + VMOVDQU 1120(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 - // Store 3 outputs - MOVQ (R12), R15 - VMOVDQU Y0, (R15)(R13*1) - VMOVDQU Y1, 32(R15)(R13*1) - MOVQ 24(R12), R15 - VMOVDQU Y2, (R15)(R13*1) - VMOVDQU Y3, 32(R15)(R13*1) - MOVQ 48(R12), R15 - VMOVDQU Y4, (R15)(R13*1) - VMOVDQU Y5, 32(R15)(R13*1) - - // Prepare for next loop - ADDQ $0x40, R13 - DECQ R14 - JNZ mulAvxTwo_9x3_64_loop + // Load and process 32 bytes from input 9 to 2 outputs + VMOVDQU (DX), Y5 + ADDQ $0x20, DX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 1152(CX), Y3 + VMOVDQU 1184(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 1216(CX), Y3 + VMOVDQU 1248(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Store 2 outputs + VMOVDQU Y0, (R15) + ADDQ $0x20, R15 + VMOVDQU Y1, (R14) + ADDQ $0x20, R14 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_10x2Xor_loop VZEROUPPER -mulAvxTwo_9x3_64_end: +mulAvxTwo_10x2Xor_end: RET -// func mulAvxTwo_9x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_10x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_9x4(SB), NOSPLIT, $8-88 +TEXT ·mulAvxTwo_10x2_64Xor(SB), $8-88 // Loading no tables to registers // Destination kept in GP registers - // Full registers estimated 81 YMM used + // Full registers estimated 89 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX + SHRQ $0x06, AX TESTQ AX, AX - JZ mulAvxTwo_9x4_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), R9 - MOVQ 144(AX), R10 - MOVQ 168(AX), R11 - MOVQ 192(AX), AX - MOVQ out_base+48(FP), R12 - MOVQ (R12), R13 - MOVQ 24(R12), R14 - MOVQ 48(R12), R15 - MOVQ 72(R12), R12 + JZ mulAvxTwo_10x2_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ (R14), R15 + MOVQ 24(R14), R14 MOVQ start+72(FP), BP // Add start offset to output - ADDQ BP, R13 - ADDQ BP, R14 ADDQ BP, R15 - ADDQ BP, R12 + ADDQ BP, R14 // Add start offset to input - ADDQ BP, DX ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI @@ -26333,2270 +60334,1959 @@ TEXT ·mulAvxTwo_9x4(SB), NOSPLIT, $8-88 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 - ADDQ BP, AX + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, DX MOVQ $0x0000000f, BP MOVQ BP, X4 VPBROADCASTB X4, Y4 - MOVQ n+80(FP), BP - SHRQ $0x05, BP -mulAvxTwo_9x4_loop: - // Clear 4 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 +mulAvxTwo_10x2_64Xor_loop: + // Load 2 outputs + VMOVDQU (R15), Y0 + VMOVDQU 32(R15), Y1 + VMOVDQU (R14), Y2 + VMOVDQU 32(R14), Y3 - // Load and process 32 bytes from input 0 to 4 outputs - VMOVDQU (DX), Y7 - ADDQ $0x20, DX - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 - // Load and process 32 bytes from input 1 to 4 outputs - VMOVDQU (BX), Y7 - ADDQ $0x20, BX - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y11 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 - // Load and process 32 bytes from input 2 to 4 outputs - VMOVDQU (SI), Y7 - ADDQ $0x20, SI - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y9 + VMOVDQU 32(R9), Y11 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 VMOVDQU 512(CX), Y5 VMOVDQU 544(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y9 + VMOVDQU 32(R10), Y11 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 - // Load and process 32 bytes from input 3 to 4 outputs - VMOVDQU (DI), Y7 - ADDQ $0x20, DI - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y9 + VMOVDQU 32(R11), Y11 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 VMOVDQU 768(CX), Y5 VMOVDQU 800(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 896(CX), Y5 - VMOVDQU 928(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 - VMOVDQU 960(CX), Y5 - VMOVDQU 992(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 - - // Load and process 32 bytes from input 4 to 4 outputs - VMOVDQU (R8), Y7 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU 1024(CX), Y5 - VMOVDQU 1056(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 - VMOVDQU 1088(CX), Y5 - VMOVDQU 1120(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 1152(CX), Y5 - VMOVDQU 1184(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 - VMOVDQU 1216(CX), Y5 - VMOVDQU 1248(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 - - // Load and process 32 bytes from input 5 to 4 outputs - VMOVDQU (R9), Y7 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU 1280(CX), Y5 - VMOVDQU 1312(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 - VMOVDQU 1344(CX), Y5 - VMOVDQU 1376(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 1408(CX), Y5 - VMOVDQU 1440(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 - VMOVDQU 1472(CX), Y5 - VMOVDQU 1504(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPXOR Y7, Y3, Y3 - // Load and process 32 bytes from input 6 to 4 outputs - VMOVDQU (R10), Y7 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU 1536(CX), Y5 - VMOVDQU 1568(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y9 + VMOVDQU 32(R12), Y11 + ADDQ $0x40, R12 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 - VMOVDQU 1600(CX), Y5 - VMOVDQU 1632(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 1664(CX), Y5 - VMOVDQU 1696(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 + VPXOR Y7, Y1, Y1 + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 - VMOVDQU 1728(CX), Y5 - VMOVDQU 1760(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPXOR Y7, Y3, Y3 - // Load and process 32 bytes from input 7 to 4 outputs - VMOVDQU (R11), Y7 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU 1792(CX), Y5 - VMOVDQU 1824(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 + // Load and process 64 bytes from input 8 to 2 outputs + VMOVDQU (R13), Y9 + VMOVDQU 32(R13), Y11 + ADDQ $0x40, R13 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 - VMOVDQU 1856(CX), Y5 - VMOVDQU 1888(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 1920(CX), Y5 - VMOVDQU 1952(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 + VPXOR Y7, Y1, Y1 + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 - VMOVDQU 1984(CX), Y5 - VMOVDQU 2016(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPXOR Y7, Y3, Y3 - // Load and process 32 bytes from input 8 to 4 outputs - VMOVDQU (AX), Y7 - ADDQ $0x20, AX - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU 2048(CX), Y5 - VMOVDQU 2080(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 + // Load and process 64 bytes from input 9 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 - VMOVDQU 2112(CX), Y5 - VMOVDQU 2144(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 2176(CX), Y5 - VMOVDQU 2208(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 + VPXOR Y7, Y1, Y1 + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 - VMOVDQU 2240(CX), Y5 - VMOVDQU 2272(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPXOR Y7, Y3, Y3 - // Store 4 outputs - VMOVDQU Y0, (R13) - ADDQ $0x20, R13 - VMOVDQU Y1, (R14) - ADDQ $0x20, R14 - VMOVDQU Y2, (R15) - ADDQ $0x20, R15 - VMOVDQU Y3, (R12) - ADDQ $0x20, R12 + // Store 2 outputs + VMOVDQU Y0, (R15) + VMOVDQU Y1, 32(R15) + ADDQ $0x40, R15 + VMOVDQU Y2, (R14) + VMOVDQU Y3, 32(R14) + ADDQ $0x40, R14 // Prepare for next loop - DECQ BP - JNZ mulAvxTwo_9x4_loop + DECQ AX + JNZ mulAvxTwo_10x2_64Xor_loop VZEROUPPER -mulAvxTwo_9x4_end: +mulAvxTwo_10x2_64Xor_end: RET -// func mulAvxTwo_9x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_10x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_9x5(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_10x3(SB), NOSPLIT, $8-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 100 YMM used + // Destination kept in GP registers + // Full registers estimated 68 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_9x5_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), R12 - MOVQ 192(DX), DX + JZ mulAvxTwo_10x3_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), R12 + MOVQ 216(AX), AX MOVQ out_base+48(FP), R13 - MOVQ start+72(FP), R14 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 // Add start offset to input - ADDQ R14, BX - ADDQ R14, SI - ADDQ R14, DI - ADDQ R14, R8 - ADDQ R14, R9 - ADDQ R14, R10 - ADDQ R14, R11 - ADDQ R14, R12 - ADDQ R14, DX - MOVQ $0x0000000f, R15 - MOVQ R15, X5 - VPBROADCASTB X5, Y5 + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X3 + VPBROADCASTB X3, Y3 + MOVQ n+80(FP), BP + SHRQ $0x05, BP -mulAvxTwo_9x5_loop: - // Clear 5 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 +mulAvxTwo_10x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU (CX), Y4 + VMOVDQU 32(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y0 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y1 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y2 - // Load and process 32 bytes from input 0 to 5 outputs - VMOVDQU (BX), Y8 + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (BX), Y6 ADDQ $0x20, BX - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU (CX), Y6 - VMOVDQU 32(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 64(CX), Y6 - VMOVDQU 96(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 128(CX), Y6 - VMOVDQU 160(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 192(CX), Y6 - VMOVDQU 224(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 256(CX), Y6 - VMOVDQU 288(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 192(CX), Y4 + VMOVDQU 224(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 256(CX), Y4 + VMOVDQU 288(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 320(CX), Y4 + VMOVDQU 352(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 - // Load and process 32 bytes from input 1 to 5 outputs - VMOVDQU (SI), Y8 + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (SI), Y6 ADDQ $0x20, SI - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU 320(CX), Y6 - VMOVDQU 352(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 384(CX), Y6 - VMOVDQU 416(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 448(CX), Y6 - VMOVDQU 480(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 512(CX), Y6 - VMOVDQU 544(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 576(CX), Y6 - VMOVDQU 608(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 384(CX), Y4 + VMOVDQU 416(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 448(CX), Y4 + VMOVDQU 480(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 512(CX), Y4 + VMOVDQU 544(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 - // Load and process 32 bytes from input 2 to 5 outputs - VMOVDQU (DI), Y8 + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (DI), Y6 ADDQ $0x20, DI - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU 640(CX), Y6 - VMOVDQU 672(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 704(CX), Y6 - VMOVDQU 736(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 768(CX), Y6 - VMOVDQU 800(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 832(CX), Y6 - VMOVDQU 864(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 896(CX), Y6 - VMOVDQU 928(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 576(CX), Y4 + VMOVDQU 608(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 640(CX), Y4 + VMOVDQU 672(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 704(CX), Y4 + VMOVDQU 736(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 - // Load and process 32 bytes from input 3 to 5 outputs - VMOVDQU (R8), Y8 + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R8), Y6 ADDQ $0x20, R8 - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU 960(CX), Y6 - VMOVDQU 992(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 1024(CX), Y6 - VMOVDQU 1056(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 1088(CX), Y6 - VMOVDQU 1120(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 1152(CX), Y6 - VMOVDQU 1184(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 1216(CX), Y6 - VMOVDQU 1248(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 768(CX), Y4 + VMOVDQU 800(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 832(CX), Y4 + VMOVDQU 864(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 896(CX), Y4 + VMOVDQU 928(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 - // Load and process 32 bytes from input 4 to 5 outputs - VMOVDQU (R9), Y8 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU 1280(CX), Y6 - VMOVDQU 1312(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 1344(CX), Y6 - VMOVDQU 1376(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 1408(CX), Y6 - VMOVDQU 1440(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 1472(CX), Y6 - VMOVDQU 1504(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 1536(CX), Y6 - VMOVDQU 1568(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R9), Y6 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 960(CX), Y4 + VMOVDQU 992(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1024(CX), Y4 + VMOVDQU 1056(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1088(CX), Y4 + VMOVDQU 1120(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 - // Load and process 32 bytes from input 5 to 5 outputs - VMOVDQU (R10), Y8 + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R10), Y6 ADDQ $0x20, R10 - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU 1600(CX), Y6 - VMOVDQU 1632(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 1664(CX), Y6 - VMOVDQU 1696(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 1728(CX), Y6 - VMOVDQU 1760(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 1792(CX), Y6 - VMOVDQU 1824(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 1856(CX), Y6 - VMOVDQU 1888(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1152(CX), Y4 + VMOVDQU 1184(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1216(CX), Y4 + VMOVDQU 1248(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1280(CX), Y4 + VMOVDQU 1312(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 - // Load and process 32 bytes from input 6 to 5 outputs - VMOVDQU (R11), Y8 + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (R11), Y6 ADDQ $0x20, R11 - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU 1920(CX), Y6 - VMOVDQU 1952(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 1984(CX), Y6 - VMOVDQU 2016(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 2048(CX), Y6 - VMOVDQU 2080(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 2112(CX), Y6 - VMOVDQU 2144(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 2176(CX), Y6 - VMOVDQU 2208(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1344(CX), Y4 + VMOVDQU 1376(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1408(CX), Y4 + VMOVDQU 1440(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1472(CX), Y4 + VMOVDQU 1504(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 - // Load and process 32 bytes from input 7 to 5 outputs - VMOVDQU (R12), Y8 + // Load and process 32 bytes from input 8 to 3 outputs + VMOVDQU (R12), Y6 ADDQ $0x20, R12 - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU 2240(CX), Y6 - VMOVDQU 2272(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 2304(CX), Y6 - VMOVDQU 2336(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 2368(CX), Y6 - VMOVDQU 2400(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 2432(CX), Y6 - VMOVDQU 2464(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 2496(CX), Y6 - VMOVDQU 2528(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1536(CX), Y4 + VMOVDQU 1568(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1600(CX), Y4 + VMOVDQU 1632(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1664(CX), Y4 + VMOVDQU 1696(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 - // Load and process 32 bytes from input 8 to 5 outputs - VMOVDQU (DX), Y8 - ADDQ $0x20, DX - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU 2560(CX), Y6 - VMOVDQU 2592(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 2624(CX), Y6 - VMOVDQU 2656(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 2688(CX), Y6 - VMOVDQU 2720(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 2752(CX), Y6 - VMOVDQU 2784(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 2816(CX), Y6 - VMOVDQU 2848(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + // Load and process 32 bytes from input 9 to 3 outputs + VMOVDQU (AX), Y6 + ADDQ $0x20, AX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1728(CX), Y4 + VMOVDQU 1760(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1792(CX), Y4 + VMOVDQU 1824(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1856(CX), Y4 + VMOVDQU 1888(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 - // Store 5 outputs - MOVQ (R13), R15 - VMOVDQU Y0, (R15)(R14*1) - MOVQ 24(R13), R15 - VMOVDQU Y1, (R15)(R14*1) - MOVQ 48(R13), R15 - VMOVDQU Y2, (R15)(R14*1) - MOVQ 72(R13), R15 - VMOVDQU Y3, (R15)(R14*1) - MOVQ 96(R13), R15 - VMOVDQU Y4, (R15)(R14*1) + // Store 3 outputs + VMOVDQU Y0, (R14) + ADDQ $0x20, R14 + VMOVDQU Y1, (R15) + ADDQ $0x20, R15 + VMOVDQU Y2, (R13) + ADDQ $0x20, R13 // Prepare for next loop - ADDQ $0x20, R14 - DECQ AX - JNZ mulAvxTwo_9x5_loop + DECQ BP + JNZ mulAvxTwo_10x3_loop VZEROUPPER -mulAvxTwo_9x5_end: +mulAvxTwo_10x3_end: RET -// func mulAvxTwo_9x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_10x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_9x6(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_10x3_64(SB), $8-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 119 YMM used + // Destination kept in GP registers + // Full registers estimated 130 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX + SHRQ $0x06, AX TESTQ AX, AX - JZ mulAvxTwo_9x6_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), R12 - MOVQ 192(DX), DX + JZ mulAvxTwo_10x3_64_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), R12 + MOVQ 216(AX), AX MOVQ out_base+48(FP), R13 - MOVQ start+72(FP), R14 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 // Add start offset to input - ADDQ R14, BX - ADDQ R14, SI - ADDQ R14, DI - ADDQ R14, R8 - ADDQ R14, R9 - ADDQ R14, R10 - ADDQ R14, R11 - ADDQ R14, R12 - ADDQ R14, DX - MOVQ $0x0000000f, R15 - MOVQ R15, X6 + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X6 VPBROADCASTB X6, Y6 -mulAvxTwo_9x6_loop: - // Clear 6 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP - // Load and process 32 bytes from input 0 to 6 outputs - VMOVDQU (BX), Y9 - ADDQ $0x20, BX - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 +mulAvxTwo_10x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y0 + VPXOR Y9, Y10, Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y2 + VPXOR Y9, Y10, Y3 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y4 + VPXOR Y9, Y10, Y5 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 - // Load and process 32 bytes from input 1 to 6 outputs - VMOVDQU (SI), Y9 - ADDQ $0x20, SI - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU (DI), Y11 + VMOVDQU 32(DI), Y13 + ADDQ $0x40, DI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 - // Load and process 32 bytes from input 2 to 6 outputs - VMOVDQU (DI), Y9 - ADDQ $0x20, DI - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU (R8), Y11 + VMOVDQU 32(R8), Y13 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 VMOVDQU 768(CX), Y7 VMOVDQU 800(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU (R9), Y11 + VMOVDQU 32(R9), Y13 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 - // Load and process 32 bytes from input 3 to 6 outputs - VMOVDQU (R8), Y9 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU (R10), Y11 + VMOVDQU 32(R10), Y13 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 VMOVDQU 1152(CX), Y7 VMOVDQU 1184(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU (R11), Y11 + VMOVDQU 32(R11), Y13 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 - // Load and process 32 bytes from input 4 to 6 outputs - VMOVDQU (R9), Y9 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 + // Load and process 64 bytes from input 8 to 3 outputs + VMOVDQU (R12), Y11 + VMOVDQU 32(R12), Y13 + ADDQ $0x40, R12 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 VMOVDQU 1536(CX), Y7 VMOVDQU 1568(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 9 to 3 outputs + VMOVDQU (AX), Y11 + VMOVDQU 32(AX), Y13 + ADDQ $0x40, AX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 - - // Load and process 32 bytes from input 5 to 6 outputs - VMOVDQU (R10), Y9 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 - VMOVDQU 1920(CX), Y7 - VMOVDQU 1952(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 - VMOVDQU 1984(CX), Y7 - VMOVDQU 2016(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 - VMOVDQU 2048(CX), Y7 - VMOVDQU 2080(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 - VMOVDQU 2112(CX), Y7 - VMOVDQU 2144(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 - VMOVDQU 2176(CX), Y7 - VMOVDQU 2208(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 - VMOVDQU 2240(CX), Y7 - VMOVDQU 2272(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 - - // Load and process 32 bytes from input 6 to 6 outputs - VMOVDQU (R11), Y9 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 - VMOVDQU 2304(CX), Y7 - VMOVDQU 2336(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 - VMOVDQU 2368(CX), Y7 - VMOVDQU 2400(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 - VMOVDQU 2432(CX), Y7 - VMOVDQU 2464(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 - VMOVDQU 2496(CX), Y7 - VMOVDQU 2528(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 - VMOVDQU 2560(CX), Y7 - VMOVDQU 2592(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 - VMOVDQU 2624(CX), Y7 - VMOVDQU 2656(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 - - // Load and process 32 bytes from input 7 to 6 outputs - VMOVDQU (R12), Y9 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 - VMOVDQU 2688(CX), Y7 - VMOVDQU 2720(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 - VMOVDQU 2752(CX), Y7 - VMOVDQU 2784(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 - VMOVDQU 2816(CX), Y7 - VMOVDQU 2848(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 - VMOVDQU 2880(CX), Y7 - VMOVDQU 2912(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 - VMOVDQU 2944(CX), Y7 - VMOVDQU 2976(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 - VMOVDQU 3008(CX), Y7 - VMOVDQU 3040(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 - - // Load and process 32 bytes from input 8 to 6 outputs - VMOVDQU (DX), Y9 - ADDQ $0x20, DX - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 - VMOVDQU 3072(CX), Y7 - VMOVDQU 3104(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 - VMOVDQU 3136(CX), Y7 - VMOVDQU 3168(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 - VMOVDQU 3200(CX), Y7 - VMOVDQU 3232(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 - VMOVDQU 3264(CX), Y7 - VMOVDQU 3296(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 - VMOVDQU 3328(CX), Y7 - VMOVDQU 3360(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 - VMOVDQU 3392(CX), Y7 - VMOVDQU 3424(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPXOR Y9, Y5, Y5 - // Store 6 outputs - MOVQ (R13), R15 - VMOVDQU Y0, (R15)(R14*1) - MOVQ 24(R13), R15 - VMOVDQU Y1, (R15)(R14*1) - MOVQ 48(R13), R15 - VMOVDQU Y2, (R15)(R14*1) - MOVQ 72(R13), R15 - VMOVDQU Y3, (R15)(R14*1) - MOVQ 96(R13), R15 - VMOVDQU Y4, (R15)(R14*1) - MOVQ 120(R13), R15 - VMOVDQU Y5, (R15)(R14*1) + // Store 3 outputs + VMOVDQU Y0, (R14) + VMOVDQU Y1, 32(R14) + ADDQ $0x40, R14 + VMOVDQU Y2, (R15) + VMOVDQU Y3, 32(R15) + ADDQ $0x40, R15 + VMOVDQU Y4, (R13) + VMOVDQU Y5, 32(R13) + ADDQ $0x40, R13 // Prepare for next loop - ADDQ $0x20, R14 - DECQ AX - JNZ mulAvxTwo_9x6_loop + DECQ BP + JNZ mulAvxTwo_10x3_64_loop VZEROUPPER -mulAvxTwo_9x6_end: +mulAvxTwo_10x3_64_end: RET -// func mulAvxTwo_9x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_10x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_9x7(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_10x3Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 138 YMM used + // Destination kept in GP registers + // Full registers estimated 68 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_9x7_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), R12 - MOVQ 192(DX), DX + JZ mulAvxTwo_10x3Xor_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), R12 + MOVQ 216(AX), AX MOVQ out_base+48(FP), R13 - MOVQ start+72(FP), R14 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 // Add start offset to input - ADDQ R14, BX - ADDQ R14, SI - ADDQ R14, DI - ADDQ R14, R8 - ADDQ R14, R9 - ADDQ R14, R10 - ADDQ R14, R11 - ADDQ R14, R12 - ADDQ R14, DX - MOVQ $0x0000000f, R15 - MOVQ R15, X7 - VPBROADCASTB X7, Y7 + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X3 + VPBROADCASTB X3, Y3 + MOVQ n+80(FP), BP + SHRQ $0x05, BP -mulAvxTwo_9x7_loop: - // Clear 7 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 +mulAvxTwo_10x3Xor_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU (R14), Y0 + VMOVDQU (CX), Y4 + VMOVDQU 32(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU (R15), Y1 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU (R13), Y2 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 - // Load and process 32 bytes from input 0 to 7 outputs - VMOVDQU (BX), Y10 + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (BX), Y6 ADDQ $0x20, BX - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU (CX), Y8 - VMOVDQU 32(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 64(CX), Y8 - VMOVDQU 96(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 128(CX), Y8 - VMOVDQU 160(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 192(CX), Y8 - VMOVDQU 224(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 256(CX), Y8 - VMOVDQU 288(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 320(CX), Y8 - VMOVDQU 352(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 384(CX), Y8 - VMOVDQU 416(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 192(CX), Y4 + VMOVDQU 224(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 256(CX), Y4 + VMOVDQU 288(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 320(CX), Y4 + VMOVDQU 352(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 - // Load and process 32 bytes from input 1 to 7 outputs - VMOVDQU (SI), Y10 + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (SI), Y6 ADDQ $0x20, SI - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU 448(CX), Y8 - VMOVDQU 480(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 512(CX), Y8 - VMOVDQU 544(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 576(CX), Y8 - VMOVDQU 608(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 640(CX), Y8 - VMOVDQU 672(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 704(CX), Y8 - VMOVDQU 736(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 768(CX), Y8 - VMOVDQU 800(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 832(CX), Y8 - VMOVDQU 864(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 384(CX), Y4 + VMOVDQU 416(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 448(CX), Y4 + VMOVDQU 480(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 512(CX), Y4 + VMOVDQU 544(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 - // Load and process 32 bytes from input 2 to 7 outputs - VMOVDQU (DI), Y10 + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (DI), Y6 ADDQ $0x20, DI - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU 896(CX), Y8 - VMOVDQU 928(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 960(CX), Y8 - VMOVDQU 992(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 1024(CX), Y8 - VMOVDQU 1056(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 1088(CX), Y8 - VMOVDQU 1120(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 1152(CX), Y8 - VMOVDQU 1184(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 1216(CX), Y8 - VMOVDQU 1248(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 1280(CX), Y8 - VMOVDQU 1312(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 576(CX), Y4 + VMOVDQU 608(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 640(CX), Y4 + VMOVDQU 672(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 704(CX), Y4 + VMOVDQU 736(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 - // Load and process 32 bytes from input 3 to 7 outputs - VMOVDQU (R8), Y10 + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R8), Y6 ADDQ $0x20, R8 - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU 1344(CX), Y8 - VMOVDQU 1376(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 1408(CX), Y8 - VMOVDQU 1440(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 1472(CX), Y8 - VMOVDQU 1504(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 1536(CX), Y8 - VMOVDQU 1568(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 1600(CX), Y8 - VMOVDQU 1632(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 1664(CX), Y8 - VMOVDQU 1696(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 1728(CX), Y8 - VMOVDQU 1760(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 768(CX), Y4 + VMOVDQU 800(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 832(CX), Y4 + VMOVDQU 864(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 896(CX), Y4 + VMOVDQU 928(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 - // Load and process 32 bytes from input 4 to 7 outputs - VMOVDQU (R9), Y10 + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R9), Y6 ADDQ $0x20, R9 - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU 1792(CX), Y8 - VMOVDQU 1824(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 1856(CX), Y8 - VMOVDQU 1888(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 1920(CX), Y8 - VMOVDQU 1952(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 1984(CX), Y8 - VMOVDQU 2016(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 2048(CX), Y8 - VMOVDQU 2080(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 2112(CX), Y8 - VMOVDQU 2144(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 2176(CX), Y8 - VMOVDQU 2208(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 960(CX), Y4 + VMOVDQU 992(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1024(CX), Y4 + VMOVDQU 1056(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1088(CX), Y4 + VMOVDQU 1120(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 - // Load and process 32 bytes from input 5 to 7 outputs - VMOVDQU (R10), Y10 + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R10), Y6 ADDQ $0x20, R10 - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU 2240(CX), Y8 - VMOVDQU 2272(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 2304(CX), Y8 - VMOVDQU 2336(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 2368(CX), Y8 - VMOVDQU 2400(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 2432(CX), Y8 - VMOVDQU 2464(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 2496(CX), Y8 - VMOVDQU 2528(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 2560(CX), Y8 - VMOVDQU 2592(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 2624(CX), Y8 - VMOVDQU 2656(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1152(CX), Y4 + VMOVDQU 1184(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1216(CX), Y4 + VMOVDQU 1248(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1280(CX), Y4 + VMOVDQU 1312(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 - // Load and process 32 bytes from input 6 to 7 outputs - VMOVDQU (R11), Y10 + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (R11), Y6 ADDQ $0x20, R11 - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU 2688(CX), Y8 - VMOVDQU 2720(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 2752(CX), Y8 - VMOVDQU 2784(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 2816(CX), Y8 - VMOVDQU 2848(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 2880(CX), Y8 - VMOVDQU 2912(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 2944(CX), Y8 - VMOVDQU 2976(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 3008(CX), Y8 - VMOVDQU 3040(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 3072(CX), Y8 - VMOVDQU 3104(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1344(CX), Y4 + VMOVDQU 1376(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1408(CX), Y4 + VMOVDQU 1440(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1472(CX), Y4 + VMOVDQU 1504(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 - // Load and process 32 bytes from input 7 to 7 outputs - VMOVDQU (R12), Y10 + // Load and process 32 bytes from input 8 to 3 outputs + VMOVDQU (R12), Y6 ADDQ $0x20, R12 - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU 3136(CX), Y8 - VMOVDQU 3168(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 3200(CX), Y8 - VMOVDQU 3232(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 3264(CX), Y8 - VMOVDQU 3296(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 3328(CX), Y8 - VMOVDQU 3360(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 3392(CX), Y8 - VMOVDQU 3424(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 3456(CX), Y8 - VMOVDQU 3488(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 3520(CX), Y8 - VMOVDQU 3552(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1536(CX), Y4 + VMOVDQU 1568(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1600(CX), Y4 + VMOVDQU 1632(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1664(CX), Y4 + VMOVDQU 1696(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 - // Load and process 32 bytes from input 8 to 7 outputs - VMOVDQU (DX), Y10 - ADDQ $0x20, DX - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU 3584(CX), Y8 - VMOVDQU 3616(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 3648(CX), Y8 - VMOVDQU 3680(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 3712(CX), Y8 - VMOVDQU 3744(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 3776(CX), Y8 - VMOVDQU 3808(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 3840(CX), Y8 - VMOVDQU 3872(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 3904(CX), Y8 - VMOVDQU 3936(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 3968(CX), Y8 - VMOVDQU 4000(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + // Load and process 32 bytes from input 9 to 3 outputs + VMOVDQU (AX), Y6 + ADDQ $0x20, AX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1728(CX), Y4 + VMOVDQU 1760(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1792(CX), Y4 + VMOVDQU 1824(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1856(CX), Y4 + VMOVDQU 1888(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 - // Store 7 outputs - MOVQ (R13), R15 - VMOVDQU Y0, (R15)(R14*1) - MOVQ 24(R13), R15 - VMOVDQU Y1, (R15)(R14*1) - MOVQ 48(R13), R15 - VMOVDQU Y2, (R15)(R14*1) - MOVQ 72(R13), R15 - VMOVDQU Y3, (R15)(R14*1) - MOVQ 96(R13), R15 - VMOVDQU Y4, (R15)(R14*1) - MOVQ 120(R13), R15 - VMOVDQU Y5, (R15)(R14*1) - MOVQ 144(R13), R15 - VMOVDQU Y6, (R15)(R14*1) + // Store 3 outputs + VMOVDQU Y0, (R14) + ADDQ $0x20, R14 + VMOVDQU Y1, (R15) + ADDQ $0x20, R15 + VMOVDQU Y2, (R13) + ADDQ $0x20, R13 // Prepare for next loop - ADDQ $0x20, R14 - DECQ AX - JNZ mulAvxTwo_9x7_loop + DECQ BP + JNZ mulAvxTwo_10x3Xor_loop VZEROUPPER -mulAvxTwo_9x7_end: +mulAvxTwo_10x3Xor_end: RET -// func mulAvxTwo_9x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_10x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_9x8(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_10x3_64Xor(SB), $8-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 157 YMM used + // Destination kept in GP registers + // Full registers estimated 130 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX + SHRQ $0x06, AX TESTQ AX, AX - JZ mulAvxTwo_9x8_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), R12 - MOVQ 192(DX), DX + JZ mulAvxTwo_10x3_64Xor_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), R12 + MOVQ 216(AX), AX MOVQ out_base+48(FP), R13 - MOVQ start+72(FP), R14 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 // Add start offset to input - ADDQ R14, BX - ADDQ R14, SI - ADDQ R14, DI - ADDQ R14, R8 - ADDQ R14, R9 - ADDQ R14, R10 - ADDQ R14, R11 - ADDQ R14, R12 - ADDQ R14, DX - MOVQ $0x0000000f, R15 - MOVQ R15, X8 - VPBROADCASTB X8, Y8 + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X6 + VPBROADCASTB X6, Y6 -mulAvxTwo_9x8_loop: - // Clear 8 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP - // Load and process 32 bytes from input 0 to 8 outputs - VMOVDQU (BX), Y11 - ADDQ $0x20, BX +mulAvxTwo_10x3_64Xor_loop: + // Load 3 outputs + VMOVDQU (R14), Y0 + VMOVDQU 32(R14), Y1 + VMOVDQU (R15), Y2 + VMOVDQU 32(R15), Y3 + VMOVDQU (R13), Y4 + VMOVDQU 32(R13), Y5 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU (CX), Y9 - VMOVDQU 32(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 64(CX), Y9 - VMOVDQU 96(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 - VMOVDQU 128(CX), Y9 - VMOVDQU 160(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 192(CX), Y9 - VMOVDQU 224(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 - VMOVDQU 256(CX), Y9 - VMOVDQU 288(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 320(CX), Y9 - VMOVDQU 352(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 - VMOVDQU 384(CX), Y9 - VMOVDQU 416(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 448(CX), Y9 - VMOVDQU 480(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 - // Load and process 32 bytes from input 1 to 8 outputs - VMOVDQU (SI), Y11 - ADDQ $0x20, SI + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU 512(CX), Y9 - VMOVDQU 544(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 576(CX), Y9 - VMOVDQU 608(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 - VMOVDQU 640(CX), Y9 - VMOVDQU 672(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 704(CX), Y9 - VMOVDQU 736(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 - VMOVDQU 768(CX), Y9 - VMOVDQU 800(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 832(CX), Y9 - VMOVDQU 864(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 - VMOVDQU 896(CX), Y9 - VMOVDQU 928(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 960(CX), Y9 - VMOVDQU 992(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 - // Load and process 32 bytes from input 2 to 8 outputs - VMOVDQU (DI), Y11 - ADDQ $0x20, DI + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU 1024(CX), Y9 - VMOVDQU 1056(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 1088(CX), Y9 - VMOVDQU 1120(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 - VMOVDQU 1152(CX), Y9 - VMOVDQU 1184(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 1216(CX), Y9 - VMOVDQU 1248(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 - VMOVDQU 1280(CX), Y9 - VMOVDQU 1312(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 1344(CX), Y9 - VMOVDQU 1376(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 - VMOVDQU 1408(CX), Y9 - VMOVDQU 1440(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU (DI), Y11 + VMOVDQU 32(DI), Y13 + ADDQ $0x40, DI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 1472(CX), Y9 - VMOVDQU 1504(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 - // Load and process 32 bytes from input 3 to 8 outputs + // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R8), Y11 - ADDQ $0x20, R8 + VMOVDQU 32(R8), Y13 + ADDQ $0x40, R8 VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU 1536(CX), Y9 - VMOVDQU 1568(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 1600(CX), Y9 - VMOVDQU 1632(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 - VMOVDQU 1664(CX), Y9 - VMOVDQU 1696(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 1728(CX), Y9 - VMOVDQU 1760(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 - VMOVDQU 1792(CX), Y9 - VMOVDQU 1824(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 1856(CX), Y9 - VMOVDQU 1888(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 - VMOVDQU 1920(CX), Y9 - VMOVDQU 1952(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 1984(CX), Y9 - VMOVDQU 2016(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 - // Load and process 32 bytes from input 4 to 8 outputs + // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (R9), Y11 - ADDQ $0x20, R9 + VMOVDQU 32(R9), Y13 + ADDQ $0x40, R9 VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU 2048(CX), Y9 - VMOVDQU 2080(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 2112(CX), Y9 - VMOVDQU 2144(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 - VMOVDQU 2176(CX), Y9 - VMOVDQU 2208(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 2240(CX), Y9 - VMOVDQU 2272(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 - VMOVDQU 2304(CX), Y9 - VMOVDQU 2336(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 2368(CX), Y9 - VMOVDQU 2400(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 - VMOVDQU 2432(CX), Y9 - VMOVDQU 2464(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 2496(CX), Y9 - VMOVDQU 2528(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 - // Load and process 32 bytes from input 5 to 8 outputs + // Load and process 64 bytes from input 6 to 3 outputs VMOVDQU (R10), Y11 - ADDQ $0x20, R10 + VMOVDQU 32(R10), Y13 + ADDQ $0x40, R10 VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU 2560(CX), Y9 - VMOVDQU 2592(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 2624(CX), Y9 - VMOVDQU 2656(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 - VMOVDQU 2688(CX), Y9 - VMOVDQU 2720(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 2752(CX), Y9 - VMOVDQU 2784(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 - VMOVDQU 2816(CX), Y9 - VMOVDQU 2848(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 2880(CX), Y9 - VMOVDQU 2912(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 - VMOVDQU 2944(CX), Y9 - VMOVDQU 2976(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 3008(CX), Y9 - VMOVDQU 3040(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 - // Load and process 32 bytes from input 6 to 8 outputs + // Load and process 64 bytes from input 7 to 3 outputs VMOVDQU (R11), Y11 - ADDQ $0x20, R11 + VMOVDQU 32(R11), Y13 + ADDQ $0x40, R11 VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU 3072(CX), Y9 - VMOVDQU 3104(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 3136(CX), Y9 - VMOVDQU 3168(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 - VMOVDQU 3200(CX), Y9 - VMOVDQU 3232(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 3264(CX), Y9 - VMOVDQU 3296(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 - VMOVDQU 3328(CX), Y9 - VMOVDQU 3360(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 3392(CX), Y9 - VMOVDQU 3424(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 - VMOVDQU 3456(CX), Y9 - VMOVDQU 3488(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 3520(CX), Y9 - VMOVDQU 3552(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 - // Load and process 32 bytes from input 7 to 8 outputs + // Load and process 64 bytes from input 8 to 3 outputs VMOVDQU (R12), Y11 - ADDQ $0x20, R12 + VMOVDQU 32(R12), Y13 + ADDQ $0x40, R12 VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU 3584(CX), Y9 - VMOVDQU 3616(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 3648(CX), Y9 - VMOVDQU 3680(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 - VMOVDQU 3712(CX), Y9 - VMOVDQU 3744(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 3776(CX), Y9 - VMOVDQU 3808(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 - VMOVDQU 3840(CX), Y9 - VMOVDQU 3872(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 3904(CX), Y9 - VMOVDQU 3936(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 - VMOVDQU 3968(CX), Y9 - VMOVDQU 4000(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 4032(CX), Y9 - VMOVDQU 4064(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 - // Load and process 32 bytes from input 8 to 8 outputs - VMOVDQU (DX), Y11 - ADDQ $0x20, DX + // Load and process 64 bytes from input 9 to 3 outputs + VMOVDQU (AX), Y11 + VMOVDQU 32(AX), Y13 + ADDQ $0x40, AX VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU 4096(CX), Y9 - VMOVDQU 4128(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 4160(CX), Y9 - VMOVDQU 4192(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1728(CX), Y7 + VMOVDQU 1760(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 - VMOVDQU 4224(CX), Y9 - VMOVDQU 4256(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 4288(CX), Y9 - VMOVDQU 4320(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VMOVDQU 1792(CX), Y7 + VMOVDQU 1824(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 - VMOVDQU 4352(CX), Y9 - VMOVDQU 4384(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 4416(CX), Y9 - VMOVDQU 4448(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 + VMOVDQU 1856(CX), Y7 + VMOVDQU 1888(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 - VMOVDQU 4480(CX), Y9 - VMOVDQU 4512(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 4544(CX), Y9 - VMOVDQU 4576(CX), Y10 - VPSHUFB Y11, Y9, Y9 - VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 - // Store 8 outputs - MOVQ (R13), R15 - VMOVDQU Y0, (R15)(R14*1) - MOVQ 24(R13), R15 - VMOVDQU Y1, (R15)(R14*1) - MOVQ 48(R13), R15 - VMOVDQU Y2, (R15)(R14*1) - MOVQ 72(R13), R15 - VMOVDQU Y3, (R15)(R14*1) - MOVQ 96(R13), R15 - VMOVDQU Y4, (R15)(R14*1) - MOVQ 120(R13), R15 - VMOVDQU Y5, (R15)(R14*1) - MOVQ 144(R13), R15 - VMOVDQU Y6, (R15)(R14*1) - MOVQ 168(R13), R15 - VMOVDQU Y7, (R15)(R14*1) + // Store 3 outputs + VMOVDQU Y0, (R14) + VMOVDQU Y1, 32(R14) + ADDQ $0x40, R14 + VMOVDQU Y2, (R15) + VMOVDQU Y3, 32(R15) + ADDQ $0x40, R15 + VMOVDQU Y4, (R13) + VMOVDQU Y5, 32(R13) + ADDQ $0x40, R13 // Prepare for next loop - ADDQ $0x20, R14 - DECQ AX - JNZ mulAvxTwo_9x8_loop + DECQ BP + JNZ mulAvxTwo_10x3_64Xor_loop VZEROUPPER -mulAvxTwo_9x8_end: +mulAvxTwo_10x3_64Xor_end: RET -// func mulAvxTwo_9x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_10x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_9x9(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_10x4(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack - // Full registers estimated 176 YMM used + // Full registers estimated 89 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_9x9_end + JZ mulAvxTwo_10x4_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI @@ -28606,625 +62296,363 @@ TEXT ·mulAvxTwo_9x9(SB), NOSPLIT, $0-88 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 - MOVQ 192(DX), DX - MOVQ out_base+48(FP), R13 - MOVQ start+72(FP), R14 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 // Add start offset to input - ADDQ R14, BX - ADDQ R14, SI - ADDQ R14, DI - ADDQ R14, R8 - ADDQ R14, R9 - ADDQ R14, R10 - ADDQ R14, R11 - ADDQ R14, R12 - ADDQ R14, DX - MOVQ $0x0000000f, R15 - MOVQ R15, X9 - VPBROADCASTB X9, Y9 - -mulAvxTwo_9x9_loop: - // Clear 9 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + MOVQ $0x0000000f, BP + MOVQ BP, X4 + VPBROADCASTB X4, Y4 - // Load and process 32 bytes from input 0 to 9 outputs - VMOVDQU (BX), Y12 +mulAvxTwo_10x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y7 ADDQ $0x20, BX - VPSRLQ $0x04, Y12, Y13 - VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VMOVDQU (CX), Y10 - VMOVDQU 32(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 - VMOVDQU 64(CX), Y10 - VMOVDQU 96(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 - VMOVDQU 128(CX), Y10 - VMOVDQU 160(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 - VMOVDQU 192(CX), Y10 - VMOVDQU 224(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 - VMOVDQU 256(CX), Y10 - VMOVDQU 288(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 - VMOVDQU 320(CX), Y10 - VMOVDQU 352(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 - VMOVDQU 384(CX), Y10 - VMOVDQU 416(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 - VMOVDQU 448(CX), Y10 - VMOVDQU 480(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 - VMOVDQU 512(CX), Y10 - VMOVDQU 544(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y0 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y1 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y2 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y3 - // Load and process 32 bytes from input 1 to 9 outputs - VMOVDQU (SI), Y12 + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y7 ADDQ $0x20, SI - VPSRLQ $0x04, Y12, Y13 - VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VMOVDQU 576(CX), Y10 - VMOVDQU 608(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 - VMOVDQU 640(CX), Y10 - VMOVDQU 672(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 - VMOVDQU 704(CX), Y10 - VMOVDQU 736(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 - VMOVDQU 768(CX), Y10 - VMOVDQU 800(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 - VMOVDQU 832(CX), Y10 - VMOVDQU 864(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 - VMOVDQU 896(CX), Y10 - VMOVDQU 928(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 - VMOVDQU 960(CX), Y10 - VMOVDQU 992(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 - VMOVDQU 1024(CX), Y10 - VMOVDQU 1056(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 - VMOVDQU 1088(CX), Y10 - VMOVDQU 1120(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 - // Load and process 32 bytes from input 2 to 9 outputs - VMOVDQU (DI), Y12 + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y7 ADDQ $0x20, DI - VPSRLQ $0x04, Y12, Y13 - VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VMOVDQU 1152(CX), Y10 - VMOVDQU 1184(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 - VMOVDQU 1216(CX), Y10 - VMOVDQU 1248(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 - VMOVDQU 1280(CX), Y10 - VMOVDQU 1312(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 - VMOVDQU 1344(CX), Y10 - VMOVDQU 1376(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 - VMOVDQU 1408(CX), Y10 - VMOVDQU 1440(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 - VMOVDQU 1472(CX), Y10 - VMOVDQU 1504(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 - VMOVDQU 1536(CX), Y10 - VMOVDQU 1568(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 - VMOVDQU 1600(CX), Y10 - VMOVDQU 1632(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 - VMOVDQU 1664(CX), Y10 - VMOVDQU 1696(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 - // Load and process 32 bytes from input 3 to 9 outputs - VMOVDQU (R8), Y12 + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y7 ADDQ $0x20, R8 - VPSRLQ $0x04, Y12, Y13 - VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VMOVDQU 1728(CX), Y10 - VMOVDQU 1760(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 - VMOVDQU 1792(CX), Y10 - VMOVDQU 1824(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 - VMOVDQU 1856(CX), Y10 - VMOVDQU 1888(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 - VMOVDQU 1920(CX), Y10 - VMOVDQU 1952(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 - VMOVDQU 1984(CX), Y10 - VMOVDQU 2016(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 - VMOVDQU 2048(CX), Y10 - VMOVDQU 2080(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 - VMOVDQU 2112(CX), Y10 - VMOVDQU 2144(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 - VMOVDQU 2176(CX), Y10 - VMOVDQU 2208(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 - VMOVDQU 2240(CX), Y10 - VMOVDQU 2272(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 - // Load and process 32 bytes from input 4 to 9 outputs - VMOVDQU (R9), Y12 + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y7 ADDQ $0x20, R9 - VPSRLQ $0x04, Y12, Y13 - VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VMOVDQU 2304(CX), Y10 - VMOVDQU 2336(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 - VMOVDQU 2368(CX), Y10 - VMOVDQU 2400(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 - VMOVDQU 2432(CX), Y10 - VMOVDQU 2464(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 - VMOVDQU 2496(CX), Y10 - VMOVDQU 2528(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 - VMOVDQU 2560(CX), Y10 - VMOVDQU 2592(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 - VMOVDQU 2624(CX), Y10 - VMOVDQU 2656(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 - VMOVDQU 2688(CX), Y10 - VMOVDQU 2720(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 - VMOVDQU 2752(CX), Y10 - VMOVDQU 2784(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 - VMOVDQU 2816(CX), Y10 - VMOVDQU 2848(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 - - // Load and process 32 bytes from input 5 to 9 outputs - VMOVDQU (R10), Y12 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y12, Y13 - VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VMOVDQU 2880(CX), Y10 - VMOVDQU 2912(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 - VMOVDQU 2944(CX), Y10 - VMOVDQU 2976(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 - VMOVDQU 3008(CX), Y10 - VMOVDQU 3040(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 - VMOVDQU 3072(CX), Y10 - VMOVDQU 3104(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 - VMOVDQU 3136(CX), Y10 - VMOVDQU 3168(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 - VMOVDQU 3200(CX), Y10 - VMOVDQU 3232(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 - VMOVDQU 3264(CX), Y10 - VMOVDQU 3296(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 - VMOVDQU 3328(CX), Y10 - VMOVDQU 3360(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 - VMOVDQU 3392(CX), Y10 - VMOVDQU 3424(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 - // Load and process 32 bytes from input 6 to 9 outputs - VMOVDQU (R11), Y12 + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y7 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1280(CX), Y5 + VMOVDQU 1312(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1344(CX), Y5 + VMOVDQU 1376(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1408(CX), Y5 + VMOVDQU 1440(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1472(CX), Y5 + VMOVDQU 1504(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y7 ADDQ $0x20, R11 - VPSRLQ $0x04, Y12, Y13 - VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VMOVDQU 3456(CX), Y10 - VMOVDQU 3488(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 - VMOVDQU 3520(CX), Y10 - VMOVDQU 3552(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 - VMOVDQU 3584(CX), Y10 - VMOVDQU 3616(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 - VMOVDQU 3648(CX), Y10 - VMOVDQU 3680(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 - VMOVDQU 3712(CX), Y10 - VMOVDQU 3744(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 - VMOVDQU 3776(CX), Y10 - VMOVDQU 3808(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 - VMOVDQU 3840(CX), Y10 - VMOVDQU 3872(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 - VMOVDQU 3904(CX), Y10 - VMOVDQU 3936(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 - VMOVDQU 3968(CX), Y10 - VMOVDQU 4000(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1536(CX), Y5 + VMOVDQU 1568(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1600(CX), Y5 + VMOVDQU 1632(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1664(CX), Y5 + VMOVDQU 1696(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1728(CX), Y5 + VMOVDQU 1760(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 - // Load and process 32 bytes from input 7 to 9 outputs - VMOVDQU (R12), Y12 + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R12), Y7 ADDQ $0x20, R12 - VPSRLQ $0x04, Y12, Y13 - VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VMOVDQU 4032(CX), Y10 - VMOVDQU 4064(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 - VMOVDQU 4096(CX), Y10 - VMOVDQU 4128(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 - VMOVDQU 4160(CX), Y10 - VMOVDQU 4192(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 - VMOVDQU 4224(CX), Y10 - VMOVDQU 4256(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 - VMOVDQU 4288(CX), Y10 - VMOVDQU 4320(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 - VMOVDQU 4352(CX), Y10 - VMOVDQU 4384(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 - VMOVDQU 4416(CX), Y10 - VMOVDQU 4448(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 - VMOVDQU 4480(CX), Y10 - VMOVDQU 4512(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 - VMOVDQU 4544(CX), Y10 - VMOVDQU 4576(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1792(CX), Y5 + VMOVDQU 1824(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1856(CX), Y5 + VMOVDQU 1888(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1920(CX), Y5 + VMOVDQU 1952(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1984(CX), Y5 + VMOVDQU 2016(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 - // Load and process 32 bytes from input 8 to 9 outputs - VMOVDQU (DX), Y12 + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (R13), Y7 + ADDQ $0x20, R13 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 2048(CX), Y5 + VMOVDQU 2080(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 2112(CX), Y5 + VMOVDQU 2144(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 2176(CX), Y5 + VMOVDQU 2208(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 2240(CX), Y5 + VMOVDQU 2272(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 9 to 4 outputs + VMOVDQU (DX), Y7 ADDQ $0x20, DX - VPSRLQ $0x04, Y12, Y13 - VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VMOVDQU 4608(CX), Y10 - VMOVDQU 4640(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 - VMOVDQU 4672(CX), Y10 - VMOVDQU 4704(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 - VMOVDQU 4736(CX), Y10 - VMOVDQU 4768(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 - VMOVDQU 4800(CX), Y10 - VMOVDQU 4832(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 - VMOVDQU 4864(CX), Y10 - VMOVDQU 4896(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 - VMOVDQU 4928(CX), Y10 - VMOVDQU 4960(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 - VMOVDQU 4992(CX), Y10 - VMOVDQU 5024(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 - VMOVDQU 5056(CX), Y10 - VMOVDQU 5088(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 - VMOVDQU 5120(CX), Y10 - VMOVDQU 5152(CX), Y11 - VPSHUFB Y12, Y10, Y10 - VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 2304(CX), Y5 + VMOVDQU 2336(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 2368(CX), Y5 + VMOVDQU 2400(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 2432(CX), Y5 + VMOVDQU 2464(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 2496(CX), Y5 + VMOVDQU 2528(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 - // Store 9 outputs - MOVQ (R13), R15 - VMOVDQU Y0, (R15)(R14*1) - MOVQ 24(R13), R15 - VMOVDQU Y1, (R15)(R14*1) - MOVQ 48(R13), R15 - VMOVDQU Y2, (R15)(R14*1) - MOVQ 72(R13), R15 - VMOVDQU Y3, (R15)(R14*1) - MOVQ 96(R13), R15 - VMOVDQU Y4, (R15)(R14*1) - MOVQ 120(R13), R15 - VMOVDQU Y5, (R15)(R14*1) - MOVQ 144(R13), R15 - VMOVDQU Y6, (R15)(R14*1) - MOVQ 168(R13), R15 - VMOVDQU Y7, (R15)(R14*1) - MOVQ 192(R13), R15 - VMOVDQU Y8, (R15)(R14*1) + // Store 4 outputs + MOVQ (R14), BP + VMOVDQU Y0, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y1, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y2, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y3, (BP)(R15*1) // Prepare for next loop - ADDQ $0x20, R14 + ADDQ $0x20, R15 DECQ AX - JNZ mulAvxTwo_9x9_loop + JNZ mulAvxTwo_10x4_loop VZEROUPPER -mulAvxTwo_9x9_end: +mulAvxTwo_10x4_end: RET -// func mulAvxTwo_9x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_10x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_9x10(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_10x4Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack - // Full registers estimated 195 YMM used + // Full registers estimated 89 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_9x10_end + JZ mulAvxTwo_10x4Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI @@ -29234,682 +62662,375 @@ TEXT ·mulAvxTwo_9x10(SB), NOSPLIT, $0-88 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 - MOVQ 192(DX), DX - MOVQ out_base+48(FP), R13 - MOVQ start+72(FP), R14 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 // Add start offset to input - ADDQ R14, BX - ADDQ R14, SI - ADDQ R14, DI - ADDQ R14, R8 - ADDQ R14, R9 - ADDQ R14, R10 - ADDQ R14, R11 - ADDQ R14, R12 - ADDQ R14, DX - MOVQ $0x0000000f, R15 - MOVQ R15, X10 - VPBROADCASTB X10, Y10 - -mulAvxTwo_9x10_loop: - // Clear 10 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 - VPXOR Y9, Y9, Y9 + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + MOVQ $0x0000000f, BP + MOVQ BP, X4 + VPBROADCASTB X4, Y4 - // Load and process 32 bytes from input 0 to 10 outputs - VMOVDQU (BX), Y13 +mulAvxTwo_10x4Xor_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y7 ADDQ $0x20, BX - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VMOVDQU (CX), Y11 - VMOVDQU 32(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQU 64(CX), Y11 - VMOVDQU 96(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 - VMOVDQU 128(CX), Y11 - VMOVDQU 160(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 - VMOVDQU 192(CX), Y11 - VMOVDQU 224(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 - VMOVDQU 256(CX), Y11 - VMOVDQU 288(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 - VMOVDQU 320(CX), Y11 - VMOVDQU 352(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 - VMOVDQU 384(CX), Y11 - VMOVDQU 416(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 - VMOVDQU 448(CX), Y11 - VMOVDQU 480(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 - VMOVDQU 512(CX), Y11 - VMOVDQU 544(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 - VMOVDQU 576(CX), Y11 - VMOVDQU 608(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y0 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y2 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y3 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 - // Load and process 32 bytes from input 1 to 10 outputs - VMOVDQU (SI), Y13 + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y7 ADDQ $0x20, SI - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VMOVDQU 640(CX), Y11 - VMOVDQU 672(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQU 704(CX), Y11 - VMOVDQU 736(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 - VMOVDQU 768(CX), Y11 - VMOVDQU 800(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 - VMOVDQU 832(CX), Y11 - VMOVDQU 864(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 - VMOVDQU 896(CX), Y11 - VMOVDQU 928(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 - VMOVDQU 960(CX), Y11 - VMOVDQU 992(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 - VMOVDQU 1024(CX), Y11 - VMOVDQU 1056(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 - VMOVDQU 1088(CX), Y11 - VMOVDQU 1120(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 - VMOVDQU 1152(CX), Y11 - VMOVDQU 1184(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 - VMOVDQU 1216(CX), Y11 - VMOVDQU 1248(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 - // Load and process 32 bytes from input 2 to 10 outputs - VMOVDQU (DI), Y13 + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y7 ADDQ $0x20, DI - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VMOVDQU 1280(CX), Y11 - VMOVDQU 1312(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQU 1344(CX), Y11 - VMOVDQU 1376(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 - VMOVDQU 1408(CX), Y11 - VMOVDQU 1440(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 - VMOVDQU 1472(CX), Y11 - VMOVDQU 1504(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 - VMOVDQU 1536(CX), Y11 - VMOVDQU 1568(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 - VMOVDQU 1600(CX), Y11 - VMOVDQU 1632(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 - VMOVDQU 1664(CX), Y11 - VMOVDQU 1696(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 - VMOVDQU 1728(CX), Y11 - VMOVDQU 1760(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 - VMOVDQU 1792(CX), Y11 - VMOVDQU 1824(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 - VMOVDQU 1856(CX), Y11 - VMOVDQU 1888(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 - // Load and process 32 bytes from input 3 to 10 outputs - VMOVDQU (R8), Y13 + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y7 ADDQ $0x20, R8 - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VMOVDQU 1920(CX), Y11 - VMOVDQU 1952(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQU 1984(CX), Y11 - VMOVDQU 2016(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 - VMOVDQU 2048(CX), Y11 - VMOVDQU 2080(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 - VMOVDQU 2112(CX), Y11 - VMOVDQU 2144(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 - VMOVDQU 2176(CX), Y11 - VMOVDQU 2208(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 - VMOVDQU 2240(CX), Y11 - VMOVDQU 2272(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 - VMOVDQU 2304(CX), Y11 - VMOVDQU 2336(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 - VMOVDQU 2368(CX), Y11 - VMOVDQU 2400(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 - VMOVDQU 2432(CX), Y11 - VMOVDQU 2464(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 - VMOVDQU 2496(CX), Y11 - VMOVDQU 2528(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 - // Load and process 32 bytes from input 4 to 10 outputs - VMOVDQU (R9), Y13 + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y7 ADDQ $0x20, R9 - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VMOVDQU 2560(CX), Y11 - VMOVDQU 2592(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQU 2624(CX), Y11 - VMOVDQU 2656(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 - VMOVDQU 2688(CX), Y11 - VMOVDQU 2720(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 - VMOVDQU 2752(CX), Y11 - VMOVDQU 2784(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 - VMOVDQU 2816(CX), Y11 - VMOVDQU 2848(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 - VMOVDQU 2880(CX), Y11 - VMOVDQU 2912(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 - VMOVDQU 2944(CX), Y11 - VMOVDQU 2976(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 - VMOVDQU 3008(CX), Y11 - VMOVDQU 3040(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 - VMOVDQU 3072(CX), Y11 - VMOVDQU 3104(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 - VMOVDQU 3136(CX), Y11 - VMOVDQU 3168(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 - - // Load and process 32 bytes from input 5 to 10 outputs - VMOVDQU (R10), Y13 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VMOVDQU 3200(CX), Y11 - VMOVDQU 3232(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQU 3264(CX), Y11 - VMOVDQU 3296(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 - VMOVDQU 3328(CX), Y11 - VMOVDQU 3360(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 - VMOVDQU 3392(CX), Y11 - VMOVDQU 3424(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 - VMOVDQU 3456(CX), Y11 - VMOVDQU 3488(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 - VMOVDQU 3520(CX), Y11 - VMOVDQU 3552(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 - VMOVDQU 3584(CX), Y11 - VMOVDQU 3616(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 - VMOVDQU 3648(CX), Y11 - VMOVDQU 3680(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 - VMOVDQU 3712(CX), Y11 - VMOVDQU 3744(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 - VMOVDQU 3776(CX), Y11 - VMOVDQU 3808(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 - - // Load and process 32 bytes from input 6 to 10 outputs - VMOVDQU (R11), Y13 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VMOVDQU 3840(CX), Y11 - VMOVDQU 3872(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQU 3904(CX), Y11 - VMOVDQU 3936(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 - VMOVDQU 3968(CX), Y11 - VMOVDQU 4000(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 - VMOVDQU 4032(CX), Y11 - VMOVDQU 4064(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 - VMOVDQU 4096(CX), Y11 - VMOVDQU 4128(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 - VMOVDQU 4160(CX), Y11 - VMOVDQU 4192(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 - VMOVDQU 4224(CX), Y11 - VMOVDQU 4256(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 - VMOVDQU 4288(CX), Y11 - VMOVDQU 4320(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 - VMOVDQU 4352(CX), Y11 - VMOVDQU 4384(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 - VMOVDQU 4416(CX), Y11 - VMOVDQU 4448(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 - // Load and process 32 bytes from input 7 to 10 outputs - VMOVDQU (R12), Y13 + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y7 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1280(CX), Y5 + VMOVDQU 1312(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1344(CX), Y5 + VMOVDQU 1376(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1408(CX), Y5 + VMOVDQU 1440(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1472(CX), Y5 + VMOVDQU 1504(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y7 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1536(CX), Y5 + VMOVDQU 1568(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1600(CX), Y5 + VMOVDQU 1632(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1664(CX), Y5 + VMOVDQU 1696(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1728(CX), Y5 + VMOVDQU 1760(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R12), Y7 ADDQ $0x20, R12 - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VMOVDQU 4480(CX), Y11 - VMOVDQU 4512(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQU 4544(CX), Y11 - VMOVDQU 4576(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 - VMOVDQU 4608(CX), Y11 - VMOVDQU 4640(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 - VMOVDQU 4672(CX), Y11 - VMOVDQU 4704(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 - VMOVDQU 4736(CX), Y11 - VMOVDQU 4768(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 - VMOVDQU 4800(CX), Y11 - VMOVDQU 4832(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 - VMOVDQU 4864(CX), Y11 - VMOVDQU 4896(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 - VMOVDQU 4928(CX), Y11 - VMOVDQU 4960(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 - VMOVDQU 4992(CX), Y11 - VMOVDQU 5024(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 - VMOVDQU 5056(CX), Y11 - VMOVDQU 5088(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1792(CX), Y5 + VMOVDQU 1824(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1856(CX), Y5 + VMOVDQU 1888(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1920(CX), Y5 + VMOVDQU 1952(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1984(CX), Y5 + VMOVDQU 2016(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 - // Load and process 32 bytes from input 8 to 10 outputs - VMOVDQU (DX), Y13 + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (R13), Y7 + ADDQ $0x20, R13 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 2048(CX), Y5 + VMOVDQU 2080(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 2112(CX), Y5 + VMOVDQU 2144(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 2176(CX), Y5 + VMOVDQU 2208(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 2240(CX), Y5 + VMOVDQU 2272(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 9 to 4 outputs + VMOVDQU (DX), Y7 ADDQ $0x20, DX - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VMOVDQU 5120(CX), Y11 - VMOVDQU 5152(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQU 5184(CX), Y11 - VMOVDQU 5216(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 - VMOVDQU 5248(CX), Y11 - VMOVDQU 5280(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 - VMOVDQU 5312(CX), Y11 - VMOVDQU 5344(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 - VMOVDQU 5376(CX), Y11 - VMOVDQU 5408(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 - VMOVDQU 5440(CX), Y11 - VMOVDQU 5472(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 - VMOVDQU 5504(CX), Y11 - VMOVDQU 5536(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 - VMOVDQU 5568(CX), Y11 - VMOVDQU 5600(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 - VMOVDQU 5632(CX), Y11 - VMOVDQU 5664(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 - VMOVDQU 5696(CX), Y11 - VMOVDQU 5728(CX), Y12 - VPSHUFB Y13, Y11, Y11 - VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 2304(CX), Y5 + VMOVDQU 2336(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 2368(CX), Y5 + VMOVDQU 2400(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 2432(CX), Y5 + VMOVDQU 2464(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 2496(CX), Y5 + VMOVDQU 2528(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 - // Store 10 outputs - MOVQ (R13), R15 - VMOVDQU Y0, (R15)(R14*1) - MOVQ 24(R13), R15 - VMOVDQU Y1, (R15)(R14*1) - MOVQ 48(R13), R15 - VMOVDQU Y2, (R15)(R14*1) - MOVQ 72(R13), R15 - VMOVDQU Y3, (R15)(R14*1) - MOVQ 96(R13), R15 - VMOVDQU Y4, (R15)(R14*1) - MOVQ 120(R13), R15 - VMOVDQU Y5, (R15)(R14*1) - MOVQ 144(R13), R15 - VMOVDQU Y6, (R15)(R14*1) - MOVQ 168(R13), R15 - VMOVDQU Y7, (R15)(R14*1) - MOVQ 192(R13), R15 - VMOVDQU Y8, (R15)(R14*1) - MOVQ 216(R13), R15 - VMOVDQU Y9, (R15)(R14*1) + // Store 4 outputs + MOVQ (R14), BP + VMOVDQU Y0, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y1, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y2, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y3, (BP)(R15*1) // Prepare for next loop - ADDQ $0x20, R14 + ADDQ $0x20, R15 DECQ AX - JNZ mulAvxTwo_9x10_loop + JNZ mulAvxTwo_10x4Xor_loop VZEROUPPER -mulAvxTwo_9x10_end: +mulAvxTwo_10x4Xor_end: RET -// func mulAvxTwo_10x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_10x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_10x1(SB), NOSPLIT, $0-88 +TEXT ·mulAvxTwo_10x5(SB), NOSPLIT, $8-88 // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 24 YMM used + // Destination kept on stack + // Full registers estimated 110 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_10x1_end + JZ mulAvxTwo_10x5_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI @@ -29922,12 +63043,8 @@ TEXT ·mulAvxTwo_10x1(SB), NOSPLIT, $0-88 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 - MOVQ (R14), R14 MOVQ start+72(FP), R15 - // Add start offset to output - ADDQ R15, R14 - // Add start offset to input ADDQ R15, BX ADDQ R15, SI @@ -29939,439 +63056,408 @@ TEXT ·mulAvxTwo_10x1(SB), NOSPLIT, $0-88 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX - MOVQ $0x0000000f, R15 - MOVQ R15, X1 - VPBROADCASTB X1, Y1 - -mulAvxTwo_10x1_loop: - // Clear 1 outputs - VPXOR Y0, Y0, Y0 + MOVQ $0x0000000f, BP + MOVQ BP, X5 + VPBROADCASTB X5, Y5 - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (BX), Y4 +mulAvxTwo_10x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y8 ADDQ $0x20, BX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU (CX), Y2 - VMOVDQU 32(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y0 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y1 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y2 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y3 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y4 - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (SI), Y4 + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y8 ADDQ $0x20, SI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (DI), Y4 + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y8 ADDQ $0x20, DI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 128(CX), Y2 - VMOVDQU 160(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (R8), Y4 + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y8 ADDQ $0x20, R8 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 192(CX), Y2 - VMOVDQU 224(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (R9), Y4 + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y8 ADDQ $0x20, R9 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 256(CX), Y2 - VMOVDQU 288(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1280(CX), Y6 + VMOVDQU 1312(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1344(CX), Y6 + VMOVDQU 1376(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1408(CX), Y6 + VMOVDQU 1440(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1472(CX), Y6 + VMOVDQU 1504(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1536(CX), Y6 + VMOVDQU 1568(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Load and process 32 bytes from input 5 to 1 outputs - VMOVDQU (R10), Y4 + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y8 ADDQ $0x20, R10 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 320(CX), Y2 - VMOVDQU 352(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1600(CX), Y6 + VMOVDQU 1632(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1664(CX), Y6 + VMOVDQU 1696(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1728(CX), Y6 + VMOVDQU 1760(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1792(CX), Y6 + VMOVDQU 1824(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1856(CX), Y6 + VMOVDQU 1888(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Load and process 32 bytes from input 6 to 1 outputs - VMOVDQU (R11), Y4 + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y8 ADDQ $0x20, R11 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 384(CX), Y2 - VMOVDQU 416(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1920(CX), Y6 + VMOVDQU 1952(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1984(CX), Y6 + VMOVDQU 2016(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 2048(CX), Y6 + VMOVDQU 2080(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 2112(CX), Y6 + VMOVDQU 2144(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 2176(CX), Y6 + VMOVDQU 2208(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Load and process 32 bytes from input 7 to 1 outputs - VMOVDQU (R12), Y4 + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y8 ADDQ $0x20, R12 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 448(CX), Y2 - VMOVDQU 480(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 2240(CX), Y6 + VMOVDQU 2272(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 2304(CX), Y6 + VMOVDQU 2336(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 2368(CX), Y6 + VMOVDQU 2400(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 2432(CX), Y6 + VMOVDQU 2464(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 2496(CX), Y6 + VMOVDQU 2528(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Load and process 32 bytes from input 8 to 1 outputs - VMOVDQU (R13), Y4 + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (R13), Y8 ADDQ $0x20, R13 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 512(CX), Y2 - VMOVDQU 544(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 2560(CX), Y6 + VMOVDQU 2592(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 2624(CX), Y6 + VMOVDQU 2656(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 2688(CX), Y6 + VMOVDQU 2720(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 2752(CX), Y6 + VMOVDQU 2784(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 2816(CX), Y6 + VMOVDQU 2848(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Load and process 32 bytes from input 9 to 1 outputs - VMOVDQU (DX), Y4 + // Load and process 32 bytes from input 9 to 5 outputs + VMOVDQU (DX), Y8 ADDQ $0x20, DX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 576(CX), Y2 - VMOVDQU 608(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 - - // Store 1 outputs - VMOVDQU Y0, (R14) - ADDQ $0x20, R14 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_10x1_loop - VZEROUPPER - -mulAvxTwo_10x1_end: - RET - -// func mulAvxTwo_10x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_10x1_64(SB), $8-88 - // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 24 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x06, AX - TESTQ AX, AX - JZ mulAvxTwo_10x1_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), R9 - MOVQ 144(AX), R10 - MOVQ 168(AX), R11 - MOVQ 192(AX), R12 - MOVQ 216(AX), AX - MOVQ out_base+48(FP), R13 - MOVQ out_base+48(FP), R13 - MOVQ start+72(FP), R14 - - // Add start offset to input - ADDQ R14, DX - ADDQ R14, BX - ADDQ R14, SI - ADDQ R14, DI - ADDQ R14, R8 - ADDQ R14, R9 - ADDQ R14, R10 - ADDQ R14, R11 - ADDQ R14, R12 - ADDQ R14, AX - MOVQ $0x0000000f, R15 - MOVQ R15, X2 - VPBROADCASTB X2, Y2 - MOVQ n+80(FP), R15 - SHRQ $0x06, R15 - -mulAvxTwo_10x1_64_loop: - // Clear 1 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - - // Load and process 64 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y6 - VMOVDQU 32(DX), Y5 - ADDQ $0x40, DX - VPSRLQ $0x04, Y6, Y7 - VPSRLQ $0x04, Y5, Y8 - VPAND Y2, Y6, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y7, Y7 - VPAND Y2, Y8, Y8 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y5 - VPSHUFB Y6, Y3, Y3 - VPSHUFB Y8, Y4, Y6 - VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 - - // Load and process 64 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y6 - VMOVDQU 32(BX), Y5 - ADDQ $0x40, BX - VPSRLQ $0x04, Y6, Y7 - VPSRLQ $0x04, Y5, Y8 - VPAND Y2, Y6, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y7, Y7 - VPAND Y2, Y8, Y8 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y5 - VPSHUFB Y6, Y3, Y3 - VPSHUFB Y8, Y4, Y6 - VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 - - // Load and process 64 bytes from input 2 to 1 outputs - VMOVDQU (SI), Y6 - VMOVDQU 32(SI), Y5 - ADDQ $0x40, SI - VPSRLQ $0x04, Y6, Y7 - VPSRLQ $0x04, Y5, Y8 - VPAND Y2, Y6, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y7, Y7 - VPAND Y2, Y8, Y8 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y5 - VPSHUFB Y6, Y3, Y3 - VPSHUFB Y8, Y4, Y6 - VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 - - // Load and process 64 bytes from input 3 to 1 outputs - VMOVDQU (DI), Y6 - VMOVDQU 32(DI), Y5 - ADDQ $0x40, DI - VPSRLQ $0x04, Y6, Y7 - VPSRLQ $0x04, Y5, Y8 - VPAND Y2, Y6, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y7, Y7 - VPAND Y2, Y8, Y8 - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y5 - VPSHUFB Y6, Y3, Y3 - VPSHUFB Y8, Y4, Y6 - VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 - - // Load and process 64 bytes from input 4 to 1 outputs - VMOVDQU (R8), Y6 - VMOVDQU 32(R8), Y5 - ADDQ $0x40, R8 - VPSRLQ $0x04, Y6, Y7 - VPSRLQ $0x04, Y5, Y8 - VPAND Y2, Y6, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y7, Y7 - VPAND Y2, Y8, Y8 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y5 - VPSHUFB Y6, Y3, Y3 - VPSHUFB Y8, Y4, Y6 - VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 - - // Load and process 64 bytes from input 5 to 1 outputs - VMOVDQU (R9), Y6 - VMOVDQU 32(R9), Y5 - ADDQ $0x40, R9 - VPSRLQ $0x04, Y6, Y7 - VPSRLQ $0x04, Y5, Y8 - VPAND Y2, Y6, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y7, Y7 - VPAND Y2, Y8, Y8 - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y5 - VPSHUFB Y6, Y3, Y3 - VPSHUFB Y8, Y4, Y6 - VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 - - // Load and process 64 bytes from input 6 to 1 outputs - VMOVDQU (R10), Y6 - VMOVDQU 32(R10), Y5 - ADDQ $0x40, R10 - VPSRLQ $0x04, Y6, Y7 - VPSRLQ $0x04, Y5, Y8 - VPAND Y2, Y6, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y7, Y7 - VPAND Y2, Y8, Y8 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y5 - VPSHUFB Y6, Y3, Y3 - VPSHUFB Y8, Y4, Y6 - VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 - - // Load and process 64 bytes from input 7 to 1 outputs - VMOVDQU (R11), Y6 - VMOVDQU 32(R11), Y5 - ADDQ $0x40, R11 - VPSRLQ $0x04, Y6, Y7 - VPSRLQ $0x04, Y5, Y8 - VPAND Y2, Y6, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y7, Y7 - VPAND Y2, Y8, Y8 - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y5 - VPSHUFB Y6, Y3, Y3 - VPSHUFB Y8, Y4, Y6 - VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 - - // Load and process 64 bytes from input 8 to 1 outputs - VMOVDQU (R12), Y6 - VMOVDQU 32(R12), Y5 - ADDQ $0x40, R12 - VPSRLQ $0x04, Y6, Y7 - VPSRLQ $0x04, Y5, Y8 - VPAND Y2, Y6, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y7, Y7 - VPAND Y2, Y8, Y8 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y5 - VPSHUFB Y6, Y3, Y3 - VPSHUFB Y8, Y4, Y6 - VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 - - // Load and process 64 bytes from input 9 to 1 outputs - VMOVDQU (AX), Y6 - VMOVDQU 32(AX), Y5 - ADDQ $0x40, AX - VPSRLQ $0x04, Y6, Y7 - VPSRLQ $0x04, Y5, Y8 - VPAND Y2, Y6, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y7, Y7 - VPAND Y2, Y8, Y8 - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y5 - VPSHUFB Y6, Y3, Y3 - VPSHUFB Y8, Y4, Y6 - VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 2880(CX), Y6 + VMOVDQU 2912(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 2944(CX), Y6 + VMOVDQU 2976(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 3008(CX), Y6 + VMOVDQU 3040(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 3072(CX), Y6 + VMOVDQU 3104(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 3136(CX), Y6 + VMOVDQU 3168(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Store 1 outputs - MOVQ (R13), BP - VMOVDQU Y0, (BP)(R14*1) - VMOVDQU Y1, 32(BP)(R14*1) + // Store 5 outputs + MOVQ (R14), BP + VMOVDQU Y0, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y1, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y2, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y3, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y4, (BP)(R15*1) // Prepare for next loop - ADDQ $0x40, R14 - DECQ R15 - JNZ mulAvxTwo_10x1_64_loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxTwo_10x5_loop VZEROUPPER -mulAvxTwo_10x1_64_end: +mulAvxTwo_10x5_end: RET -// func mulAvxTwo_10x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_10x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_10x2(SB), NOSPLIT, $8-88 +TEXT ·mulAvxTwo_10x5Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 47 YMM used + // Destination kept on stack + // Full registers estimated 110 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_10x2_end + JZ mulAvxTwo_10x5Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI @@ -30384,1777 +63470,1397 @@ TEXT ·mulAvxTwo_10x2(SB), NOSPLIT, $8-88 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 - MOVQ (R14), R15 - MOVQ 24(R14), R14 - MOVQ start+72(FP), BP - - // Add start offset to output - ADDQ BP, R15 - ADDQ BP, R14 + MOVQ start+72(FP), R15 // Add start offset to input - ADDQ BP, BX - ADDQ BP, SI - ADDQ BP, DI - ADDQ BP, R8 - ADDQ BP, R9 - ADDQ BP, R10 - ADDQ BP, R11 - ADDQ BP, R12 - ADDQ BP, R13 - ADDQ BP, DX + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX MOVQ $0x0000000f, BP - MOVQ BP, X2 - VPBROADCASTB X2, Y2 - -mulAvxTwo_10x2_loop: - // Clear 2 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 + MOVQ BP, X5 + VPBROADCASTB X5, Y5 - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 +mulAvxTwo_10x5Xor_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y8 ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y0 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y1 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y2 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y3 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y4 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y8 ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y8 ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y8 ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Load and process 32 bytes from input 4 to 2 outputs - VMOVDQU (R9), Y5 + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y8 ADDQ $0x20, R9 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1280(CX), Y6 + VMOVDQU 1312(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1344(CX), Y6 + VMOVDQU 1376(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1408(CX), Y6 + VMOVDQU 1440(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1472(CX), Y6 + VMOVDQU 1504(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1536(CX), Y6 + VMOVDQU 1568(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Load and process 32 bytes from input 5 to 2 outputs - VMOVDQU (R10), Y5 + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y8 ADDQ $0x20, R10 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 640(CX), Y3 - VMOVDQU 672(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 - VMOVDQU 704(CX), Y3 - VMOVDQU 736(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1600(CX), Y6 + VMOVDQU 1632(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1664(CX), Y6 + VMOVDQU 1696(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1728(CX), Y6 + VMOVDQU 1760(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1792(CX), Y6 + VMOVDQU 1824(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1856(CX), Y6 + VMOVDQU 1888(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Load and process 32 bytes from input 6 to 2 outputs - VMOVDQU (R11), Y5 + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y8 ADDQ $0x20, R11 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 768(CX), Y3 - VMOVDQU 800(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 - VMOVDQU 832(CX), Y3 - VMOVDQU 864(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1920(CX), Y6 + VMOVDQU 1952(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1984(CX), Y6 + VMOVDQU 2016(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 2048(CX), Y6 + VMOVDQU 2080(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 2112(CX), Y6 + VMOVDQU 2144(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 2176(CX), Y6 + VMOVDQU 2208(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Load and process 32 bytes from input 7 to 2 outputs - VMOVDQU (R12), Y5 + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y8 ADDQ $0x20, R12 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 896(CX), Y3 - VMOVDQU 928(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 - VMOVDQU 960(CX), Y3 - VMOVDQU 992(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 2240(CX), Y6 + VMOVDQU 2272(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 2304(CX), Y6 + VMOVDQU 2336(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 2368(CX), Y6 + VMOVDQU 2400(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 2432(CX), Y6 + VMOVDQU 2464(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 2496(CX), Y6 + VMOVDQU 2528(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Load and process 32 bytes from input 8 to 2 outputs - VMOVDQU (R13), Y5 + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (R13), Y8 ADDQ $0x20, R13 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 1024(CX), Y3 - VMOVDQU 1056(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 - VMOVDQU 1088(CX), Y3 - VMOVDQU 1120(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 2560(CX), Y6 + VMOVDQU 2592(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 2624(CX), Y6 + VMOVDQU 2656(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 2688(CX), Y6 + VMOVDQU 2720(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 2752(CX), Y6 + VMOVDQU 2784(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 2816(CX), Y6 + VMOVDQU 2848(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Load and process 32 bytes from input 9 to 2 outputs - VMOVDQU (DX), Y5 + // Load and process 32 bytes from input 9 to 5 outputs + VMOVDQU (DX), Y8 ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 1152(CX), Y3 - VMOVDQU 1184(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 - VMOVDQU 1216(CX), Y3 - VMOVDQU 1248(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 2880(CX), Y6 + VMOVDQU 2912(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 2944(CX), Y6 + VMOVDQU 2976(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 3008(CX), Y6 + VMOVDQU 3040(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 3072(CX), Y6 + VMOVDQU 3104(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 3136(CX), Y6 + VMOVDQU 3168(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 - // Store 2 outputs - VMOVDQU Y0, (R15) - ADDQ $0x20, R15 - VMOVDQU Y1, (R14) - ADDQ $0x20, R14 + // Store 5 outputs + MOVQ (R14), BP + VMOVDQU Y0, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y1, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y2, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y3, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y4, (BP)(R15*1) // Prepare for next loop + ADDQ $0x20, R15 DECQ AX - JNZ mulAvxTwo_10x2_loop + JNZ mulAvxTwo_10x5Xor_loop VZEROUPPER -mulAvxTwo_10x2_end: +mulAvxTwo_10x5Xor_end: RET -// func mulAvxTwo_10x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_10x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_10x2_64(SB), $8-88 +TEXT ·mulAvxTwo_10x6(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack - // Full registers estimated 47 YMM used + // Full registers estimated 131 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX - SHRQ $0x06, AX + SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_10x2_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), R9 - MOVQ 144(AX), R10 - MOVQ 168(AX), R11 - MOVQ 192(AX), R12 - MOVQ 216(AX), AX - MOVQ out_base+48(FP), R13 - MOVQ out_base+48(FP), R13 - MOVQ start+72(FP), R14 + JZ mulAvxTwo_10x6_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 // Add start offset to input - ADDQ R14, DX - ADDQ R14, BX - ADDQ R14, SI - ADDQ R14, DI - ADDQ R14, R8 - ADDQ R14, R9 - ADDQ R14, R10 - ADDQ R14, R11 - ADDQ R14, R12 - ADDQ R14, AX - MOVQ $0x0000000f, R15 - MOVQ R15, X4 - VPBROADCASTB X4, Y4 - MOVQ n+80(FP), R15 - SHRQ $0x06, R15 - -mulAvxTwo_10x2_64_loop: - // Clear 2 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - - // Load and process 64 bytes from input 0 to 2 outputs - VMOVDQU (DX), Y9 - VMOVDQU 32(DX), Y11 - ADDQ $0x40, DX - VPSRLQ $0x04, Y9, Y10 - VPSRLQ $0x04, Y11, Y12 - VPAND Y4, Y9, Y9 - VPAND Y4, Y11, Y11 - VPAND Y4, Y10, Y10 - VPAND Y4, Y12, Y12 - VMOVDQU (CX), Y5 - VMOVDQU 32(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 - VMOVDQU 64(CX), Y5 - VMOVDQU 96(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + MOVQ $0x0000000f, BP + MOVQ BP, X6 + VPBROADCASTB X6, Y6 - // Load and process 64 bytes from input 1 to 2 outputs +mulAvxTwo_10x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y9 - VMOVDQU 32(BX), Y11 - ADDQ $0x40, BX + ADDQ $0x20, BX VPSRLQ $0x04, Y9, Y10 - VPSRLQ $0x04, Y11, Y12 - VPAND Y4, Y9, Y9 - VPAND Y4, Y11, Y11 - VPAND Y4, Y10, Y10 - VPAND Y4, Y12, Y12 - VMOVDQU 128(CX), Y5 - VMOVDQU 160(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 - VMOVDQU 192(CX), Y5 - VMOVDQU 224(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y0 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y1 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y2 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y3 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y4 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y5 - // Load and process 64 bytes from input 2 to 2 outputs + // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 - VMOVDQU 32(SI), Y11 - ADDQ $0x40, SI + ADDQ $0x20, SI VPSRLQ $0x04, Y9, Y10 - VPSRLQ $0x04, Y11, Y12 - VPAND Y4, Y9, Y9 - VPAND Y4, Y11, Y11 - VPAND Y4, Y10, Y10 - VPAND Y4, Y12, Y12 - VMOVDQU 256(CX), Y5 - VMOVDQU 288(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 - VMOVDQU 320(CX), Y5 - VMOVDQU 352(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 - // Load and process 64 bytes from input 3 to 2 outputs + // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 - VMOVDQU 32(DI), Y11 - ADDQ $0x40, DI + ADDQ $0x20, DI VPSRLQ $0x04, Y9, Y10 - VPSRLQ $0x04, Y11, Y12 - VPAND Y4, Y9, Y9 - VPAND Y4, Y11, Y11 - VPAND Y4, Y10, Y10 - VPAND Y4, Y12, Y12 - VMOVDQU 384(CX), Y5 - VMOVDQU 416(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 - VMOVDQU 448(CX), Y5 - VMOVDQU 480(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 - // Load and process 64 bytes from input 4 to 2 outputs + // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 - VMOVDQU 32(R8), Y11 - ADDQ $0x40, R8 + ADDQ $0x20, R8 VPSRLQ $0x04, Y9, Y10 - VPSRLQ $0x04, Y11, Y12 - VPAND Y4, Y9, Y9 - VPAND Y4, Y11, Y11 - VPAND Y4, Y10, Y10 - VPAND Y4, Y12, Y12 - VMOVDQU 512(CX), Y5 - VMOVDQU 544(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 - VMOVDQU 576(CX), Y5 - VMOVDQU 608(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 - // Load and process 64 bytes from input 5 to 2 outputs + // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y9 - VMOVDQU 32(R9), Y11 - ADDQ $0x40, R9 + ADDQ $0x20, R9 VPSRLQ $0x04, Y9, Y10 - VPSRLQ $0x04, Y11, Y12 - VPAND Y4, Y9, Y9 - VPAND Y4, Y11, Y11 - VPAND Y4, Y10, Y10 - VPAND Y4, Y12, Y12 - VMOVDQU 640(CX), Y5 - VMOVDQU 672(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 - VMOVDQU 704(CX), Y5 - VMOVDQU 736(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1728(CX), Y7 + VMOVDQU 1760(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 + VMOVDQU 1792(CX), Y7 + VMOVDQU 1824(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1856(CX), Y7 + VMOVDQU 1888(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 - // Load and process 64 bytes from input 6 to 2 outputs + // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R10), Y9 - VMOVDQU 32(R10), Y11 - ADDQ $0x40, R10 + ADDQ $0x20, R10 VPSRLQ $0x04, Y9, Y10 - VPSRLQ $0x04, Y11, Y12 - VPAND Y4, Y9, Y9 - VPAND Y4, Y11, Y11 - VPAND Y4, Y10, Y10 - VPAND Y4, Y12, Y12 - VMOVDQU 768(CX), Y5 - VMOVDQU 800(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1920(CX), Y7 + VMOVDQU 1952(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1984(CX), Y7 + VMOVDQU 2016(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 - VMOVDQU 832(CX), Y5 - VMOVDQU 864(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 + VMOVDQU 2048(CX), Y7 + VMOVDQU 2080(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2112(CX), Y7 + VMOVDQU 2144(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 + VMOVDQU 2176(CX), Y7 + VMOVDQU 2208(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 2240(CX), Y7 + VMOVDQU 2272(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 - // Load and process 64 bytes from input 7 to 2 outputs + // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (R11), Y9 - VMOVDQU 32(R11), Y11 - ADDQ $0x40, R11 + ADDQ $0x20, R11 VPSRLQ $0x04, Y9, Y10 - VPSRLQ $0x04, Y11, Y12 - VPAND Y4, Y9, Y9 - VPAND Y4, Y11, Y11 - VPAND Y4, Y10, Y10 - VPAND Y4, Y12, Y12 - VMOVDQU 896(CX), Y5 - VMOVDQU 928(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2304(CX), Y7 + VMOVDQU 2336(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 2368(CX), Y7 + VMOVDQU 2400(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 - VMOVDQU 960(CX), Y5 - VMOVDQU 992(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 + VMOVDQU 2432(CX), Y7 + VMOVDQU 2464(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2496(CX), Y7 + VMOVDQU 2528(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 - - // Load and process 64 bytes from input 8 to 2 outputs - VMOVDQU (R12), Y9 - VMOVDQU 32(R12), Y11 - ADDQ $0x40, R12 - VPSRLQ $0x04, Y9, Y10 - VPSRLQ $0x04, Y11, Y12 - VPAND Y4, Y9, Y9 - VPAND Y4, Y11, Y11 - VPAND Y4, Y10, Y10 - VPAND Y4, Y12, Y12 - VMOVDQU 1024(CX), Y5 - VMOVDQU 1056(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 + VMOVDQU 2560(CX), Y7 + VMOVDQU 2592(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 - VMOVDQU 1088(CX), Y5 - VMOVDQU 1120(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 + VPXOR Y7, Y4, Y4 + VMOVDQU 2624(CX), Y7 + VMOVDQU 2656(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 + VPXOR Y7, Y5, Y5 - // Load and process 64 bytes from input 9 to 2 outputs - VMOVDQU (AX), Y9 - VMOVDQU 32(AX), Y11 - ADDQ $0x40, AX + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y9 + ADDQ $0x20, R12 VPSRLQ $0x04, Y9, Y10 - VPSRLQ $0x04, Y11, Y12 - VPAND Y4, Y9, Y9 - VPAND Y4, Y11, Y11 - VPAND Y4, Y10, Y10 - VPAND Y4, Y12, Y12 - VMOVDQU 1152(CX), Y5 - VMOVDQU 1184(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2688(CX), Y7 + VMOVDQU 2720(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 2752(CX), Y7 + VMOVDQU 2784(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 - VMOVDQU 1216(CX), Y5 - VMOVDQU 1248(CX), Y6 - VPSHUFB Y11, Y5, Y7 - VPSHUFB Y9, Y5, Y5 - VPSHUFB Y12, Y6, Y8 - VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 + VMOVDQU 2816(CX), Y7 + VMOVDQU 2848(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2880(CX), Y7 + VMOVDQU 2912(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 + VMOVDQU 2944(CX), Y7 + VMOVDQU 2976(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 3008(CX), Y7 + VMOVDQU 3040(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 - // Store 2 outputs - MOVQ (R13), BP - VMOVDQU Y0, (BP)(R14*1) - VMOVDQU Y1, 32(BP)(R14*1) - MOVQ 24(R13), BP - VMOVDQU Y2, (BP)(R14*1) - VMOVDQU Y3, 32(BP)(R14*1) - - // Prepare for next loop - ADDQ $0x40, R14 - DECQ R15 - JNZ mulAvxTwo_10x2_64_loop - VZEROUPPER - -mulAvxTwo_10x2_64_end: - RET - -// func mulAvxTwo_10x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_10x3(SB), NOSPLIT, $8-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 68 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_10x3_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), R9 - MOVQ 144(AX), R10 - MOVQ 168(AX), R11 - MOVQ 192(AX), R12 - MOVQ 216(AX), AX - MOVQ out_base+48(FP), R13 - MOVQ (R13), R14 - MOVQ 24(R13), R15 - MOVQ 48(R13), R13 - MOVQ start+72(FP), BP - - // Add start offset to output - ADDQ BP, R14 - ADDQ BP, R15 - ADDQ BP, R13 - - // Add start offset to input - ADDQ BP, DX - ADDQ BP, BX - ADDQ BP, SI - ADDQ BP, DI - ADDQ BP, R8 - ADDQ BP, R9 - ADDQ BP, R10 - ADDQ BP, R11 - ADDQ BP, R12 - ADDQ BP, AX - MOVQ $0x0000000f, BP - MOVQ BP, X3 - VPBROADCASTB X3, Y3 - MOVQ n+80(FP), BP - SHRQ $0x05, BP - -mulAvxTwo_10x3_loop: - // Clear 3 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 - - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 - - // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 - - // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 - - // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (R8), Y6 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 - - // Load and process 32 bytes from input 5 to 3 outputs - VMOVDQU (R9), Y6 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 960(CX), Y4 - VMOVDQU 992(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 - VMOVDQU 1024(CX), Y4 - VMOVDQU 1056(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 - VMOVDQU 1088(CX), Y4 - VMOVDQU 1120(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 - - // Load and process 32 bytes from input 6 to 3 outputs - VMOVDQU (R10), Y6 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1152(CX), Y4 - VMOVDQU 1184(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 - VMOVDQU 1216(CX), Y4 - VMOVDQU 1248(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 - VMOVDQU 1280(CX), Y4 - VMOVDQU 1312(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 - - // Load and process 32 bytes from input 7 to 3 outputs - VMOVDQU (R11), Y6 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1344(CX), Y4 - VMOVDQU 1376(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 - VMOVDQU 1408(CX), Y4 - VMOVDQU 1440(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 - VMOVDQU 1472(CX), Y4 - VMOVDQU 1504(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 - - // Load and process 32 bytes from input 8 to 3 outputs - VMOVDQU (R12), Y6 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1536(CX), Y4 - VMOVDQU 1568(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 - VMOVDQU 1600(CX), Y4 - VMOVDQU 1632(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 - VMOVDQU 1664(CX), Y4 - VMOVDQU 1696(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 - - // Load and process 32 bytes from input 9 to 3 outputs - VMOVDQU (AX), Y6 - ADDQ $0x20, AX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1728(CX), Y4 - VMOVDQU 1760(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 - VMOVDQU 1792(CX), Y4 - VMOVDQU 1824(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 - VMOVDQU 1856(CX), Y4 - VMOVDQU 1888(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 - - // Store 3 outputs - VMOVDQU Y0, (R14) - ADDQ $0x20, R14 - VMOVDQU Y1, (R15) - ADDQ $0x20, R15 - VMOVDQU Y2, (R13) + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (R13), Y9 ADDQ $0x20, R13 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 3072(CX), Y7 + VMOVDQU 3104(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 3136(CX), Y7 + VMOVDQU 3168(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 3200(CX), Y7 + VMOVDQU 3232(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 3264(CX), Y7 + VMOVDQU 3296(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 3328(CX), Y7 + VMOVDQU 3360(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 3392(CX), Y7 + VMOVDQU 3424(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 9 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 3456(CX), Y7 + VMOVDQU 3488(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 3520(CX), Y7 + VMOVDQU 3552(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 3584(CX), Y7 + VMOVDQU 3616(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 3648(CX), Y7 + VMOVDQU 3680(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 3712(CX), Y7 + VMOVDQU 3744(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 3776(CX), Y7 + VMOVDQU 3808(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Store 6 outputs + MOVQ (R14), BP + VMOVDQU Y0, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y1, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y2, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y3, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y4, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y5, (BP)(R15*1) // Prepare for next loop - DECQ BP - JNZ mulAvxTwo_10x3_loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxTwo_10x6_loop VZEROUPPER -mulAvxTwo_10x3_end: +mulAvxTwo_10x6_end: RET -// func mulAvxTwo_10x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_10x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_10x3_64(SB), $8-88 +TEXT ·mulAvxTwo_10x6Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack - // Full registers estimated 68 YMM used + // Full registers estimated 131 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX - SHRQ $0x06, AX + SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_10x3_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), R9 - MOVQ 144(AX), R10 - MOVQ 168(AX), R11 - MOVQ 192(AX), R12 - MOVQ 216(AX), AX - MOVQ out_base+48(FP), R13 - MOVQ out_base+48(FP), R13 - MOVQ start+72(FP), R14 + JZ mulAvxTwo_10x6Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 // Add start offset to input - ADDQ R14, DX - ADDQ R14, BX - ADDQ R14, SI - ADDQ R14, DI - ADDQ R14, R8 - ADDQ R14, R9 - ADDQ R14, R10 - ADDQ R14, R11 - ADDQ R14, R12 - ADDQ R14, AX - MOVQ $0x0000000f, R15 - MOVQ R15, X6 + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + MOVQ $0x0000000f, BP + MOVQ BP, X6 VPBROADCASTB X6, Y6 - MOVQ n+80(FP), R15 - SHRQ $0x06, R15 - -mulAvxTwo_10x3_64_loop: - // Clear 3 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - // Load and process 64 bytes from input 0 to 3 outputs - VMOVDQU (DX), Y11 - VMOVDQU 32(DX), Y13 - ADDQ $0x40, DX - VPSRLQ $0x04, Y11, Y12 - VPSRLQ $0x04, Y13, Y14 - VPAND Y6, Y11, Y11 - VPAND Y6, Y13, Y13 - VPAND Y6, Y12, Y12 - VPAND Y6, Y14, Y14 +mulAvxTwo_10x6Xor_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y0 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + VPXOR Y7, Y1, Y1 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 - - // Load and process 64 bytes from input 1 to 3 outputs - VMOVDQU (BX), Y11 - VMOVDQU 32(BX), Y13 - ADDQ $0x40, BX - VPSRLQ $0x04, Y11, Y12 - VPSRLQ $0x04, Y13, Y14 - VPAND Y6, Y11, Y11 - VPAND Y6, Y13, Y13 - VPAND Y6, Y12, Y12 - VPAND Y6, Y14, Y14 + VPXOR Y7, Y2, Y2 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + VPXOR Y7, Y3, Y3 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + VPXOR Y7, Y4, Y4 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 + VPXOR Y7, Y5, Y5 - // Load and process 64 bytes from input 2 to 3 outputs - VMOVDQU (SI), Y11 - VMOVDQU 32(SI), Y13 - ADDQ $0x40, SI - VPSRLQ $0x04, Y11, Y12 - VPSRLQ $0x04, Y13, Y14 - VPAND Y6, Y11, Y11 - VPAND Y6, Y13, Y13 - VPAND Y6, Y12, Y12 - VPAND Y6, Y14, Y14 + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + VPXOR Y7, Y1, Y1 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 - - // Load and process 64 bytes from input 3 to 3 outputs - VMOVDQU (DI), Y11 - VMOVDQU 32(DI), Y13 - ADDQ $0x40, DI - VPSRLQ $0x04, Y11, Y12 - VPSRLQ $0x04, Y13, Y14 - VPAND Y6, Y11, Y11 - VPAND Y6, Y13, Y13 - VPAND Y6, Y12, Y12 - VPAND Y6, Y14, Y14 + VPXOR Y7, Y2, Y2 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + VPXOR Y7, Y3, Y3 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + VPXOR Y7, Y4, Y4 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 + VPXOR Y7, Y5, Y5 - // Load and process 64 bytes from input 4 to 3 outputs - VMOVDQU (R8), Y11 - VMOVDQU 32(R8), Y13 - ADDQ $0x40, R8 - VPSRLQ $0x04, Y11, Y12 - VPSRLQ $0x04, Y13, Y14 - VPAND Y6, Y11, Y11 - VPAND Y6, Y13, Y13 - VPAND Y6, Y12, Y12 - VPAND Y6, Y14, Y14 + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 VMOVDQU 768(CX), Y7 VMOVDQU 800(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + VPXOR Y7, Y1, Y1 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 - - // Load and process 64 bytes from input 5 to 3 outputs - VMOVDQU (R9), Y11 - VMOVDQU 32(R9), Y13 - ADDQ $0x40, R9 - VPSRLQ $0x04, Y11, Y12 - VPSRLQ $0x04, Y13, Y14 - VPAND Y6, Y11, Y11 - VPAND Y6, Y13, Y13 - VPAND Y6, Y12, Y12 - VPAND Y6, Y14, Y14 + VPXOR Y7, Y2, Y2 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + VPXOR Y7, Y3, Y3 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + VPXOR Y7, Y4, Y4 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 + VPXOR Y7, Y5, Y5 - // Load and process 64 bytes from input 6 to 3 outputs - VMOVDQU (R10), Y11 - VMOVDQU 32(R10), Y13 - ADDQ $0x40, R10 - VPSRLQ $0x04, Y11, Y12 - VPSRLQ $0x04, Y13, Y14 - VPAND Y6, Y11, Y11 - VPAND Y6, Y13, Y13 - VPAND Y6, Y12, Y12 - VPAND Y6, Y14, Y14 + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 VMOVDQU 1152(CX), Y7 VMOVDQU 1184(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + VPXOR Y7, Y1, Y1 VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 - - // Load and process 64 bytes from input 7 to 3 outputs - VMOVDQU (R11), Y11 - VMOVDQU 32(R11), Y13 - ADDQ $0x40, R11 - VPSRLQ $0x04, Y11, Y12 - VPSRLQ $0x04, Y13, Y14 - VPAND Y6, Y11, Y11 - VPAND Y6, Y13, Y13 - VPAND Y6, Y12, Y12 - VPAND Y6, Y14, Y14 + VPXOR Y7, Y2, Y2 VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + VPXOR Y7, Y3, Y3 VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + VPXOR Y7, Y4, Y4 VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 + VPXOR Y7, Y5, Y5 - // Load and process 64 bytes from input 8 to 3 outputs - VMOVDQU (R12), Y11 - VMOVDQU 32(R12), Y13 - ADDQ $0x40, R12 - VPSRLQ $0x04, Y11, Y12 - VPSRLQ $0x04, Y13, Y14 - VPAND Y6, Y11, Y11 - VPAND Y6, Y13, Y13 - VPAND Y6, Y12, Y12 - VPAND Y6, Y14, Y14 + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y9 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 VMOVDQU 1536(CX), Y7 VMOVDQU 1568(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + VPXOR Y7, Y1, Y1 VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 - - // Load and process 64 bytes from input 9 to 3 outputs - VMOVDQU (AX), Y11 - VMOVDQU 32(AX), Y13 - ADDQ $0x40, AX - VPSRLQ $0x04, Y11, Y12 - VPSRLQ $0x04, Y13, Y14 - VPAND Y6, Y11, Y11 - VPAND Y6, Y13, Y13 - VPAND Y6, Y12, Y12 - VPAND Y6, Y14, Y14 + VPXOR Y7, Y2, Y2 VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + VPXOR Y7, Y3, Y3 VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + VPXOR Y7, Y4, Y4 VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 - VPSHUFB Y13, Y7, Y9 - VPSHUFB Y11, Y7, Y7 - VPSHUFB Y14, Y8, Y10 - VPSHUFB Y12, Y8, Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 - - // Store 3 outputs - MOVQ (R13), BP - VMOVDQU Y0, (BP)(R14*1) - VMOVDQU Y1, 32(BP)(R14*1) - MOVQ 24(R13), BP - VMOVDQU Y2, (BP)(R14*1) - VMOVDQU Y3, 32(BP)(R14*1) - MOVQ 48(R13), BP - VMOVDQU Y4, (BP)(R14*1) - VMOVDQU Y5, 32(BP)(R14*1) - - // Prepare for next loop - ADDQ $0x40, R14 - DECQ R15 - JNZ mulAvxTwo_10x3_64_loop - VZEROUPPER - -mulAvxTwo_10x3_64_end: - RET - -// func mulAvxTwo_10x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_10x4(SB), NOSPLIT, $8-88 - // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 89 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_10x4_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), R12 - MOVQ 192(DX), R13 - MOVQ 216(DX), DX - MOVQ out_base+48(FP), R14 - MOVQ start+72(FP), R15 - - // Add start offset to input - ADDQ R15, BX - ADDQ R15, SI - ADDQ R15, DI - ADDQ R15, R8 - ADDQ R15, R9 - ADDQ R15, R10 - ADDQ R15, R11 - ADDQ R15, R12 - ADDQ R15, R13 - ADDQ R15, DX - MOVQ $0x0000000f, BP - MOVQ BP, X4 - VPBROADCASTB X4, Y4 - -mulAvxTwo_10x4_loop: - // Clear 4 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - - // Load and process 32 bytes from input 0 to 4 outputs - VMOVDQU (BX), Y7 - ADDQ $0x20, BX - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU (CX), Y5 - VMOVDQU 32(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 - VMOVDQU 64(CX), Y5 - VMOVDQU 96(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 128(CX), Y5 - VMOVDQU 160(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 - VMOVDQU 192(CX), Y5 - VMOVDQU 224(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 - - // Load and process 32 bytes from input 1 to 4 outputs - VMOVDQU (SI), Y7 - ADDQ $0x20, SI - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU 256(CX), Y5 - VMOVDQU 288(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 - VMOVDQU 320(CX), Y5 - VMOVDQU 352(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 384(CX), Y5 - VMOVDQU 416(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 - VMOVDQU 448(CX), Y5 - VMOVDQU 480(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 - - // Load and process 32 bytes from input 2 to 4 outputs - VMOVDQU (DI), Y7 - ADDQ $0x20, DI - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU 512(CX), Y5 - VMOVDQU 544(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 - VMOVDQU 576(CX), Y5 - VMOVDQU 608(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 640(CX), Y5 - VMOVDQU 672(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 - VMOVDQU 704(CX), Y5 - VMOVDQU 736(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 - - // Load and process 32 bytes from input 3 to 4 outputs - VMOVDQU (R8), Y7 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU 768(CX), Y5 - VMOVDQU 800(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 - VMOVDQU 832(CX), Y5 - VMOVDQU 864(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 896(CX), Y5 - VMOVDQU 928(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 - VMOVDQU 960(CX), Y5 - VMOVDQU 992(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 - - // Load and process 32 bytes from input 4 to 4 outputs - VMOVDQU (R9), Y7 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU 1024(CX), Y5 - VMOVDQU 1056(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 - VMOVDQU 1088(CX), Y5 - VMOVDQU 1120(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 1152(CX), Y5 - VMOVDQU 1184(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 - VMOVDQU 1216(CX), Y5 - VMOVDQU 1248(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPXOR Y7, Y5, Y5 - // Load and process 32 bytes from input 5 to 4 outputs - VMOVDQU (R10), Y7 + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y9 ADDQ $0x20, R10 - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU 1280(CX), Y5 - VMOVDQU 1312(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 - VMOVDQU 1344(CX), Y5 - VMOVDQU 1376(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 1408(CX), Y5 - VMOVDQU 1440(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 - VMOVDQU 1472(CX), Y5 - VMOVDQU 1504(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1920(CX), Y7 + VMOVDQU 1952(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1984(CX), Y7 + VMOVDQU 2016(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2048(CX), Y7 + VMOVDQU 2080(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2112(CX), Y7 + VMOVDQU 2144(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2176(CX), Y7 + VMOVDQU 2208(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 2240(CX), Y7 + VMOVDQU 2272(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 - // Load and process 32 bytes from input 6 to 4 outputs - VMOVDQU (R11), Y7 + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y9 ADDQ $0x20, R11 - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU 1536(CX), Y5 - VMOVDQU 1568(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 - VMOVDQU 1600(CX), Y5 - VMOVDQU 1632(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 1664(CX), Y5 - VMOVDQU 1696(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 - VMOVDQU 1728(CX), Y5 - VMOVDQU 1760(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2304(CX), Y7 + VMOVDQU 2336(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 2368(CX), Y7 + VMOVDQU 2400(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2432(CX), Y7 + VMOVDQU 2464(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2496(CX), Y7 + VMOVDQU 2528(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2560(CX), Y7 + VMOVDQU 2592(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 2624(CX), Y7 + VMOVDQU 2656(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 - // Load and process 32 bytes from input 7 to 4 outputs - VMOVDQU (R12), Y7 + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y9 ADDQ $0x20, R12 - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU 1792(CX), Y5 - VMOVDQU 1824(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 - VMOVDQU 1856(CX), Y5 - VMOVDQU 1888(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 1920(CX), Y5 - VMOVDQU 1952(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 - VMOVDQU 1984(CX), Y5 - VMOVDQU 2016(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2688(CX), Y7 + VMOVDQU 2720(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 2752(CX), Y7 + VMOVDQU 2784(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2816(CX), Y7 + VMOVDQU 2848(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2880(CX), Y7 + VMOVDQU 2912(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2944(CX), Y7 + VMOVDQU 2976(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 3008(CX), Y7 + VMOVDQU 3040(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 - // Load and process 32 bytes from input 8 to 4 outputs - VMOVDQU (R13), Y7 + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (R13), Y9 ADDQ $0x20, R13 - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU 2048(CX), Y5 - VMOVDQU 2080(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 - VMOVDQU 2112(CX), Y5 - VMOVDQU 2144(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 2176(CX), Y5 - VMOVDQU 2208(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 - VMOVDQU 2240(CX), Y5 - VMOVDQU 2272(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 3072(CX), Y7 + VMOVDQU 3104(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 3136(CX), Y7 + VMOVDQU 3168(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 3200(CX), Y7 + VMOVDQU 3232(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 3264(CX), Y7 + VMOVDQU 3296(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 3328(CX), Y7 + VMOVDQU 3360(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 3392(CX), Y7 + VMOVDQU 3424(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 - // Load and process 32 bytes from input 9 to 4 outputs - VMOVDQU (DX), Y7 + // Load and process 32 bytes from input 9 to 6 outputs + VMOVDQU (DX), Y9 ADDQ $0x20, DX - VPSRLQ $0x04, Y7, Y8 - VPAND Y4, Y7, Y7 - VPAND Y4, Y8, Y8 - VMOVDQU 2304(CX), Y5 - VMOVDQU 2336(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 - VMOVDQU 2368(CX), Y5 - VMOVDQU 2400(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 - VMOVDQU 2432(CX), Y5 - VMOVDQU 2464(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 - VMOVDQU 2496(CX), Y5 - VMOVDQU 2528(CX), Y6 - VPSHUFB Y7, Y5, Y5 - VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 3456(CX), Y7 + VMOVDQU 3488(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 3520(CX), Y7 + VMOVDQU 3552(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 3584(CX), Y7 + VMOVDQU 3616(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 3648(CX), Y7 + VMOVDQU 3680(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 3712(CX), Y7 + VMOVDQU 3744(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 3776(CX), Y7 + VMOVDQU 3808(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 - // Store 4 outputs + // Store 6 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) MOVQ 24(R14), BP @@ -32163,27 +64869,31 @@ mulAvxTwo_10x4_loop: VMOVDQU Y2, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU Y3, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y4, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y5, (BP)(R15*1) // Prepare for next loop ADDQ $0x20, R15 DECQ AX - JNZ mulAvxTwo_10x4_loop + JNZ mulAvxTwo_10x6Xor_loop VZEROUPPER -mulAvxTwo_10x4_end: +mulAvxTwo_10x6Xor_end: RET -// func mulAvxTwo_10x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_10x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_10x5(SB), NOSPLIT, $8-88 +TEXT ·mulAvxTwo_10x7(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack - // Full registers estimated 110 YMM used + // Full registers estimated 152 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_10x5_end + JZ mulAvxTwo_10x7_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI @@ -32210,388 +64920,494 @@ TEXT ·mulAvxTwo_10x5(SB), NOSPLIT, $8-88 ADDQ R15, R13 ADDQ R15, DX MOVQ $0x0000000f, BP - MOVQ BP, X5 - VPBROADCASTB X5, Y5 - -mulAvxTwo_10x5_loop: - // Clear 5 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 + MOVQ BP, X7 + VPBROADCASTB X7, Y7 - // Load and process 32 bytes from input 0 to 5 outputs - VMOVDQU (BX), Y8 +mulAvxTwo_10x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y10 ADDQ $0x20, BX - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU (CX), Y6 - VMOVDQU 32(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 64(CX), Y6 - VMOVDQU 96(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 128(CX), Y6 - VMOVDQU 160(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 192(CX), Y6 - VMOVDQU 224(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 256(CX), Y6 - VMOVDQU 288(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y0 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y1 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y2 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y3 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y4 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y5 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y6 - // Load and process 32 bytes from input 1 to 5 outputs - VMOVDQU (SI), Y8 + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y10 ADDQ $0x20, SI - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU 320(CX), Y6 - VMOVDQU 352(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 384(CX), Y6 - VMOVDQU 416(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 448(CX), Y6 - VMOVDQU 480(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 512(CX), Y6 - VMOVDQU 544(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 576(CX), Y6 - VMOVDQU 608(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Load and process 32 bytes from input 2 to 5 outputs - VMOVDQU (DI), Y8 + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y10 ADDQ $0x20, DI - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU 640(CX), Y6 - VMOVDQU 672(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 704(CX), Y6 - VMOVDQU 736(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 768(CX), Y6 - VMOVDQU 800(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 832(CX), Y6 - VMOVDQU 864(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 896(CX), Y6 - VMOVDQU 928(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Load and process 32 bytes from input 3 to 5 outputs - VMOVDQU (R8), Y8 + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y10 ADDQ $0x20, R8 - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU 960(CX), Y6 - VMOVDQU 992(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 1024(CX), Y6 - VMOVDQU 1056(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 1088(CX), Y6 - VMOVDQU 1120(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 1152(CX), Y6 - VMOVDQU 1184(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 1216(CX), Y6 - VMOVDQU 1248(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1472(CX), Y8 + VMOVDQU 1504(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1728(CX), Y8 + VMOVDQU 1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Load and process 32 bytes from input 4 to 5 outputs - VMOVDQU (R9), Y8 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU 1280(CX), Y6 - VMOVDQU 1312(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 1344(CX), Y6 - VMOVDQU 1376(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 1408(CX), Y6 - VMOVDQU 1440(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 1472(CX), Y6 - VMOVDQU 1504(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 1536(CX), Y6 - VMOVDQU 1568(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y10 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1792(CX), Y8 + VMOVDQU 1824(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1856(CX), Y8 + VMOVDQU 1888(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1920(CX), Y8 + VMOVDQU 1952(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1984(CX), Y8 + VMOVDQU 2016(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2048(CX), Y8 + VMOVDQU 2080(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2112(CX), Y8 + VMOVDQU 2144(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2176(CX), Y8 + VMOVDQU 2208(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Load and process 32 bytes from input 5 to 5 outputs - VMOVDQU (R10), Y8 + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y10 ADDQ $0x20, R10 - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU 1600(CX), Y6 - VMOVDQU 1632(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 1664(CX), Y6 - VMOVDQU 1696(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 1728(CX), Y6 - VMOVDQU 1760(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 1792(CX), Y6 - VMOVDQU 1824(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 1856(CX), Y6 - VMOVDQU 1888(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2240(CX), Y8 + VMOVDQU 2272(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 2304(CX), Y8 + VMOVDQU 2336(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 2368(CX), Y8 + VMOVDQU 2400(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 2432(CX), Y8 + VMOVDQU 2464(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2496(CX), Y8 + VMOVDQU 2528(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2560(CX), Y8 + VMOVDQU 2592(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2624(CX), Y8 + VMOVDQU 2656(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Load and process 32 bytes from input 6 to 5 outputs - VMOVDQU (R11), Y8 + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y10 ADDQ $0x20, R11 - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU 1920(CX), Y6 - VMOVDQU 1952(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 1984(CX), Y6 - VMOVDQU 2016(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 2048(CX), Y6 - VMOVDQU 2080(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 2112(CX), Y6 - VMOVDQU 2144(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 2176(CX), Y6 - VMOVDQU 2208(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2688(CX), Y8 + VMOVDQU 2720(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 2752(CX), Y8 + VMOVDQU 2784(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 2816(CX), Y8 + VMOVDQU 2848(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 2880(CX), Y8 + VMOVDQU 2912(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2944(CX), Y8 + VMOVDQU 2976(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 3008(CX), Y8 + VMOVDQU 3040(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 3072(CX), Y8 + VMOVDQU 3104(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Load and process 32 bytes from input 7 to 5 outputs - VMOVDQU (R12), Y8 + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y10 ADDQ $0x20, R12 - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU 2240(CX), Y6 - VMOVDQU 2272(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 2304(CX), Y6 - VMOVDQU 2336(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 2368(CX), Y6 - VMOVDQU 2400(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 2432(CX), Y6 - VMOVDQU 2464(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 2496(CX), Y6 - VMOVDQU 2528(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 3136(CX), Y8 + VMOVDQU 3168(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 3200(CX), Y8 + VMOVDQU 3232(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 3264(CX), Y8 + VMOVDQU 3296(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 3328(CX), Y8 + VMOVDQU 3360(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 3392(CX), Y8 + VMOVDQU 3424(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 3456(CX), Y8 + VMOVDQU 3488(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 3520(CX), Y8 + VMOVDQU 3552(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Load and process 32 bytes from input 8 to 5 outputs - VMOVDQU (R13), Y8 + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (R13), Y10 ADDQ $0x20, R13 - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU 2560(CX), Y6 - VMOVDQU 2592(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 2624(CX), Y6 - VMOVDQU 2656(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 2688(CX), Y6 - VMOVDQU 2720(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 2752(CX), Y6 - VMOVDQU 2784(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 2816(CX), Y6 - VMOVDQU 2848(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 3584(CX), Y8 + VMOVDQU 3616(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 3648(CX), Y8 + VMOVDQU 3680(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 3712(CX), Y8 + VMOVDQU 3744(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 3776(CX), Y8 + VMOVDQU 3808(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 3840(CX), Y8 + VMOVDQU 3872(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 3904(CX), Y8 + VMOVDQU 3936(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 3968(CX), Y8 + VMOVDQU 4000(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Load and process 32 bytes from input 9 to 5 outputs - VMOVDQU (DX), Y8 + // Load and process 32 bytes from input 9 to 7 outputs + VMOVDQU (DX), Y10 ADDQ $0x20, DX - VPSRLQ $0x04, Y8, Y9 - VPAND Y5, Y8, Y8 - VPAND Y5, Y9, Y9 - VMOVDQU 2880(CX), Y6 - VMOVDQU 2912(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 - VMOVDQU 2944(CX), Y6 - VMOVDQU 2976(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 - VMOVDQU 3008(CX), Y6 - VMOVDQU 3040(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 - VMOVDQU 3072(CX), Y6 - VMOVDQU 3104(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 - VMOVDQU 3136(CX), Y6 - VMOVDQU 3168(CX), Y7 - VPSHUFB Y8, Y6, Y6 - VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 4032(CX), Y8 + VMOVDQU 4064(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 4096(CX), Y8 + VMOVDQU 4128(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 4160(CX), Y8 + VMOVDQU 4192(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 4224(CX), Y8 + VMOVDQU 4256(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 4288(CX), Y8 + VMOVDQU 4320(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 4352(CX), Y8 + VMOVDQU 4384(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 4416(CX), Y8 + VMOVDQU 4448(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Store 5 outputs + // Store 7 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) MOVQ 24(R14), BP @@ -32602,27 +65418,31 @@ mulAvxTwo_10x5_loop: VMOVDQU Y3, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU Y4, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y6, (BP)(R15*1) // Prepare for next loop ADDQ $0x20, R15 DECQ AX - JNZ mulAvxTwo_10x5_loop + JNZ mulAvxTwo_10x7_loop VZEROUPPER -mulAvxTwo_10x5_end: +mulAvxTwo_10x7_end: RET -// func mulAvxTwo_10x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_10x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_10x6(SB), NOSPLIT, $8-88 +TEXT ·mulAvxTwo_10x7Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack - // Full registers estimated 131 YMM used + // Full registers estimated 152 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_10x6_end + JZ mulAvxTwo_10x7Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI @@ -32649,449 +65469,1123 @@ TEXT ·mulAvxTwo_10x6(SB), NOSPLIT, $8-88 ADDQ R15, R13 ADDQ R15, DX MOVQ $0x0000000f, BP - MOVQ BP, X6 - VPBROADCASTB X6, Y6 - -mulAvxTwo_10x6_loop: - // Clear 6 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 + MOVQ BP, X7 + VPBROADCASTB X7, Y7 - // Load and process 32 bytes from input 0 to 6 outputs - VMOVDQU (BX), Y9 +mulAvxTwo_10x7Xor_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y10 ADDQ $0x20, BX - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 - VMOVDQU (CX), Y7 - VMOVDQU 32(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y0 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 - VMOVDQU 64(CX), Y7 - VMOVDQU 96(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y1 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y2 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 - VMOVDQU 128(CX), Y7 - VMOVDQU 160(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y3 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 - VMOVDQU 192(CX), Y7 - VMOVDQU 224(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y4 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 - VMOVDQU 256(CX), Y7 - VMOVDQU 288(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y5 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 - VMOVDQU 320(CX), Y7 - VMOVDQU 352(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y6 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Load and process 32 bytes from input 1 to 6 outputs - VMOVDQU (SI), Y9 + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y10 ADDQ $0x20, SI - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 - VMOVDQU 384(CX), Y7 - VMOVDQU 416(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 - VMOVDQU 448(CX), Y7 - VMOVDQU 480(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 - VMOVDQU 512(CX), Y7 - VMOVDQU 544(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 - VMOVDQU 576(CX), Y7 - VMOVDQU 608(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 - VMOVDQU 640(CX), Y7 - VMOVDQU 672(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 - VMOVDQU 704(CX), Y7 - VMOVDQU 736(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Load and process 32 bytes from input 2 to 6 outputs - VMOVDQU (DI), Y9 + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y10 ADDQ $0x20, DI - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 - VMOVDQU 768(CX), Y7 - VMOVDQU 800(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 - VMOVDQU 832(CX), Y7 - VMOVDQU 864(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 - VMOVDQU 896(CX), Y7 - VMOVDQU 928(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 - VMOVDQU 960(CX), Y7 - VMOVDQU 992(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 - VMOVDQU 1024(CX), Y7 - VMOVDQU 1056(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 - VMOVDQU 1088(CX), Y7 - VMOVDQU 1120(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Load and process 32 bytes from input 3 to 6 outputs - VMOVDQU (R8), Y9 + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y10 ADDQ $0x20, R8 - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 - VMOVDQU 1152(CX), Y7 - VMOVDQU 1184(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 - VMOVDQU 1216(CX), Y7 - VMOVDQU 1248(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 - VMOVDQU 1280(CX), Y7 - VMOVDQU 1312(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1472(CX), Y8 + VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 - VMOVDQU 1344(CX), Y7 - VMOVDQU 1376(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 - VMOVDQU 1408(CX), Y7 - VMOVDQU 1440(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 - VMOVDQU 1472(CX), Y7 - VMOVDQU 1504(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1728(CX), Y8 + VMOVDQU 1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Load and process 32 bytes from input 4 to 6 outputs - VMOVDQU (R9), Y9 + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y10 ADDQ $0x20, R9 - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 - VMOVDQU 1536(CX), Y7 - VMOVDQU 1568(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1792(CX), Y8 + VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 - VMOVDQU 1600(CX), Y7 - VMOVDQU 1632(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1856(CX), Y8 + VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 - VMOVDQU 1664(CX), Y7 - VMOVDQU 1696(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1920(CX), Y8 + VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 - VMOVDQU 1728(CX), Y7 - VMOVDQU 1760(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1984(CX), Y8 + VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 - VMOVDQU 1792(CX), Y7 - VMOVDQU 1824(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2048(CX), Y8 + VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 - VMOVDQU 1856(CX), Y7 - VMOVDQU 1888(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2112(CX), Y8 + VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2176(CX), Y8 + VMOVDQU 2208(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Load and process 32 bytes from input 5 to 6 outputs - VMOVDQU (R10), Y9 + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y10 ADDQ $0x20, R10 - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 - VMOVDQU 1920(CX), Y7 - VMOVDQU 1952(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2240(CX), Y8 + VMOVDQU 2272(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 2304(CX), Y8 + VMOVDQU 2336(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 2368(CX), Y8 + VMOVDQU 2400(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 2432(CX), Y8 + VMOVDQU 2464(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2496(CX), Y8 + VMOVDQU 2528(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2560(CX), Y8 + VMOVDQU 2592(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2624(CX), Y8 + VMOVDQU 2656(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y10 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2688(CX), Y8 + VMOVDQU 2720(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 2752(CX), Y8 + VMOVDQU 2784(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 2816(CX), Y8 + VMOVDQU 2848(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 2880(CX), Y8 + VMOVDQU 2912(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2944(CX), Y8 + VMOVDQU 2976(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 3008(CX), Y8 + VMOVDQU 3040(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 3072(CX), Y8 + VMOVDQU 3104(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y10 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 3136(CX), Y8 + VMOVDQU 3168(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 3200(CX), Y8 + VMOVDQU 3232(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 3264(CX), Y8 + VMOVDQU 3296(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 3328(CX), Y8 + VMOVDQU 3360(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 3392(CX), Y8 + VMOVDQU 3424(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 3456(CX), Y8 + VMOVDQU 3488(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 3520(CX), Y8 + VMOVDQU 3552(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (R13), Y10 + ADDQ $0x20, R13 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 3584(CX), Y8 + VMOVDQU 3616(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 - VMOVDQU 1984(CX), Y7 - VMOVDQU 2016(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 3648(CX), Y8 + VMOVDQU 3680(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 - VMOVDQU 2048(CX), Y7 - VMOVDQU 2080(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 3712(CX), Y8 + VMOVDQU 3744(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 - VMOVDQU 2112(CX), Y7 - VMOVDQU 2144(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 3776(CX), Y8 + VMOVDQU 3808(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 - VMOVDQU 2176(CX), Y7 - VMOVDQU 2208(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 3840(CX), Y8 + VMOVDQU 3872(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 - VMOVDQU 2240(CX), Y7 - VMOVDQU 2272(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 3904(CX), Y8 + VMOVDQU 3936(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 3968(CX), Y8 + VMOVDQU 4000(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 - // Load and process 32 bytes from input 6 to 6 outputs - VMOVDQU (R11), Y9 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 - VMOVDQU 2304(CX), Y7 - VMOVDQU 2336(CX), Y8 - VPSHUFB Y9, Y7, Y7 + // Load and process 32 bytes from input 9 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 4032(CX), Y8 + VMOVDQU 4064(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 - VMOVDQU 2368(CX), Y7 - VMOVDQU 2400(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 4096(CX), Y8 + VMOVDQU 4128(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 - VMOVDQU 2432(CX), Y7 - VMOVDQU 2464(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 4160(CX), Y8 + VMOVDQU 4192(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 - VMOVDQU 2496(CX), Y7 - VMOVDQU 2528(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 4224(CX), Y8 + VMOVDQU 4256(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 - VMOVDQU 2560(CX), Y7 - VMOVDQU 2592(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 4288(CX), Y8 + VMOVDQU 4320(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 - VMOVDQU 2624(CX), Y7 - VMOVDQU 2656(CX), Y8 - VPSHUFB Y9, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 4352(CX), Y8 + VMOVDQU 4384(CX), Y9 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 4416(CX), Y8 + VMOVDQU 4448(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Store 7 outputs + MOVQ (R14), BP + VMOVDQU Y0, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y1, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y2, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y3, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y4, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y6, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxTwo_10x7Xor_loop + VZEROUPPER + +mulAvxTwo_10x7Xor_end: + RET + +// func mulAvxTwo_10x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_10x8(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 173 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_10x8_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + MOVQ $0x0000000f, BP + MOVQ BP, X8 + VPBROADCASTB X8, Y8 + +mulAvxTwo_10x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y0 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y1 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y2 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y3 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y4 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y5 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y6 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y7 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y11 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1600(CX), Y9 + VMOVDQU 1632(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1664(CX), Y9 + VMOVDQU 1696(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y11 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2048(CX), Y9 + VMOVDQU 2080(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2112(CX), Y9 + VMOVDQU 2144(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2176(CX), Y9 + VMOVDQU 2208(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2240(CX), Y9 + VMOVDQU 2272(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2304(CX), Y9 + VMOVDQU 2336(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2368(CX), Y9 + VMOVDQU 2400(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2432(CX), Y9 + VMOVDQU 2464(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 2496(CX), Y9 + VMOVDQU 2528(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y11 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2560(CX), Y9 + VMOVDQU 2592(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2624(CX), Y9 + VMOVDQU 2656(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2688(CX), Y9 + VMOVDQU 2720(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2752(CX), Y9 + VMOVDQU 2784(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2816(CX), Y9 + VMOVDQU 2848(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2880(CX), Y9 + VMOVDQU 2912(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2944(CX), Y9 + VMOVDQU 2976(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 3008(CX), Y9 + VMOVDQU 3040(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 - // Load and process 32 bytes from input 7 to 6 outputs - VMOVDQU (R12), Y9 + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y11 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3072(CX), Y9 + VMOVDQU 3104(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 3136(CX), Y9 + VMOVDQU 3168(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 3200(CX), Y9 + VMOVDQU 3232(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 3264(CX), Y9 + VMOVDQU 3296(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 3328(CX), Y9 + VMOVDQU 3360(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 3392(CX), Y9 + VMOVDQU 3424(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 3456(CX), Y9 + VMOVDQU 3488(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 3520(CX), Y9 + VMOVDQU 3552(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y11 ADDQ $0x20, R12 - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 - VMOVDQU 2688(CX), Y7 - VMOVDQU 2720(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 - VMOVDQU 2752(CX), Y7 - VMOVDQU 2784(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 - VMOVDQU 2816(CX), Y7 - VMOVDQU 2848(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 - VMOVDQU 2880(CX), Y7 - VMOVDQU 2912(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 - VMOVDQU 2944(CX), Y7 - VMOVDQU 2976(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 - VMOVDQU 3008(CX), Y7 - VMOVDQU 3040(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3584(CX), Y9 + VMOVDQU 3616(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 3648(CX), Y9 + VMOVDQU 3680(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 3712(CX), Y9 + VMOVDQU 3744(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 3776(CX), Y9 + VMOVDQU 3808(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 3840(CX), Y9 + VMOVDQU 3872(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 3904(CX), Y9 + VMOVDQU 3936(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 3968(CX), Y9 + VMOVDQU 4000(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 4032(CX), Y9 + VMOVDQU 4064(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 - // Load and process 32 bytes from input 8 to 6 outputs - VMOVDQU (R13), Y9 + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (R13), Y11 ADDQ $0x20, R13 - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 - VMOVDQU 3072(CX), Y7 - VMOVDQU 3104(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 - VMOVDQU 3136(CX), Y7 - VMOVDQU 3168(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 - VMOVDQU 3200(CX), Y7 - VMOVDQU 3232(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 - VMOVDQU 3264(CX), Y7 - VMOVDQU 3296(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 - VMOVDQU 3328(CX), Y7 - VMOVDQU 3360(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 - VMOVDQU 3392(CX), Y7 - VMOVDQU 3424(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 4096(CX), Y9 + VMOVDQU 4128(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 4160(CX), Y9 + VMOVDQU 4192(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 4224(CX), Y9 + VMOVDQU 4256(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 4288(CX), Y9 + VMOVDQU 4320(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 4352(CX), Y9 + VMOVDQU 4384(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 4416(CX), Y9 + VMOVDQU 4448(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 4480(CX), Y9 + VMOVDQU 4512(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 4544(CX), Y9 + VMOVDQU 4576(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 - // Load and process 32 bytes from input 9 to 6 outputs - VMOVDQU (DX), Y9 + // Load and process 32 bytes from input 9 to 8 outputs + VMOVDQU (DX), Y11 ADDQ $0x20, DX - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 - VMOVDQU 3456(CX), Y7 - VMOVDQU 3488(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 - VMOVDQU 3520(CX), Y7 - VMOVDQU 3552(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 - VMOVDQU 3584(CX), Y7 - VMOVDQU 3616(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 - VMOVDQU 3648(CX), Y7 - VMOVDQU 3680(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 - VMOVDQU 3712(CX), Y7 - VMOVDQU 3744(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 - VMOVDQU 3776(CX), Y7 - VMOVDQU 3808(CX), Y8 - VPSHUFB Y9, Y7, Y7 - VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 4608(CX), Y9 + VMOVDQU 4640(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 4672(CX), Y9 + VMOVDQU 4704(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 4736(CX), Y9 + VMOVDQU 4768(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 4800(CX), Y9 + VMOVDQU 4832(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 4864(CX), Y9 + VMOVDQU 4896(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 4928(CX), Y9 + VMOVDQU 4960(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 4992(CX), Y9 + VMOVDQU 5024(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 5056(CX), Y9 + VMOVDQU 5088(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 - // Store 6 outputs + // Store 8 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) MOVQ 24(R14), BP @@ -33104,27 +66598,31 @@ mulAvxTwo_10x6_loop: VMOVDQU Y4, (BP)(R15*1) MOVQ 120(R14), BP VMOVDQU Y5, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y7, (BP)(R15*1) // Prepare for next loop ADDQ $0x20, R15 DECQ AX - JNZ mulAvxTwo_10x6_loop + JNZ mulAvxTwo_10x8_loop VZEROUPPER -mulAvxTwo_10x6_end: +mulAvxTwo_10x8_end: RET -// func mulAvxTwo_10x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_10x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_10x7(SB), NOSPLIT, $8-88 +TEXT ·mulAvxTwo_10x8Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack - // Full registers estimated 152 YMM used + // Full registers estimated 173 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_10x7_end + JZ mulAvxTwo_10x8Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI @@ -33151,1136 +66649,1246 @@ TEXT ·mulAvxTwo_10x7(SB), NOSPLIT, $8-88 ADDQ R15, R13 ADDQ R15, DX MOVQ $0x0000000f, BP - MOVQ BP, X7 - VPBROADCASTB X7, Y7 - -mulAvxTwo_10x7_loop: - // Clear 7 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - - // Load and process 32 bytes from input 0 to 7 outputs - VMOVDQU (BX), Y10 - ADDQ $0x20, BX - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU (CX), Y8 - VMOVDQU 32(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 64(CX), Y8 - VMOVDQU 96(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 128(CX), Y8 - VMOVDQU 160(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 192(CX), Y8 - VMOVDQU 224(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 256(CX), Y8 - VMOVDQU 288(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 320(CX), Y8 - VMOVDQU 352(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 384(CX), Y8 - VMOVDQU 416(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 - - // Load and process 32 bytes from input 1 to 7 outputs - VMOVDQU (SI), Y10 - ADDQ $0x20, SI - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU 448(CX), Y8 - VMOVDQU 480(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 512(CX), Y8 - VMOVDQU 544(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 576(CX), Y8 - VMOVDQU 608(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 640(CX), Y8 - VMOVDQU 672(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 704(CX), Y8 - VMOVDQU 736(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 768(CX), Y8 - VMOVDQU 800(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 832(CX), Y8 - VMOVDQU 864(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 - - // Load and process 32 bytes from input 2 to 7 outputs - VMOVDQU (DI), Y10 - ADDQ $0x20, DI - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU 896(CX), Y8 - VMOVDQU 928(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 960(CX), Y8 - VMOVDQU 992(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 1024(CX), Y8 - VMOVDQU 1056(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 1088(CX), Y8 - VMOVDQU 1120(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 1152(CX), Y8 - VMOVDQU 1184(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 1216(CX), Y8 - VMOVDQU 1248(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 1280(CX), Y8 - VMOVDQU 1312(CX), Y9 - VPSHUFB Y10, Y8, Y8 - VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + MOVQ BP, X8 + VPBROADCASTB X8, Y8 - // Load and process 32 bytes from input 3 to 7 outputs - VMOVDQU (R8), Y10 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU 1344(CX), Y8 - VMOVDQU 1376(CX), Y9 - VPSHUFB Y10, Y8, Y8 +mulAvxTwo_10x8Xor_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y0 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 1408(CX), Y8 - VMOVDQU 1440(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y1 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 1472(CX), Y8 - VMOVDQU 1504(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y2 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 1536(CX), Y8 - VMOVDQU 1568(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y3 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 1600(CX), Y8 - VMOVDQU 1632(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y4 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 1664(CX), Y8 - VMOVDQU 1696(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y5 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 1728(CX), Y8 - VMOVDQU 1760(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y6 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + MOVQ 168(R14), BP + VMOVDQU (BP)(R15*1), Y7 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 - // Load and process 32 bytes from input 4 to 7 outputs - VMOVDQU (R9), Y10 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU 1792(CX), Y8 - VMOVDQU 1824(CX), Y9 - VPSHUFB Y10, Y8, Y8 + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 1856(CX), Y8 - VMOVDQU 1888(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 1920(CX), Y8 - VMOVDQU 1952(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 1984(CX), Y8 - VMOVDQU 2016(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 2048(CX), Y8 - VMOVDQU 2080(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 2112(CX), Y8 - VMOVDQU 2144(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 2176(CX), Y8 - VMOVDQU 2208(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 - // Load and process 32 bytes from input 5 to 7 outputs - VMOVDQU (R10), Y10 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU 2240(CX), Y8 - VMOVDQU 2272(CX), Y9 - VPSHUFB Y10, Y8, Y8 + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 2304(CX), Y8 - VMOVDQU 2336(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 2368(CX), Y8 - VMOVDQU 2400(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 2432(CX), Y8 - VMOVDQU 2464(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 2496(CX), Y8 - VMOVDQU 2528(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 2560(CX), Y8 - VMOVDQU 2592(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 2624(CX), Y8 - VMOVDQU 2656(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 - // Load and process 32 bytes from input 6 to 7 outputs - VMOVDQU (R11), Y10 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU 2688(CX), Y8 - VMOVDQU 2720(CX), Y9 - VPSHUFB Y10, Y8, Y8 + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y11 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 2752(CX), Y8 - VMOVDQU 2784(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1600(CX), Y9 + VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 2816(CX), Y8 - VMOVDQU 2848(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1664(CX), Y9 + VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 2880(CX), Y8 - VMOVDQU 2912(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 2944(CX), Y8 - VMOVDQU 2976(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 3008(CX), Y8 - VMOVDQU 3040(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 3072(CX), Y8 - VMOVDQU 3104(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 - // Load and process 32 bytes from input 7 to 7 outputs - VMOVDQU (R12), Y10 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU 3136(CX), Y8 - VMOVDQU 3168(CX), Y9 - VPSHUFB Y10, Y8, Y8 + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y11 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2048(CX), Y9 + VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 3200(CX), Y8 - VMOVDQU 3232(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2112(CX), Y9 + VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 3264(CX), Y8 - VMOVDQU 3296(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2176(CX), Y9 + VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 3328(CX), Y8 - VMOVDQU 3360(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2240(CX), Y9 + VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 3392(CX), Y8 - VMOVDQU 3424(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2304(CX), Y9 + VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 3456(CX), Y8 - VMOVDQU 3488(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2368(CX), Y9 + VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 3520(CX), Y8 - VMOVDQU 3552(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2432(CX), Y9 + VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 2496(CX), Y9 + VMOVDQU 2528(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 - // Load and process 32 bytes from input 8 to 7 outputs - VMOVDQU (R13), Y10 - ADDQ $0x20, R13 - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU 3584(CX), Y8 - VMOVDQU 3616(CX), Y9 - VPSHUFB Y10, Y8, Y8 + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y11 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2560(CX), Y9 + VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 3648(CX), Y8 - VMOVDQU 3680(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2624(CX), Y9 + VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 3712(CX), Y8 - VMOVDQU 3744(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2688(CX), Y9 + VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 3776(CX), Y8 - VMOVDQU 3808(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2752(CX), Y9 + VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 3840(CX), Y8 - VMOVDQU 3872(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2816(CX), Y9 + VMOVDQU 2848(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2880(CX), Y9 + VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 3904(CX), Y8 - VMOVDQU 3936(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2944(CX), Y9 + VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 3968(CX), Y8 - VMOVDQU 4000(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 3008(CX), Y9 + VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 - // Load and process 32 bytes from input 9 to 7 outputs - VMOVDQU (DX), Y10 - ADDQ $0x20, DX - VPSRLQ $0x04, Y10, Y11 - VPAND Y7, Y10, Y10 - VPAND Y7, Y11, Y11 - VMOVDQU 4032(CX), Y8 - VMOVDQU 4064(CX), Y9 - VPSHUFB Y10, Y8, Y8 + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y11 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3072(CX), Y9 + VMOVDQU 3104(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 - VMOVDQU 4096(CX), Y8 - VMOVDQU 4128(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 3136(CX), Y9 + VMOVDQU 3168(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 - VMOVDQU 4160(CX), Y8 - VMOVDQU 4192(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 3200(CX), Y9 + VMOVDQU 3232(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 - VMOVDQU 4224(CX), Y8 - VMOVDQU 4256(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 3264(CX), Y9 + VMOVDQU 3296(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 - VMOVDQU 4288(CX), Y8 - VMOVDQU 4320(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 3328(CX), Y9 + VMOVDQU 3360(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 - VMOVDQU 4352(CX), Y8 - VMOVDQU 4384(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 3392(CX), Y9 + VMOVDQU 3424(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 - VMOVDQU 4416(CX), Y8 - VMOVDQU 4448(CX), Y9 - VPSHUFB Y10, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 3456(CX), Y9 + VMOVDQU 3488(CX), Y10 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 - - // Store 7 outputs - MOVQ (R14), BP - VMOVDQU Y0, (BP)(R15*1) - MOVQ 24(R14), BP - VMOVDQU Y1, (BP)(R15*1) - MOVQ 48(R14), BP - VMOVDQU Y2, (BP)(R15*1) - MOVQ 72(R14), BP - VMOVDQU Y3, (BP)(R15*1) - MOVQ 96(R14), BP - VMOVDQU Y4, (BP)(R15*1) - MOVQ 120(R14), BP - VMOVDQU Y5, (BP)(R15*1) - MOVQ 144(R14), BP - VMOVDQU Y6, (BP)(R15*1) - - // Prepare for next loop - ADDQ $0x20, R15 - DECQ AX - JNZ mulAvxTwo_10x7_loop - VZEROUPPER - -mulAvxTwo_10x7_end: - RET - -// func mulAvxTwo_10x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_10x8(SB), NOSPLIT, $8-88 - // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 173 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_10x8_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), R12 - MOVQ 192(DX), R13 - MOVQ 216(DX), DX - MOVQ out_base+48(FP), R14 - MOVQ start+72(FP), R15 - - // Add start offset to input - ADDQ R15, BX - ADDQ R15, SI - ADDQ R15, DI - ADDQ R15, R8 - ADDQ R15, R9 - ADDQ R15, R10 - ADDQ R15, R11 - ADDQ R15, R12 - ADDQ R15, R13 - ADDQ R15, DX - MOVQ $0x0000000f, BP - MOVQ BP, X8 - VPBROADCASTB X8, Y8 - -mulAvxTwo_10x8_loop: - // Clear 8 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 3520(CX), Y9 + VMOVDQU 3552(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 - // Load and process 32 bytes from input 0 to 8 outputs - VMOVDQU (BX), Y11 - ADDQ $0x20, BX + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y11 + ADDQ $0x20, R12 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 - VMOVDQU (CX), Y9 - VMOVDQU 32(CX), Y10 + VMOVDQU 3584(CX), Y9 + VMOVDQU 3616(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 - VMOVDQU 64(CX), Y9 - VMOVDQU 96(CX), Y10 + VMOVDQU 3648(CX), Y9 + VMOVDQU 3680(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 - VMOVDQU 128(CX), Y9 - VMOVDQU 160(CX), Y10 + VMOVDQU 3712(CX), Y9 + VMOVDQU 3744(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 - VMOVDQU 192(CX), Y9 - VMOVDQU 224(CX), Y10 + VMOVDQU 3776(CX), Y9 + VMOVDQU 3808(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 - VMOVDQU 256(CX), Y9 - VMOVDQU 288(CX), Y10 + VMOVDQU 3840(CX), Y9 + VMOVDQU 3872(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 - VMOVDQU 320(CX), Y9 - VMOVDQU 352(CX), Y10 + VMOVDQU 3904(CX), Y9 + VMOVDQU 3936(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 - VMOVDQU 384(CX), Y9 - VMOVDQU 416(CX), Y10 + VMOVDQU 3968(CX), Y9 + VMOVDQU 4000(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 - VMOVDQU 448(CX), Y9 - VMOVDQU 480(CX), Y10 + VMOVDQU 4032(CX), Y9 + VMOVDQU 4064(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 - // Load and process 32 bytes from input 1 to 8 outputs - VMOVDQU (SI), Y11 - ADDQ $0x20, SI + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (R13), Y11 + ADDQ $0x20, R13 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 - VMOVDQU 512(CX), Y9 - VMOVDQU 544(CX), Y10 + VMOVDQU 4096(CX), Y9 + VMOVDQU 4128(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 - VMOVDQU 576(CX), Y9 - VMOVDQU 608(CX), Y10 + VMOVDQU 4160(CX), Y9 + VMOVDQU 4192(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 - VMOVDQU 640(CX), Y9 - VMOVDQU 672(CX), Y10 + VMOVDQU 4224(CX), Y9 + VMOVDQU 4256(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 - VMOVDQU 704(CX), Y9 - VMOVDQU 736(CX), Y10 + VMOVDQU 4288(CX), Y9 + VMOVDQU 4320(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 - VMOVDQU 768(CX), Y9 - VMOVDQU 800(CX), Y10 + VMOVDQU 4352(CX), Y9 + VMOVDQU 4384(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 - VMOVDQU 832(CX), Y9 - VMOVDQU 864(CX), Y10 + VMOVDQU 4416(CX), Y9 + VMOVDQU 4448(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 - VMOVDQU 896(CX), Y9 - VMOVDQU 928(CX), Y10 + VMOVDQU 4480(CX), Y9 + VMOVDQU 4512(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 - VMOVDQU 960(CX), Y9 - VMOVDQU 992(CX), Y10 + VMOVDQU 4544(CX), Y9 + VMOVDQU 4576(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 - // Load and process 32 bytes from input 2 to 8 outputs - VMOVDQU (DI), Y11 - ADDQ $0x20, DI + // Load and process 32 bytes from input 9 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 - VMOVDQU 1024(CX), Y9 - VMOVDQU 1056(CX), Y10 + VMOVDQU 4608(CX), Y9 + VMOVDQU 4640(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 - VMOVDQU 1088(CX), Y9 - VMOVDQU 1120(CX), Y10 + VMOVDQU 4672(CX), Y9 + VMOVDQU 4704(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 - VMOVDQU 1152(CX), Y9 - VMOVDQU 1184(CX), Y10 + VMOVDQU 4736(CX), Y9 + VMOVDQU 4768(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 - VMOVDQU 1216(CX), Y9 - VMOVDQU 1248(CX), Y10 + VMOVDQU 4800(CX), Y9 + VMOVDQU 4832(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 - VMOVDQU 1280(CX), Y9 - VMOVDQU 1312(CX), Y10 + VMOVDQU 4864(CX), Y9 + VMOVDQU 4896(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 - VMOVDQU 1344(CX), Y9 - VMOVDQU 1376(CX), Y10 + VMOVDQU 4928(CX), Y9 + VMOVDQU 4960(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 - VMOVDQU 1408(CX), Y9 - VMOVDQU 1440(CX), Y10 + VMOVDQU 4992(CX), Y9 + VMOVDQU 5024(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 5056(CX), Y9 + VMOVDQU 5088(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 1472(CX), Y9 - VMOVDQU 1504(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Store 8 outputs + MOVQ (R14), BP + VMOVDQU Y0, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y1, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y2, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y3, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y4, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y7, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxTwo_10x8Xor_loop + VZEROUPPER + +mulAvxTwo_10x8Xor_end: + RET + +// func mulAvxTwo_10x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_10x9(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 194 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_10x9_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + MOVQ $0x0000000f, BP + MOVQ BP, X9 + VPBROADCASTB X9, Y9 + +mulAvxTwo_10x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y0 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y1 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y2 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y3 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y4 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y5 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y6 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y7 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y8 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1152(CX), Y10 + VMOVDQU 1184(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 1216(CX), Y10 + VMOVDQU 1248(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 1280(CX), Y10 + VMOVDQU 1312(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 1344(CX), Y10 + VMOVDQU 1376(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 1408(CX), Y10 + VMOVDQU 1440(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 1472(CX), Y10 + VMOVDQU 1504(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 1536(CX), Y10 + VMOVDQU 1568(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1600(CX), Y10 + VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1664(CX), Y10 + VMOVDQU 1696(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 - // Load and process 32 bytes from input 3 to 8 outputs - VMOVDQU (R8), Y11 + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y12 ADDQ $0x20, R8 - VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU 1536(CX), Y9 - VMOVDQU 1568(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1728(CX), Y10 + VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 1600(CX), Y9 - VMOVDQU 1632(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 1792(CX), Y10 + VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 - VMOVDQU 1664(CX), Y9 - VMOVDQU 1696(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 1856(CX), Y10 + VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 1728(CX), Y9 - VMOVDQU 1760(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 1920(CX), Y10 + VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 - VMOVDQU 1792(CX), Y9 - VMOVDQU 1824(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 1984(CX), Y10 + VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 1856(CX), Y9 - VMOVDQU 1888(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 2048(CX), Y10 + VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 - VMOVDQU 1920(CX), Y9 - VMOVDQU 1952(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 2112(CX), Y10 + VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 1984(CX), Y9 - VMOVDQU 2016(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 2176(CX), Y10 + VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 2240(CX), Y10 + VMOVDQU 2272(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 - // Load and process 32 bytes from input 4 to 8 outputs - VMOVDQU (R9), Y11 + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y12 ADDQ $0x20, R9 - VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU 2048(CX), Y9 - VMOVDQU 2080(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2304(CX), Y10 + VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 2112(CX), Y9 - VMOVDQU 2144(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 2368(CX), Y10 + VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 - VMOVDQU 2176(CX), Y9 - VMOVDQU 2208(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 2432(CX), Y10 + VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 2240(CX), Y9 - VMOVDQU 2272(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 2496(CX), Y10 + VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 - VMOVDQU 2304(CX), Y9 - VMOVDQU 2336(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 2560(CX), Y10 + VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 2368(CX), Y9 - VMOVDQU 2400(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 2624(CX), Y10 + VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 - VMOVDQU 2432(CX), Y9 - VMOVDQU 2464(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 2688(CX), Y10 + VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 2496(CX), Y9 - VMOVDQU 2528(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 2752(CX), Y10 + VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 2816(CX), Y10 + VMOVDQU 2848(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 - // Load and process 32 bytes from input 5 to 8 outputs - VMOVDQU (R10), Y11 + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y12 ADDQ $0x20, R10 - VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU 2560(CX), Y9 - VMOVDQU 2592(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2880(CX), Y10 + VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 2624(CX), Y9 - VMOVDQU 2656(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 2944(CX), Y10 + VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 - VMOVDQU 2688(CX), Y9 - VMOVDQU 2720(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 3008(CX), Y10 + VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 2752(CX), Y9 - VMOVDQU 2784(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 3072(CX), Y10 + VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 - VMOVDQU 2816(CX), Y9 - VMOVDQU 2848(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 3136(CX), Y10 + VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 2880(CX), Y9 - VMOVDQU 2912(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 3200(CX), Y10 + VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 - VMOVDQU 2944(CX), Y9 - VMOVDQU 2976(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 3264(CX), Y10 + VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 3008(CX), Y9 - VMOVDQU 3040(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 3328(CX), Y10 + VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 3392(CX), Y10 + VMOVDQU 3424(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 - // Load and process 32 bytes from input 6 to 8 outputs - VMOVDQU (R11), Y11 + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y12 ADDQ $0x20, R11 - VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU 3072(CX), Y9 - VMOVDQU 3104(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 3456(CX), Y10 + VMOVDQU 3488(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 3136(CX), Y9 - VMOVDQU 3168(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 3520(CX), Y10 + VMOVDQU 3552(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 - VMOVDQU 3200(CX), Y9 - VMOVDQU 3232(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 3584(CX), Y10 + VMOVDQU 3616(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 3264(CX), Y9 - VMOVDQU 3296(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 3648(CX), Y10 + VMOVDQU 3680(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 - VMOVDQU 3328(CX), Y9 - VMOVDQU 3360(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 3712(CX), Y10 + VMOVDQU 3744(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 3392(CX), Y9 - VMOVDQU 3424(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 3776(CX), Y10 + VMOVDQU 3808(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 - VMOVDQU 3456(CX), Y9 - VMOVDQU 3488(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 3840(CX), Y10 + VMOVDQU 3872(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 3520(CX), Y9 - VMOVDQU 3552(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 3904(CX), Y10 + VMOVDQU 3936(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 3968(CX), Y10 + VMOVDQU 4000(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 - // Load and process 32 bytes from input 7 to 8 outputs - VMOVDQU (R12), Y11 + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y12 ADDQ $0x20, R12 - VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU 3584(CX), Y9 - VMOVDQU 3616(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 4032(CX), Y10 + VMOVDQU 4064(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 3648(CX), Y9 - VMOVDQU 3680(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 4096(CX), Y10 + VMOVDQU 4128(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 - VMOVDQU 3712(CX), Y9 - VMOVDQU 3744(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 4160(CX), Y10 + VMOVDQU 4192(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 3776(CX), Y9 - VMOVDQU 3808(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 4224(CX), Y10 + VMOVDQU 4256(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 - VMOVDQU 3840(CX), Y9 - VMOVDQU 3872(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 4288(CX), Y10 + VMOVDQU 4320(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 3904(CX), Y9 - VMOVDQU 3936(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 4352(CX), Y10 + VMOVDQU 4384(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 - VMOVDQU 3968(CX), Y9 - VMOVDQU 4000(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 4416(CX), Y10 + VMOVDQU 4448(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 4032(CX), Y9 - VMOVDQU 4064(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 4480(CX), Y10 + VMOVDQU 4512(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 - - // Load and process 32 bytes from input 8 to 8 outputs - VMOVDQU (R13), Y11 - ADDQ $0x20, R13 - VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU 4096(CX), Y9 - VMOVDQU 4128(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 4544(CX), Y10 + VMOVDQU 4576(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (R13), Y12 + ADDQ $0x20, R13 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 4608(CX), Y10 + VMOVDQU 4640(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 4160(CX), Y9 - VMOVDQU 4192(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 4672(CX), Y10 + VMOVDQU 4704(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 - VMOVDQU 4224(CX), Y9 - VMOVDQU 4256(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 4736(CX), Y10 + VMOVDQU 4768(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 4288(CX), Y9 - VMOVDQU 4320(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 4800(CX), Y10 + VMOVDQU 4832(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 - VMOVDQU 4352(CX), Y9 - VMOVDQU 4384(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 4864(CX), Y10 + VMOVDQU 4896(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 4416(CX), Y9 - VMOVDQU 4448(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 4928(CX), Y10 + VMOVDQU 4960(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 - VMOVDQU 4480(CX), Y9 - VMOVDQU 4512(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 4992(CX), Y10 + VMOVDQU 5024(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 4544(CX), Y9 - VMOVDQU 4576(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 5056(CX), Y10 + VMOVDQU 5088(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 5120(CX), Y10 + VMOVDQU 5152(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 - // Load and process 32 bytes from input 9 to 8 outputs - VMOVDQU (DX), Y11 + // Load and process 32 bytes from input 9 to 9 outputs + VMOVDQU (DX), Y12 ADDQ $0x20, DX - VPSRLQ $0x04, Y11, Y12 - VPAND Y8, Y11, Y11 - VPAND Y8, Y12, Y12 - VMOVDQU 4608(CX), Y9 - VMOVDQU 4640(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 5184(CX), Y10 + VMOVDQU 5216(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 - VMOVDQU 4672(CX), Y9 - VMOVDQU 4704(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 5248(CX), Y10 + VMOVDQU 5280(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 - VMOVDQU 4736(CX), Y9 - VMOVDQU 4768(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 5312(CX), Y10 + VMOVDQU 5344(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 - VMOVDQU 4800(CX), Y9 - VMOVDQU 4832(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 5376(CX), Y10 + VMOVDQU 5408(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 - VMOVDQU 4864(CX), Y9 - VMOVDQU 4896(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 5440(CX), Y10 + VMOVDQU 5472(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 - VMOVDQU 4928(CX), Y9 - VMOVDQU 4960(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 5504(CX), Y10 + VMOVDQU 5536(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 - VMOVDQU 4992(CX), Y9 - VMOVDQU 5024(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 5568(CX), Y10 + VMOVDQU 5600(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 - VMOVDQU 5056(CX), Y9 - VMOVDQU 5088(CX), Y10 - VPSHUFB Y11, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 5632(CX), Y10 + VMOVDQU 5664(CX), Y11 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 5696(CX), Y10 + VMOVDQU 5728(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 - // Store 8 outputs + // Store 9 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) MOVQ 24(R14), BP @@ -34297,19 +67905,21 @@ mulAvxTwo_10x8_loop: VMOVDQU Y6, (BP)(R15*1) MOVQ 168(R14), BP VMOVDQU Y7, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y8, (BP)(R15*1) // Prepare for next loop ADDQ $0x20, R15 DECQ AX - JNZ mulAvxTwo_10x8_loop + JNZ mulAvxTwo_10x9_loop VZEROUPPER -mulAvxTwo_10x8_end: +mulAvxTwo_10x9_end: RET -// func mulAvxTwo_10x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_10x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_10x9(SB), NOSPLIT, $8-88 +TEXT ·mulAvxTwo_10x9Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 194 YMM used @@ -34317,7 +67927,7 @@ TEXT ·mulAvxTwo_10x9(SB), NOSPLIT, $8-88 MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_10x9_end + JZ mulAvxTwo_10x9Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI @@ -34347,72 +67957,79 @@ TEXT ·mulAvxTwo_10x9(SB), NOSPLIT, $8-88 MOVQ BP, X9 VPBROADCASTB X9, Y9 -mulAvxTwo_10x9_loop: - // Clear 9 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 - +mulAvxTwo_10x9Xor_loop: // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y12 ADDQ $0x20, BX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y0 VMOVDQU (CX), Y10 VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 + MOVQ 168(R14), BP + VMOVDQU (BP)(R15*1), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 + MOVQ 192(R14), BP + VMOVDQU (BP)(R15*1), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 @@ -34542,434 +68159,1164 @@ mulAvxTwo_10x9_loop: VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 - // Load and process 32 bytes from input 3 to 9 outputs - VMOVDQU (R8), Y12 - ADDQ $0x20, R8 + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y12 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1728(CX), Y10 + VMOVDQU 1760(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 1792(CX), Y10 + VMOVDQU 1824(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 1856(CX), Y10 + VMOVDQU 1888(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 1920(CX), Y10 + VMOVDQU 1952(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 1984(CX), Y10 + VMOVDQU 2016(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 2048(CX), Y10 + VMOVDQU 2080(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 2112(CX), Y10 + VMOVDQU 2144(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 2176(CX), Y10 + VMOVDQU 2208(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 2240(CX), Y10 + VMOVDQU 2272(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y12 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2304(CX), Y10 + VMOVDQU 2336(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 2368(CX), Y10 + VMOVDQU 2400(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 2432(CX), Y10 + VMOVDQU 2464(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 2496(CX), Y10 + VMOVDQU 2528(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 2560(CX), Y10 + VMOVDQU 2592(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 2624(CX), Y10 + VMOVDQU 2656(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 2688(CX), Y10 + VMOVDQU 2720(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 2752(CX), Y10 + VMOVDQU 2784(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 2816(CX), Y10 + VMOVDQU 2848(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y12 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2880(CX), Y10 + VMOVDQU 2912(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 2944(CX), Y10 + VMOVDQU 2976(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 3008(CX), Y10 + VMOVDQU 3040(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 3072(CX), Y10 + VMOVDQU 3104(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 3136(CX), Y10 + VMOVDQU 3168(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 3200(CX), Y10 + VMOVDQU 3232(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 3264(CX), Y10 + VMOVDQU 3296(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 3328(CX), Y10 + VMOVDQU 3360(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 3392(CX), Y10 + VMOVDQU 3424(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y12 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 3456(CX), Y10 + VMOVDQU 3488(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 3520(CX), Y10 + VMOVDQU 3552(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 3584(CX), Y10 + VMOVDQU 3616(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 3648(CX), Y10 + VMOVDQU 3680(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 3712(CX), Y10 + VMOVDQU 3744(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 3776(CX), Y10 + VMOVDQU 3808(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 3840(CX), Y10 + VMOVDQU 3872(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 3904(CX), Y10 + VMOVDQU 3936(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 3968(CX), Y10 + VMOVDQU 4000(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y12 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 4032(CX), Y10 + VMOVDQU 4064(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 4096(CX), Y10 + VMOVDQU 4128(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 4160(CX), Y10 + VMOVDQU 4192(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 4224(CX), Y10 + VMOVDQU 4256(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 4288(CX), Y10 + VMOVDQU 4320(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 4352(CX), Y10 + VMOVDQU 4384(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 4416(CX), Y10 + VMOVDQU 4448(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 4480(CX), Y10 + VMOVDQU 4512(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 4544(CX), Y10 + VMOVDQU 4576(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (R13), Y12 + ADDQ $0x20, R13 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 - VMOVDQU 1728(CX), Y10 - VMOVDQU 1760(CX), Y11 + VMOVDQU 4608(CX), Y10 + VMOVDQU 4640(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 - VMOVDQU 1792(CX), Y10 - VMOVDQU 1824(CX), Y11 + VMOVDQU 4672(CX), Y10 + VMOVDQU 4704(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 - VMOVDQU 1856(CX), Y10 - VMOVDQU 1888(CX), Y11 + VMOVDQU 4736(CX), Y10 + VMOVDQU 4768(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 - VMOVDQU 1920(CX), Y10 - VMOVDQU 1952(CX), Y11 + VMOVDQU 4800(CX), Y10 + VMOVDQU 4832(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 - VMOVDQU 1984(CX), Y10 - VMOVDQU 2016(CX), Y11 + VMOVDQU 4864(CX), Y10 + VMOVDQU 4896(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 - VMOVDQU 2048(CX), Y10 - VMOVDQU 2080(CX), Y11 + VMOVDQU 4928(CX), Y10 + VMOVDQU 4960(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 - VMOVDQU 2112(CX), Y10 - VMOVDQU 2144(CX), Y11 + VMOVDQU 4992(CX), Y10 + VMOVDQU 5024(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 - VMOVDQU 2176(CX), Y10 - VMOVDQU 2208(CX), Y11 + VMOVDQU 5056(CX), Y10 + VMOVDQU 5088(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 - VMOVDQU 2240(CX), Y10 - VMOVDQU 2272(CX), Y11 + VMOVDQU 5120(CX), Y10 + VMOVDQU 5152(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 - // Load and process 32 bytes from input 4 to 9 outputs - VMOVDQU (R9), Y12 - ADDQ $0x20, R9 + // Load and process 32 bytes from input 9 to 9 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 - VMOVDQU 2304(CX), Y10 - VMOVDQU 2336(CX), Y11 + VMOVDQU 5184(CX), Y10 + VMOVDQU 5216(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 - VMOVDQU 2368(CX), Y10 - VMOVDQU 2400(CX), Y11 + VMOVDQU 5248(CX), Y10 + VMOVDQU 5280(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 - VMOVDQU 2432(CX), Y10 - VMOVDQU 2464(CX), Y11 + VMOVDQU 5312(CX), Y10 + VMOVDQU 5344(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 - VMOVDQU 2496(CX), Y10 - VMOVDQU 2528(CX), Y11 + VMOVDQU 5376(CX), Y10 + VMOVDQU 5408(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 - VMOVDQU 2560(CX), Y10 - VMOVDQU 2592(CX), Y11 + VMOVDQU 5440(CX), Y10 + VMOVDQU 5472(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 - VMOVDQU 2624(CX), Y10 - VMOVDQU 2656(CX), Y11 + VMOVDQU 5504(CX), Y10 + VMOVDQU 5536(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 - VMOVDQU 2688(CX), Y10 - VMOVDQU 2720(CX), Y11 + VMOVDQU 5568(CX), Y10 + VMOVDQU 5600(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 - VMOVDQU 2752(CX), Y10 - VMOVDQU 2784(CX), Y11 + VMOVDQU 5632(CX), Y10 + VMOVDQU 5664(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 - VMOVDQU 2816(CX), Y10 - VMOVDQU 2848(CX), Y11 + VMOVDQU 5696(CX), Y10 + VMOVDQU 5728(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 - // Load and process 32 bytes from input 5 to 9 outputs - VMOVDQU (R10), Y12 + // Store 9 outputs + MOVQ (R14), BP + VMOVDQU Y0, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y1, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y2, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y3, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y4, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y8, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxTwo_10x9Xor_loop + VZEROUPPER + +mulAvxTwo_10x9Xor_end: + RET + +// func mulAvxTwo_10x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_10x10(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 215 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_10x10_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + MOVQ $0x0000000f, BP + MOVQ BP, X10 + VPBROADCASTB X10, Y10 + +mulAvxTwo_10x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y13 + ADDQ $0x20, BX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y0 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y1 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y2 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y3 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y4 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y5 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y6 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y7 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y8 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y9 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y13 + ADDQ $0x20, SI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1024(CX), Y11 + VMOVDQU 1056(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 1088(CX), Y11 + VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y13 + ADDQ $0x20, DI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1280(CX), Y11 + VMOVDQU 1312(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 1344(CX), Y11 + VMOVDQU 1376(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 1408(CX), Y11 + VMOVDQU 1440(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 1472(CX), Y11 + VMOVDQU 1504(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 1536(CX), Y11 + VMOVDQU 1568(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 1600(CX), Y11 + VMOVDQU 1632(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1664(CX), Y11 + VMOVDQU 1696(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 1728(CX), Y11 + VMOVDQU 1760(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1792(CX), Y11 + VMOVDQU 1824(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1856(CX), Y11 + VMOVDQU 1888(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y13 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1920(CX), Y11 + VMOVDQU 1952(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 1984(CX), Y11 + VMOVDQU 2016(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 2048(CX), Y11 + VMOVDQU 2080(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 2112(CX), Y11 + VMOVDQU 2144(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 2176(CX), Y11 + VMOVDQU 2208(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 2240(CX), Y11 + VMOVDQU 2272(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 2304(CX), Y11 + VMOVDQU 2336(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 2368(CX), Y11 + VMOVDQU 2400(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 2432(CX), Y11 + VMOVDQU 2464(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 2496(CX), Y11 + VMOVDQU 2528(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y13 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 2560(CX), Y11 + VMOVDQU 2592(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 2624(CX), Y11 + VMOVDQU 2656(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 2688(CX), Y11 + VMOVDQU 2720(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 2752(CX), Y11 + VMOVDQU 2784(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 2816(CX), Y11 + VMOVDQU 2848(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 2880(CX), Y11 + VMOVDQU 2912(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 2944(CX), Y11 + VMOVDQU 2976(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 3008(CX), Y11 + VMOVDQU 3040(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 3072(CX), Y11 + VMOVDQU 3104(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 3136(CX), Y11 + VMOVDQU 3168(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y13 ADDQ $0x20, R10 - VPSRLQ $0x04, Y12, Y13 - VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VMOVDQU 2880(CX), Y10 - VMOVDQU 2912(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3200(CX), Y11 + VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 - VMOVDQU 2944(CX), Y10 - VMOVDQU 2976(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 3264(CX), Y11 + VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 - VMOVDQU 3008(CX), Y10 - VMOVDQU 3040(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 3328(CX), Y11 + VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 - VMOVDQU 3072(CX), Y10 - VMOVDQU 3104(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 3392(CX), Y11 + VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 - VMOVDQU 3136(CX), Y10 - VMOVDQU 3168(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 3456(CX), Y11 + VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 - VMOVDQU 3200(CX), Y10 - VMOVDQU 3232(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 3520(CX), Y11 + VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 - VMOVDQU 3264(CX), Y10 - VMOVDQU 3296(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 3584(CX), Y11 + VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 - VMOVDQU 3328(CX), Y10 - VMOVDQU 3360(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 3648(CX), Y11 + VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 - VMOVDQU 3392(CX), Y10 - VMOVDQU 3424(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 3712(CX), Y11 + VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 3776(CX), Y11 + VMOVDQU 3808(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 - // Load and process 32 bytes from input 6 to 9 outputs - VMOVDQU (R11), Y12 + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y13 ADDQ $0x20, R11 - VPSRLQ $0x04, Y12, Y13 - VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VMOVDQU 3456(CX), Y10 - VMOVDQU 3488(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3840(CX), Y11 + VMOVDQU 3872(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 - VMOVDQU 3520(CX), Y10 - VMOVDQU 3552(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 3904(CX), Y11 + VMOVDQU 3936(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 - VMOVDQU 3584(CX), Y10 - VMOVDQU 3616(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 3968(CX), Y11 + VMOVDQU 4000(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 - VMOVDQU 3648(CX), Y10 - VMOVDQU 3680(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 4032(CX), Y11 + VMOVDQU 4064(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 - VMOVDQU 3712(CX), Y10 - VMOVDQU 3744(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 4096(CX), Y11 + VMOVDQU 4128(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 - VMOVDQU 3776(CX), Y10 - VMOVDQU 3808(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 4160(CX), Y11 + VMOVDQU 4192(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 - VMOVDQU 3840(CX), Y10 - VMOVDQU 3872(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 4224(CX), Y11 + VMOVDQU 4256(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 - VMOVDQU 3904(CX), Y10 - VMOVDQU 3936(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 4288(CX), Y11 + VMOVDQU 4320(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 - VMOVDQU 3968(CX), Y10 - VMOVDQU 4000(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 4352(CX), Y11 + VMOVDQU 4384(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 4416(CX), Y11 + VMOVDQU 4448(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 - // Load and process 32 bytes from input 7 to 9 outputs - VMOVDQU (R12), Y12 + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y13 ADDQ $0x20, R12 - VPSRLQ $0x04, Y12, Y13 - VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VMOVDQU 4032(CX), Y10 - VMOVDQU 4064(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 4480(CX), Y11 + VMOVDQU 4512(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 - VMOVDQU 4096(CX), Y10 - VMOVDQU 4128(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 4544(CX), Y11 + VMOVDQU 4576(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 - VMOVDQU 4160(CX), Y10 - VMOVDQU 4192(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 4608(CX), Y11 + VMOVDQU 4640(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 - VMOVDQU 4224(CX), Y10 - VMOVDQU 4256(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 4672(CX), Y11 + VMOVDQU 4704(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 - VMOVDQU 4288(CX), Y10 - VMOVDQU 4320(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 4736(CX), Y11 + VMOVDQU 4768(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 - VMOVDQU 4352(CX), Y10 - VMOVDQU 4384(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 4800(CX), Y11 + VMOVDQU 4832(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 - VMOVDQU 4416(CX), Y10 - VMOVDQU 4448(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 4864(CX), Y11 + VMOVDQU 4896(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 - VMOVDQU 4480(CX), Y10 - VMOVDQU 4512(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 4928(CX), Y11 + VMOVDQU 4960(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 - VMOVDQU 4544(CX), Y10 - VMOVDQU 4576(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 4992(CX), Y11 + VMOVDQU 5024(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 5056(CX), Y11 + VMOVDQU 5088(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 - // Load and process 32 bytes from input 8 to 9 outputs - VMOVDQU (R13), Y12 + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (R13), Y13 ADDQ $0x20, R13 - VPSRLQ $0x04, Y12, Y13 - VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VMOVDQU 4608(CX), Y10 - VMOVDQU 4640(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 5120(CX), Y11 + VMOVDQU 5152(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 - VMOVDQU 4672(CX), Y10 - VMOVDQU 4704(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 5184(CX), Y11 + VMOVDQU 5216(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 - VMOVDQU 4736(CX), Y10 - VMOVDQU 4768(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 5248(CX), Y11 + VMOVDQU 5280(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 - VMOVDQU 4800(CX), Y10 - VMOVDQU 4832(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 5312(CX), Y11 + VMOVDQU 5344(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 - VMOVDQU 4864(CX), Y10 - VMOVDQU 4896(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 5376(CX), Y11 + VMOVDQU 5408(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 - VMOVDQU 4928(CX), Y10 - VMOVDQU 4960(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 5440(CX), Y11 + VMOVDQU 5472(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 - VMOVDQU 4992(CX), Y10 - VMOVDQU 5024(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 5504(CX), Y11 + VMOVDQU 5536(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 - VMOVDQU 5056(CX), Y10 - VMOVDQU 5088(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 5568(CX), Y11 + VMOVDQU 5600(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 - VMOVDQU 5120(CX), Y10 - VMOVDQU 5152(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 5632(CX), Y11 + VMOVDQU 5664(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 5696(CX), Y11 + VMOVDQU 5728(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 - // Load and process 32 bytes from input 9 to 9 outputs - VMOVDQU (DX), Y12 + // Load and process 32 bytes from input 9 to 10 outputs + VMOVDQU (DX), Y13 ADDQ $0x20, DX - VPSRLQ $0x04, Y12, Y13 - VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VMOVDQU 5184(CX), Y10 - VMOVDQU 5216(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 5760(CX), Y11 + VMOVDQU 5792(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 - VMOVDQU 5248(CX), Y10 - VMOVDQU 5280(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 5824(CX), Y11 + VMOVDQU 5856(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 - VMOVDQU 5312(CX), Y10 - VMOVDQU 5344(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 5888(CX), Y11 + VMOVDQU 5920(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 - VMOVDQU 5376(CX), Y10 - VMOVDQU 5408(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 5952(CX), Y11 + VMOVDQU 5984(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 - VMOVDQU 5440(CX), Y10 - VMOVDQU 5472(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 6016(CX), Y11 + VMOVDQU 6048(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 - VMOVDQU 5504(CX), Y10 - VMOVDQU 5536(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 6080(CX), Y11 + VMOVDQU 6112(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 - VMOVDQU 5568(CX), Y10 - VMOVDQU 5600(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 6144(CX), Y11 + VMOVDQU 6176(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 - VMOVDQU 5632(CX), Y10 - VMOVDQU 5664(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 6208(CX), Y11 + VMOVDQU 6240(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 - VMOVDQU 5696(CX), Y10 - VMOVDQU 5728(CX), Y11 - VPSHUFB Y12, Y10, Y10 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 6272(CX), Y11 + VMOVDQU 6304(CX), Y12 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 6336(CX), Y11 + VMOVDQU 6368(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 - // Store 9 outputs + // Store 10 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) MOVQ 24(R14), BP @@ -34988,19 +69335,21 @@ mulAvxTwo_10x9_loop: VMOVDQU Y7, (BP)(R15*1) MOVQ 192(R14), BP VMOVDQU Y8, (BP)(R15*1) + MOVQ 216(R14), BP + VMOVDQU Y9, (BP)(R15*1) // Prepare for next loop ADDQ $0x20, R15 DECQ AX - JNZ mulAvxTwo_10x9_loop + JNZ mulAvxTwo_10x10_loop VZEROUPPER -mulAvxTwo_10x9_end: +mulAvxTwo_10x10_end: RET -// func mulAvxTwo_10x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// func mulAvxTwo_10x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_10x10(SB), NOSPLIT, $8-88 +TEXT ·mulAvxTwo_10x10Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 215 YMM used @@ -35008,7 +69357,7 @@ TEXT ·mulAvxTwo_10x10(SB), NOSPLIT, $8-88 MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX - JZ mulAvxTwo_10x10_end + JZ mulAvxTwo_10x10Xor_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI @@ -35038,79 +69387,87 @@ TEXT ·mulAvxTwo_10x10(SB), NOSPLIT, $8-88 MOVQ BP, X10 VPBROADCASTB X10, Y10 -mulAvxTwo_10x10_loop: - // Clear 10 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 - VPXOR Y9, Y9, Y9 - +mulAvxTwo_10x10Xor_loop: // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y13 ADDQ $0x20, BX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y0 VMOVDQU (CX), Y11 VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 + MOVQ 168(R14), BP + VMOVDQU (BP)(R15*1), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 + MOVQ 192(R14), BP + VMOVDQU (BP)(R15*1), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 + MOVQ 216(R14), BP + VMOVDQU (BP)(R15*1), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 @@ -35746,8 +70103,8 @@ mulAvxTwo_10x10_loop: // Prepare for next loop ADDQ $0x20, R15 DECQ AX - JNZ mulAvxTwo_10x10_loop + JNZ mulAvxTwo_10x10Xor_loop VZEROUPPER -mulAvxTwo_10x10_end: +mulAvxTwo_10x10Xor_end: RET diff --git a/galois_gen_none.go b/galois_gen_none.go index f11b6ee3..303d6a9f 100644 --- a/galois_gen_none.go +++ b/galois_gen_none.go @@ -3,10 +3,16 @@ package reedsolomon -const maxAvx2Inputs = 0 -const maxAvx2Outputs = 0 +const maxAvx2Inputs = 1 +const maxAvx2Outputs = 1 +const minAvx2Size = 1 +const avxSizeMask = 0 const avx2CodeGen = false func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { panic("avx2 codegen not available") } + +func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int { + panic("avx2 codegen not available") +} diff --git a/galois_gen_switch_amd64.go b/galois_gen_switch_amd64.go index a6a84107..3078114b 100644 --- a/galois_gen_switch_amd64.go +++ b/galois_gen_switch_amd64.go @@ -5,29 +5,31 @@ package reedsolomon -import "fmt" +import ( + "fmt" +) -const avx2CodeGen = true -const maxAvx2Inputs = 10 -const maxAvx2Outputs = 10 +const ( + avx2CodeGen = true + maxAvx2Inputs = 10 + maxAvx2Outputs = 10 + minAvx2Size = 64 + avxSizeMask = maxInt - (minAvx2Size - 1) +) func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { - n := stop - start - n = (n >> 5) << 5 + n := (stop - start) & avxSizeMask switch len(in) { case 1: switch len(out) { case 1: - n = (n >> 6) << 6 mulAvxTwo_1x1_64(matrix, in, out, start, n) return n case 2: - n = (n >> 6) << 6 mulAvxTwo_1x2_64(matrix, in, out, start, n) return n case 3: - n = (n >> 6) << 6 mulAvxTwo_1x3_64(matrix, in, out, start, n) return n case 4: @@ -55,15 +57,12 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { case 2: switch len(out) { case 1: - n = (n >> 6) << 6 mulAvxTwo_2x1_64(matrix, in, out, start, n) return n case 2: - n = (n >> 6) << 6 mulAvxTwo_2x2_64(matrix, in, out, start, n) return n case 3: - n = (n >> 6) << 6 mulAvxTwo_2x3_64(matrix, in, out, start, n) return n case 4: @@ -91,15 +90,12 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { case 3: switch len(out) { case 1: - n = (n >> 6) << 6 mulAvxTwo_3x1_64(matrix, in, out, start, n) return n case 2: - n = (n >> 6) << 6 mulAvxTwo_3x2_64(matrix, in, out, start, n) return n case 3: - n = (n >> 6) << 6 mulAvxTwo_3x3_64(matrix, in, out, start, n) return n case 4: @@ -127,15 +123,12 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { case 4: switch len(out) { case 1: - n = (n >> 6) << 6 mulAvxTwo_4x1_64(matrix, in, out, start, n) return n case 2: - n = (n >> 6) << 6 mulAvxTwo_4x2_64(matrix, in, out, start, n) return n case 3: - n = (n >> 6) << 6 mulAvxTwo_4x3_64(matrix, in, out, start, n) return n case 4: @@ -163,15 +156,12 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { case 5: switch len(out) { case 1: - n = (n >> 6) << 6 mulAvxTwo_5x1_64(matrix, in, out, start, n) return n case 2: - n = (n >> 6) << 6 mulAvxTwo_5x2_64(matrix, in, out, start, n) return n case 3: - n = (n >> 6) << 6 mulAvxTwo_5x3_64(matrix, in, out, start, n) return n case 4: @@ -199,15 +189,12 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { case 6: switch len(out) { case 1: - n = (n >> 6) << 6 mulAvxTwo_6x1_64(matrix, in, out, start, n) return n case 2: - n = (n >> 6) << 6 mulAvxTwo_6x2_64(matrix, in, out, start, n) return n case 3: - n = (n >> 6) << 6 mulAvxTwo_6x3_64(matrix, in, out, start, n) return n case 4: @@ -235,15 +222,12 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { case 7: switch len(out) { case 1: - n = (n >> 6) << 6 mulAvxTwo_7x1_64(matrix, in, out, start, n) return n case 2: - n = (n >> 6) << 6 mulAvxTwo_7x2_64(matrix, in, out, start, n) return n case 3: - n = (n >> 6) << 6 mulAvxTwo_7x3_64(matrix, in, out, start, n) return n case 4: @@ -271,15 +255,12 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { case 8: switch len(out) { case 1: - n = (n >> 6) << 6 mulAvxTwo_8x1_64(matrix, in, out, start, n) return n case 2: - n = (n >> 6) << 6 mulAvxTwo_8x2_64(matrix, in, out, start, n) return n case 3: - n = (n >> 6) << 6 mulAvxTwo_8x3_64(matrix, in, out, start, n) return n case 4: @@ -307,15 +288,12 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { case 9: switch len(out) { case 1: - n = (n >> 6) << 6 mulAvxTwo_9x1_64(matrix, in, out, start, n) return n case 2: - n = (n >> 6) << 6 mulAvxTwo_9x2_64(matrix, in, out, start, n) return n case 3: - n = (n >> 6) << 6 mulAvxTwo_9x3_64(matrix, in, out, start, n) return n case 4: @@ -343,15 +321,12 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { case 10: switch len(out) { case 1: - n = (n >> 6) << 6 mulAvxTwo_10x1_64(matrix, in, out, start, n) return n case 2: - n = (n >> 6) << 6 mulAvxTwo_10x2_64(matrix, in, out, start, n) return n case 3: - n = (n >> 6) << 6 mulAvxTwo_10x3_64(matrix, in, out, start, n) return n case 4: @@ -379,3 +354,341 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { } panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) } + +func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int { + n := (stop - start) & avxSizeMask + + switch len(in) { + case 1: + switch len(out) { + case 1: + mulAvxTwo_1x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxTwo_1x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxTwo_1x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxTwo_1x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxTwo_1x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxTwo_1x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxTwo_1x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxTwo_1x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxTwo_1x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxTwo_1x10Xor(matrix, in, out, start, n) + return n + } + case 2: + switch len(out) { + case 1: + mulAvxTwo_2x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxTwo_2x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxTwo_2x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxTwo_2x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxTwo_2x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxTwo_2x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxTwo_2x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxTwo_2x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxTwo_2x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxTwo_2x10Xor(matrix, in, out, start, n) + return n + } + case 3: + switch len(out) { + case 1: + mulAvxTwo_3x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxTwo_3x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxTwo_3x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxTwo_3x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxTwo_3x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxTwo_3x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxTwo_3x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxTwo_3x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxTwo_3x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxTwo_3x10Xor(matrix, in, out, start, n) + return n + } + case 4: + switch len(out) { + case 1: + mulAvxTwo_4x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxTwo_4x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxTwo_4x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxTwo_4x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxTwo_4x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxTwo_4x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxTwo_4x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxTwo_4x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxTwo_4x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxTwo_4x10Xor(matrix, in, out, start, n) + return n + } + case 5: + switch len(out) { + case 1: + mulAvxTwo_5x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxTwo_5x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxTwo_5x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxTwo_5x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxTwo_5x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxTwo_5x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxTwo_5x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxTwo_5x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxTwo_5x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxTwo_5x10Xor(matrix, in, out, start, n) + return n + } + case 6: + switch len(out) { + case 1: + mulAvxTwo_6x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxTwo_6x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxTwo_6x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxTwo_6x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxTwo_6x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxTwo_6x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxTwo_6x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxTwo_6x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxTwo_6x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxTwo_6x10Xor(matrix, in, out, start, n) + return n + } + case 7: + switch len(out) { + case 1: + mulAvxTwo_7x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxTwo_7x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxTwo_7x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxTwo_7x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxTwo_7x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxTwo_7x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxTwo_7x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxTwo_7x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxTwo_7x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxTwo_7x10Xor(matrix, in, out, start, n) + return n + } + case 8: + switch len(out) { + case 1: + mulAvxTwo_8x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxTwo_8x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxTwo_8x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxTwo_8x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxTwo_8x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxTwo_8x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxTwo_8x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxTwo_8x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxTwo_8x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxTwo_8x10Xor(matrix, in, out, start, n) + return n + } + case 9: + switch len(out) { + case 1: + mulAvxTwo_9x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxTwo_9x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxTwo_9x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxTwo_9x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxTwo_9x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxTwo_9x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxTwo_9x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxTwo_9x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxTwo_9x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxTwo_9x10Xor(matrix, in, out, start, n) + return n + } + case 10: + switch len(out) { + case 1: + mulAvxTwo_10x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxTwo_10x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxTwo_10x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxTwo_10x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxTwo_10x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxTwo_10x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxTwo_10x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxTwo_10x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxTwo_10x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxTwo_10x10Xor(matrix, in, out, start, n) + return n + } + } + panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) +} diff --git a/galois_notamd64.go b/galois_notamd64.go index e9472f77..e67905b1 100644 --- a/galois_notamd64.go +++ b/galois_notamd64.go @@ -5,10 +5,10 @@ package reedsolomon -func (r *reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) { +func (r *reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, byteCount int) { panic("codeSomeShardsAvx512 should not be called if built without asm") } -func (r *reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) { +func (r *reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte, byteCount int) { panic("codeSomeShardsAvx512P should not be called if built without asm") } diff --git a/reedsolomon.go b/reedsolomon.go index 3b1a6a97..8382e567 100644 --- a/reedsolomon.go +++ b/reedsolomon.go @@ -112,6 +112,9 @@ const ( avx2CodeGenMinSize = 64 avx2CodeGenMinShards = 3 avx2CodeGenMaxGoroutines = 8 + + intSize = 32 << (^uint(0) >> 63) // 32 or 64 + maxInt = 1<<(intSize-1) - 1 ) // reedSolomon contains a matrix for a specific @@ -291,6 +294,24 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) { // Calculate what we want per round r.o.perRound = cpuid.CPU.Cache.L2 + + divide := parityShards + 1 + if avx2CodeGen && r.o.useAVX2 && (dataShards > maxAvx2Inputs || parityShards > maxAvx2Outputs) { + // Base on L1 cache if we have many inputs. + r.o.perRound = cpuid.CPU.Cache.L1D + divide = 0 + if dataShards > maxAvx2Inputs { + divide += maxAvx2Inputs + } else { + divide += dataShards + } + if parityShards > maxAvx2Inputs { + divide += maxAvx2Outputs + } else { + divide += parityShards + } + } + if r.o.perRound <= 0 { // Set to 128K if undetectable. r.o.perRound = 128 << 10 @@ -300,8 +321,9 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) { // If multiple threads per core, make sure they don't contend for cache. r.o.perRound /= cpuid.CPU.ThreadsPerCore } + // 1 input + parity must fit in cache, and we add one more to be safer. - r.o.perRound = r.o.perRound / (1 + parityShards) + r.o.perRound = r.o.perRound / divide // Align to 64 bytes. r.o.perRound = ((r.o.perRound + 63) / 64) * 64 @@ -319,10 +341,6 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) { } } - if r.o.perRound < r.o.minSplitSize { - r.o.perRound = r.o.minSplitSize - } - if r.o.shardSize > 0 { p := runtime.GOMAXPROCS(0) if p == 1 || r.o.shardSize <= r.o.minSplitSize*2 { @@ -347,7 +365,7 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) { // Generated AVX2 does not need data to stay in L1 cache between runs. // We will be purely limited by RAM speed. - if r.canAVX2C(avx2CodeGenMinSize, r.DataShards, r.ParityShards) && r.o.maxGoroutines > avx2CodeGenMaxGoroutines { + if r.canAVX2C(avx2CodeGenMinSize, maxAvx2Inputs, maxAvx2Outputs) && r.o.maxGoroutines > avx2CodeGenMaxGoroutines { r.o.maxGoroutines = avx2CodeGenMaxGoroutines } @@ -366,8 +384,9 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) { } if avx2CodeGen && r.o.useAVX2 { + sz := r.DataShards * r.ParityShards * 2 * 32 r.mPool.New = func() interface{} { - return make([]byte, r.Shards*2*32) + return make([]byte, sz) } } return &r, err @@ -398,7 +417,7 @@ func (r *reedSolomon) Encode(shards [][]byte) error { output := shards[r.DataShards:] // Do the coding. - r.codeSomeShards(r.parity, shards[0:r.DataShards], output, r.ParityShards, len(shards[0])) + r.codeSomeShards(r.parity, shards[0:r.DataShards], output[:r.ParityShards], len(shards[0])) return nil } @@ -558,7 +577,7 @@ func (r *reedSolomon) Verify(shards [][]byte) (bool, error) { toCheck := shards[r.DataShards:] // Do the checking. - return r.checkSomeShards(r.parity, shards[0:r.DataShards], toCheck, r.ParityShards, len(shards[0])), nil + return r.checkSomeShards(r.parity, shards[:r.DataShards], toCheck[:r.ParityShards], len(shards[0])), nil } func (r *reedSolomon) canAVX2C(byteCount int, inputs, outputs int) bool { @@ -576,19 +595,19 @@ func (r *reedSolomon) canAVX2C(byteCount int, inputs, outputs int) bool { // The number of outputs computed, and the // number of matrix rows used, is determined by // outputCount, which is the number of outputs to compute. -func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) { +func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, byteCount int) { if len(outputs) == 0 { return } switch { case r.o.useAVX512 && r.o.maxGoroutines > 1 && byteCount > r.o.minSplitSize && len(inputs) >= 4 && len(outputs) >= 2: - r.codeSomeShardsAvx512P(matrixRows, inputs, outputs, outputCount, byteCount) + r.codeSomeShardsAvx512P(matrixRows, inputs, outputs, byteCount) return case r.o.useAVX512 && len(inputs) >= 4 && len(outputs) >= 2: - r.codeSomeShardsAvx512(matrixRows, inputs, outputs, outputCount, byteCount) + r.codeSomeShardsAvx512(matrixRows, inputs, outputs, byteCount) return - case r.o.maxGoroutines > 1 && byteCount > r.o.minSplitSize: - r.codeSomeShardsP(matrixRows, inputs, outputs, outputCount, byteCount) + case byteCount > r.o.minSplitSize: + r.codeSomeShardsP(matrixRows, inputs, outputs, byteCount) return } @@ -598,16 +617,49 @@ func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, outpu end = len(inputs[0]) } if r.canAVX2C(byteCount, len(inputs), len(outputs)) { - m := genAvx2Matrix(matrixRows, len(inputs), len(outputs), r.mPool.Get().([]byte)) + m := genAvx2Matrix(matrixRows, len(inputs), 0, len(outputs), r.mPool.Get().([]byte)) start += galMulSlicesAvx2(m, inputs, outputs, 0, byteCount) r.mPool.Put(m) end = len(inputs[0]) + } else if len(inputs)+len(outputs) > avx2CodeGenMinShards && r.canAVX2C(byteCount, maxAvx2Inputs, maxAvx2Outputs) { + end = len(inputs[0]) + inIdx := 0 + m := r.mPool.Get().([]byte) + defer r.mPool.Put(m) + ins := inputs + for len(ins) > 0 { + inPer := ins + if len(inPer) > maxAvx2Inputs { + inPer = inPer[:maxAvx2Inputs] + } + outs := outputs + outIdx := 0 + for len(outs) > 0 { + outPer := outs + if len(outPer) > maxAvx2Outputs { + outPer = outPer[:maxAvx2Outputs] + } + m = genAvx2Matrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), m) + if inIdx == 0 { + galMulSlicesAvx2(m, inPer, outPer, 0, byteCount) + } else { + galMulSlicesAvx2Xor(m, inPer, outPer, 0, byteCount) + } + start = byteCount & avxSizeMask + outIdx += len(outPer) + outs = outs[len(outPer):] + } + inIdx += len(inPer) + ins = ins[len(inPer):] + } + if start >= end { + return + } } - for start < len(inputs[0]) { - for c := 0; c < r.DataShards; c++ { + for c := 0; c < len(inputs); c++ { in := inputs[c][start:end] - for iRow := 0; iRow < outputCount; iRow++ { + for iRow := 0; iRow < len(outputs); iRow++ { if c == 0 { galMulSlice(matrixRows[iRow][c], in, outputs[iRow][start:end], &r.o) } else { @@ -625,15 +677,21 @@ func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, outpu // Perform the same as codeSomeShards, but split the workload into // several goroutines. -func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) { +func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, byteCount int) { var wg sync.WaitGroup gor := r.o.maxGoroutines var avx2Matrix []byte useAvx2 := r.canAVX2C(byteCount, len(inputs), len(outputs)) if useAvx2 { - avx2Matrix = genAvx2Matrix(matrixRows, len(inputs), len(outputs), r.mPool.Get().([]byte)) + avx2Matrix = genAvx2Matrix(matrixRows, len(inputs), 0, len(outputs), r.mPool.Get().([]byte)) defer r.mPool.Put(avx2Matrix) + } else if byteCount < 10<<20 && len(inputs)+len(outputs) > avx2CodeGenMinShards && + r.canAVX2C(byteCount/4, maxAvx2Inputs, maxAvx2Outputs) { + // It appears there is a switchover point at around 10MB where + // Regular processing is faster... + r.codeSomeShardsAVXP(matrixRows, inputs, outputs, byteCount) + return } do := byteCount / gor @@ -641,6 +699,40 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outp do = r.o.minSplitSize } + exec := func(start, stop int) { + if useAvx2 && stop-start >= 64 { + start += galMulSlicesAvx2(avx2Matrix, inputs, outputs, start, stop) + } + + lstart, lstop := start, start+r.o.perRound + if lstop > stop { + lstop = stop + } + for lstart < stop { + for c := 0; c < len(inputs); c++ { + in := inputs[c][lstart:lstop] + for iRow := 0; iRow < len(outputs); iRow++ { + if c == 0 { + galMulSlice(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o) + } else { + galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o) + } + } + } + lstart = lstop + lstop += r.o.perRound + if lstop > stop { + lstop = stop + } + } + wg.Done() + } + if gor <= 1 { + wg.Add(1) + exec(0, byteCount) + return + } + // Make sizes divisible by 64 do = (do + 63) & (^63) start := 0 @@ -650,34 +742,162 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outp } wg.Add(1) - go func(start, stop int) { - if useAvx2 && stop-start >= 64 { - start += galMulSlicesAvx2(avx2Matrix, inputs, outputs, start, stop) + go exec(start, start+do) + start += do + } + wg.Wait() +} + +// Perform the same as codeSomeShards, but split the workload into +// several goroutines. +func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, byteCount int) { + var wg sync.WaitGroup + gor := r.o.maxGoroutines + + type state struct { + input [][]byte + output [][]byte + m []byte + first bool + } + // Make a plan... + plan := make([]state, 0, ((len(inputs)+maxAvx2Inputs-1)/maxAvx2Inputs)*((len(outputs)+maxAvx2Outputs-1)/maxAvx2Outputs)) + + tmp := r.mPool.Get().([]byte) + defer func(b []byte) { + r.mPool.Put(b) + }(tmp) + + // Flips between input first to output first. + // We put the smallest data load in the inner loop. + if len(inputs) > len(outputs) { + inIdx := 0 + ins := inputs + for len(ins) > 0 { + inPer := ins + if len(inPer) > maxAvx2Inputs { + inPer = inPer[:maxAvx2Inputs] + } + outs := outputs + outIdx := 0 + for len(outs) > 0 { + outPer := outs + if len(outPer) > maxAvx2Outputs { + outPer = outPer[:maxAvx2Outputs] + } + // Generate local matrix + m := genAvx2Matrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), tmp) + tmp = tmp[len(m):] + plan = append(plan, state{ + input: inPer, + output: outPer, + m: m, + first: inIdx == 0, + }) + outIdx += len(outPer) + outs = outs[len(outPer):] + } + inIdx += len(inPer) + ins = ins[len(inPer):] + } + } else { + outs := outputs + outIdx := 0 + for len(outs) > 0 { + outPer := outs + if len(outPer) > maxAvx2Outputs { + outPer = outPer[:maxAvx2Outputs] } - lstart, lstop := start, start+r.o.perRound - if lstop > stop { - lstop = stop + inIdx := 0 + ins := inputs + for len(ins) > 0 { + inPer := ins + if len(inPer) > maxAvx2Inputs { + inPer = inPer[:maxAvx2Inputs] + } + // Generate local matrix + m := genAvx2Matrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), tmp) + tmp = tmp[len(m):] + //fmt.Println("bytes:", len(inPer)*r.o.perRound, "out:", len(outPer)*r.o.perRound) + plan = append(plan, state{ + input: inPer, + output: outPer, + m: m, + first: inIdx == 0, + }) + inIdx += len(inPer) + ins = ins[len(inPer):] } - for lstart < stop { - for c := 0; c < r.DataShards; c++ { - in := inputs[c][lstart:lstop] - for iRow := 0; iRow < outputCount; iRow++ { - if c == 0 { - galMulSlice(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o) - } else { - galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o) - } + outIdx += len(outPer) + outs = outs[len(outPer):] + } + } + + do := byteCount / gor + if do < r.o.minSplitSize { + do = r.o.minSplitSize + } + + exec := func(start, stop int) { + lstart, lstop := start, start+r.o.perRound + if lstop > stop { + lstop = stop + } + for lstart < stop { + if lstop-lstart >= minAvx2Size { + // Execute plan... + for _, p := range plan { + if p.first { + galMulSlicesAvx2(p.m, p.input, p.output, lstart, lstop) + } else { + galMulSlicesAvx2Xor(p.m, p.input, p.output, lstart, lstop) } } - lstart = lstop - lstop += r.o.perRound - if lstop > stop { - lstop = stop + lstart += (lstop - lstart) & avxSizeMask + if lstart == lstop { + lstop += r.o.perRound + if lstop > stop { + lstop = stop + } + continue } } - wg.Done() - }(start, start+do) + + for c := range inputs { + in := inputs[c][lstart:lstop] + for iRow := 0; iRow < len(outputs); iRow++ { + if c == 0 { + galMulSlice(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o) + } else { + galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o) + } + } + } + lstart = lstop + lstop += r.o.perRound + if lstop > stop { + lstop = stop + } + } + wg.Done() + } + if gor == 1 { + wg.Add(1) + exec(0, byteCount) + return + } + + // Make sizes divisible by 64 + do = (do + 63) & (^63) + start := 0 + for start < byteCount { + if start+do > byteCount { + do = byteCount - start + } + + wg.Add(1) + go exec(start, start+do) start += do } wg.Wait() @@ -686,7 +906,7 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outp // checkSomeShards is mostly the same as codeSomeShards, // except this will check values and return // as soon as a difference is found. -func (r *reedSolomon) checkSomeShards(matrixRows, inputs, toCheck [][]byte, outputCount, byteCount int) bool { +func (r *reedSolomon) checkSomeShards(matrixRows, inputs, toCheck [][]byte, byteCount int) bool { if len(toCheck) == 0 { return true } @@ -695,7 +915,7 @@ func (r *reedSolomon) checkSomeShards(matrixRows, inputs, toCheck [][]byte, outp for i := range outputs { outputs[i] = make([]byte, byteCount) } - r.codeSomeShards(matrixRows, inputs, outputs, outputCount, byteCount) + r.codeSomeShards(matrixRows, inputs, outputs, byteCount) for i, calc := range outputs { if !bytes.Equal(calc, toCheck[i]) { @@ -902,7 +1122,7 @@ func (r *reedSolomon) reconstruct(shards [][]byte, dataOnly bool) error { outputCount++ } } - r.codeSomeShards(matrixRows, subShards, outputs[:outputCount], outputCount, shardSize) + r.codeSomeShards(matrixRows, subShards, outputs[:outputCount], shardSize) if dataOnly { // Exit out early if we are only interested in the data shards @@ -928,7 +1148,7 @@ func (r *reedSolomon) reconstruct(shards [][]byte, dataOnly bool) error { outputCount++ } } - r.codeSomeShards(matrixRows, shards[:r.DataShards], outputs[:outputCount], outputCount, shardSize) + r.codeSomeShards(matrixRows, shards[:r.DataShards], outputs[:outputCount], shardSize) return nil } diff --git a/reedsolomon_test.go b/reedsolomon_test.go index ce5d1765..342cf6b9 100644 --- a/reedsolomon_test.go +++ b/reedsolomon_test.go @@ -191,7 +191,7 @@ func TestEncoding(t *testing.T) { // note that par1 matric will fail on some combinations. var testSizes = [][2]int{ {1, 0}, {3, 0}, {5, 0}, {8, 0}, {10, 0}, {12, 0}, {14, 0}, {41, 0}, {49, 0}, - {1, 1}, {1, 2}, {3, 3}, {3, 1}, {5, 3}, {8, 4}, {10, 30}, {12, 10}, {14, 7}, {41, 17}, {49, 1}} + {1, 1}, {1, 2}, {3, 3}, {3, 1}, {5, 3}, {8, 4}, {10, 30}, {12, 10}, {14, 7}, {41, 17}, {49, 1}, {5, 20}} var testDataSizes = []int{10, 100, 1000, 10001, 100003, 1000055} var testDataSizesShort = []int{10, 10001, 100003} @@ -893,6 +893,7 @@ func benchmarkEncode(b *testing.B, dataShards, parityShards, shardSize int) { b.SetBytes(int64(shardSize * (dataShards + parityShards))) b.ResetTimer() + b.ReportAllocs() for i := 0; i < b.N; i++ { err = r.Encode(shards) if err != nil { @@ -937,7 +938,7 @@ func BenchmarkEncode10x4x1M(b *testing.B) { benchmarkEncode(b, 10, 4, 1024*1024) } -// Benchmark 50 data shards and 20 parity shards with 1MB each. +// Benchmark 50 data shards and 20 parity shards with 1M each. func BenchmarkEncode50x20x1M(b *testing.B) { benchmarkEncode(b, 50, 20, 1024*1024) } @@ -989,6 +990,7 @@ func benchmarkVerify(b *testing.B, dataShards, parityShards, shardSize int) { b.SetBytes(int64(shardSize * (dataShards + parityShards))) b.ResetTimer() + b.ReportAllocs() for i := 0; i < b.N; i++ { _, err = r.Verify(shards) if err != nil { @@ -1003,7 +1005,7 @@ func BenchmarkVerify10x2x10000(b *testing.B) { } // Benchmark 50 data slices with 5 parity slices holding 100000 bytes each -func BenchmarkVerify50x5x50000(b *testing.B) { +func BenchmarkVerify50x5x100000(b *testing.B) { benchmarkVerify(b, 50, 5, 100000) } @@ -1359,11 +1361,11 @@ func TestCodeSomeShards(t *testing.T) { shards, _ := enc.Split(data) old := runtime.GOMAXPROCS(1) - r.codeSomeShards(r.parity, shards[:r.DataShards], shards[r.DataShards:], r.ParityShards, len(shards[0])) + r.codeSomeShards(r.parity, shards[:r.DataShards], shards[r.DataShards:r.DataShards+r.ParityShards], len(shards[0])) // hopefully more than 1 CPU runtime.GOMAXPROCS(runtime.NumCPU()) - r.codeSomeShards(r.parity, shards[:r.DataShards], shards[r.DataShards:], r.ParityShards, len(shards[0])) + r.codeSomeShards(r.parity, shards[:r.DataShards], shards[r.DataShards:r.DataShards+r.ParityShards], len(shards[0])) // reset MAXPROCS, otherwise testing complains runtime.GOMAXPROCS(old) @@ -1642,7 +1644,9 @@ func benchmarkParallel(b *testing.B, dataShards, parityShards, shardSize int) { c := runtime.GOMAXPROCS(0) // Note that concurrency also affects total data size and will make caches less effective. - b.Log("Total data:", (c*dataShards*shardSize)>>20, "MiB", "parity:", (c*parityShards*shardSize)>>20, "MiB") + if testing.Verbose() { + b.Log("Total data:", (c*dataShards*shardSize)>>20, "MiB", "parity:", (c*parityShards*shardSize)>>20, "MiB") + } // Create independent shards shardsCh := make(chan [][]byte, c) for i := 0; i < c; i++ {